From 4cf1394f51ba4b28edab016e36fb0f7a1c208f30 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 16 Mar 2019 19:50:44 +0100 Subject: [PATCH] mac80211: add a few performance improvement patches Signed-off-by: Felix Fietkau --- ...op-redundant-rcu_read_lock-unlock-ca.patch | 96 ++++++++++++++ ...te-hash-for-fq-without-holding-fq-lo.patch | 124 ++++++++++++++++++ ...e-dequeue-late-tx-handlers-without-h.patch | 55 ++++++++ ...IF_F_LLTX-when-using-intermediate-tx.patch | 22 ++++ 4 files changed, 297 insertions(+) create mode 100644 package/kernel/mac80211/patches/subsys/353-mac80211-mesh-drop-redundant-rcu_read_lock-unlock-ca.patch create mode 100644 package/kernel/mac80211/patches/subsys/354-mac80211-calculate-hash-for-fq-without-holding-fq-lo.patch create mode 100644 package/kernel/mac80211/patches/subsys/355-mac80211-run-late-dequeue-late-tx-handlers-without-h.patch create mode 100644 package/kernel/mac80211/patches/subsys/356-mac80211-set-NETIF_F_LLTX-when-using-intermediate-tx.patch diff --git a/package/kernel/mac80211/patches/subsys/353-mac80211-mesh-drop-redundant-rcu_read_lock-unlock-ca.patch b/package/kernel/mac80211/patches/subsys/353-mac80211-mesh-drop-redundant-rcu_read_lock-unlock-ca.patch new file mode 100644 index 000000000000..ae5be18170a5 --- /dev/null +++ b/package/kernel/mac80211/patches/subsys/353-mac80211-mesh-drop-redundant-rcu_read_lock-unlock-ca.patch @@ -0,0 +1,96 @@ +From: Felix Fietkau +Date: Sat, 16 Mar 2019 17:43:58 +0100 +Subject: [PATCH] mac80211: mesh: drop redundant rcu_read_lock/unlock calls + +The callers of these functions are all within RCU locked sections + +Signed-off-by: Felix Fietkau +--- + +--- a/net/mac80211/mesh_hwmp.c ++++ b/net/mac80211/mesh_hwmp.c +@@ -1112,16 +1112,13 @@ int mesh_nexthop_resolve(struct ieee8021 + struct mesh_path *mpath; + struct sk_buff *skb_to_free = NULL; + u8 *target_addr = hdr->addr3; +- int err = 0; + + /* Nulls are only sent to peers for PS and should be pre-addressed */ + if (ieee80211_is_qos_nullfunc(hdr->frame_control)) + return 0; + +- rcu_read_lock(); +- err = mesh_nexthop_lookup(sdata, skb); +- if (!err) +- goto endlookup; ++ if (!mesh_nexthop_lookup(sdata, skb)) ++ return 0; + + /* no nexthop found, start resolving */ + mpath = mesh_path_lookup(sdata, target_addr); +@@ -1129,8 +1126,7 @@ int mesh_nexthop_resolve(struct ieee8021 + mpath = mesh_path_add(sdata, target_addr); + if (IS_ERR(mpath)) { + mesh_path_discard_frame(sdata, skb); +- err = PTR_ERR(mpath); +- goto endlookup; ++ return PTR_ERR(mpath); + } + } + +@@ -1143,13 +1139,10 @@ int mesh_nexthop_resolve(struct ieee8021 + info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING; + ieee80211_set_qos_hdr(sdata, skb); + skb_queue_tail(&mpath->frame_queue, skb); +- err = -ENOENT; + if (skb_to_free) + mesh_path_discard_frame(sdata, skb_to_free); + +-endlookup: +- rcu_read_unlock(); +- return err; ++ return -ENOENT; + } + + /** +@@ -1169,13 +1162,10 @@ int mesh_nexthop_lookup(struct ieee80211 + struct sta_info *next_hop; + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + u8 *target_addr = hdr->addr3; +- int err = -ENOENT; + +- rcu_read_lock(); + mpath = mesh_path_lookup(sdata, target_addr); +- + if (!mpath || !(mpath->flags & MESH_PATH_ACTIVE)) +- goto endlookup; ++ return -ENOENT; + + if (time_after(jiffies, + mpath->exp_time - +@@ -1190,12 +1180,10 @@ int mesh_nexthop_lookup(struct ieee80211 + memcpy(hdr->addr1, next_hop->sta.addr, ETH_ALEN); + memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN); + ieee80211_mps_set_frame_flags(sdata, next_hop, hdr); +- err = 0; ++ return 0; + } + +-endlookup: +- rcu_read_unlock(); +- return err; ++ return -ENOENT; + } + + void mesh_path_timer(struct timer_list *t) +--- a/net/mac80211/mesh_pathtbl.c ++++ b/net/mac80211/mesh_pathtbl.c +@@ -217,7 +217,7 @@ static struct mesh_path *mpath_lookup(st + { + struct mesh_path *mpath; + +- mpath = rhashtable_lookup_fast(&tbl->rhead, dst, mesh_rht_params); ++ mpath = rhashtable_lookup(&tbl->rhead, dst, mesh_rht_params); + + if (mpath && mpath_expired(mpath)) { + spin_lock_bh(&mpath->state_lock); diff --git a/package/kernel/mac80211/patches/subsys/354-mac80211-calculate-hash-for-fq-without-holding-fq-lo.patch b/package/kernel/mac80211/patches/subsys/354-mac80211-calculate-hash-for-fq-without-holding-fq-lo.patch new file mode 100644 index 000000000000..2b6d8ab52525 --- /dev/null +++ b/package/kernel/mac80211/patches/subsys/354-mac80211-calculate-hash-for-fq-without-holding-fq-lo.patch @@ -0,0 +1,124 @@ +From: Felix Fietkau +Date: Sat, 16 Mar 2019 17:57:38 +0100 +Subject: [PATCH] mac80211: calculate hash for fq without holding fq->lock + in itxq enqueue + +Reduces lock contention on enqueue/dequeue of iTXQ packets + +Signed-off-by: Felix Fietkau +--- + +--- a/include/net/fq_impl.h ++++ b/include/net/fq_impl.h +@@ -107,21 +107,23 @@ begin: + return skb; + } + ++static u32 fq_flow_idx(struct fq *fq, struct sk_buff *skb) ++{ ++ u32 hash = skb_get_hash_perturb(skb, fq->perturbation); ++ ++ return reciprocal_scale(hash, fq->flows_cnt); ++} ++ + static struct fq_flow *fq_flow_classify(struct fq *fq, +- struct fq_tin *tin, ++ struct fq_tin *tin, u32 idx, + struct sk_buff *skb, + fq_flow_get_default_t get_default_func) + { + struct fq_flow *flow; +- u32 hash; +- u32 idx; + + lockdep_assert_held(&fq->lock); + +- hash = skb_get_hash_perturb(skb, fq->perturbation); +- idx = reciprocal_scale(hash, fq->flows_cnt); + flow = &fq->flows[idx]; +- + if (flow->tin && flow->tin != tin) { + flow = get_default_func(fq, tin, idx, skb); + tin->collisions++; +@@ -153,7 +155,7 @@ static void fq_recalc_backlog(struct fq + } + + static void fq_tin_enqueue(struct fq *fq, +- struct fq_tin *tin, ++ struct fq_tin *tin, u32 idx, + struct sk_buff *skb, + fq_skb_free_t free_func, + fq_flow_get_default_t get_default_func) +@@ -163,7 +165,7 @@ static void fq_tin_enqueue(struct fq *fq + + lockdep_assert_held(&fq->lock); + +- flow = fq_flow_classify(fq, tin, skb, get_default_func); ++ flow = fq_flow_classify(fq, tin, idx, skb, get_default_func); + + flow->tin = tin; + flow->backlog += skb->len; +--- a/net/mac80211/tx.c ++++ b/net/mac80211/tx.c +@@ -1390,11 +1390,15 @@ static void ieee80211_txq_enqueue(struct + { + struct fq *fq = &local->fq; + struct fq_tin *tin = &txqi->tin; ++ u32 flow_idx = fq_flow_idx(fq, skb); + + ieee80211_set_skb_enqueue_time(skb); +- fq_tin_enqueue(fq, tin, skb, ++ ++ spin_lock_bh(&fq->lock); ++ fq_tin_enqueue(fq, tin, flow_idx, skb, + fq_skb_free_func, + fq_flow_get_default_func); ++ spin_unlock_bh(&fq->lock); + } + + static bool fq_vlan_filter_func(struct fq *fq, struct fq_tin *tin, +@@ -1564,7 +1568,6 @@ static bool ieee80211_queue_skb(struct i + struct sta_info *sta, + struct sk_buff *skb) + { +- struct fq *fq = &local->fq; + struct ieee80211_vif *vif; + struct txq_info *txqi; + +@@ -1582,9 +1585,7 @@ static bool ieee80211_queue_skb(struct i + if (!txqi) + return false; + +- spin_lock_bh(&fq->lock); + ieee80211_txq_enqueue(local, txqi, skb); +- spin_unlock_bh(&fq->lock); + + schedule_and_wake_txq(local, txqi); + +@@ -3198,6 +3199,7 @@ static bool ieee80211_amsdu_aggregate(st + u8 max_subframes = sta->sta.max_amsdu_subframes; + int max_frags = local->hw.max_tx_fragments; + int max_amsdu_len = sta->sta.max_amsdu_len; ++ u32 flow_idx; + int orig_truesize; + __be16 len; + void *data; +@@ -3220,6 +3222,8 @@ static bool ieee80211_amsdu_aggregate(st + max_amsdu_len = min_t(int, max_amsdu_len, + sta->sta.max_rc_amsdu_len); + ++ flow_idx = fq_flow_idx(fq, skb); ++ + spin_lock_bh(&fq->lock); + + /* TODO: Ideally aggregation should be done on dequeue to remain +@@ -3227,7 +3231,8 @@ static bool ieee80211_amsdu_aggregate(st + */ + + tin = &txqi->tin; +- flow = fq_flow_classify(fq, tin, skb, fq_flow_get_default_func); ++ flow = fq_flow_classify(fq, tin, flow_idx, skb, ++ fq_flow_get_default_func); + head = skb_peek_tail(&flow->queue); + if (!head) + goto unlock; diff --git a/package/kernel/mac80211/patches/subsys/355-mac80211-run-late-dequeue-late-tx-handlers-without-h.patch b/package/kernel/mac80211/patches/subsys/355-mac80211-run-late-dequeue-late-tx-handlers-without-h.patch new file mode 100644 index 000000000000..3127c8682278 --- /dev/null +++ b/package/kernel/mac80211/patches/subsys/355-mac80211-run-late-dequeue-late-tx-handlers-without-h.patch @@ -0,0 +1,55 @@ +From: Felix Fietkau +Date: Sat, 16 Mar 2019 18:00:12 +0100 +Subject: [PATCH] mac80211: run late dequeue late tx handlers without + holding fq->lock + +Reduces lock contention on enqueue/dequeue of iTXQ packets + +Signed-off-by: Felix Fietkau +--- + +--- a/net/mac80211/tx.c ++++ b/net/mac80211/tx.c +@@ -3507,6 +3507,7 @@ struct sk_buff *ieee80211_tx_dequeue(str + ieee80211_tx_result r; + struct ieee80211_vif *vif = txq->vif; + ++begin: + spin_lock_bh(&fq->lock); + + if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags) || +@@ -3523,11 +3524,12 @@ struct sk_buff *ieee80211_tx_dequeue(str + if (skb) + goto out; + +-begin: + skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func); + if (!skb) + goto out; + ++ spin_unlock_bh(&fq->lock); ++ + hdr = (struct ieee80211_hdr *)skb->data; + info = IEEE80211_SKB_CB(skb); + +@@ -3573,8 +3575,11 @@ begin: + + skb = __skb_dequeue(&tx.skbs); + +- if (!skb_queue_empty(&tx.skbs)) ++ if (!skb_queue_empty(&tx.skbs)) { ++ spin_lock_bh(&fq->lock); + skb_queue_splice_tail(&tx.skbs, &txqi->frags); ++ spin_unlock_bh(&fq->lock); ++ } + } + + if (skb && skb_has_frag_list(skb) && +@@ -3613,6 +3618,7 @@ begin: + } + + IEEE80211_SKB_CB(skb)->control.vif = vif; ++ return skb; + + out: + spin_unlock_bh(&fq->lock); diff --git a/package/kernel/mac80211/patches/subsys/356-mac80211-set-NETIF_F_LLTX-when-using-intermediate-tx.patch b/package/kernel/mac80211/patches/subsys/356-mac80211-set-NETIF_F_LLTX-when-using-intermediate-tx.patch new file mode 100644 index 000000000000..95ab3ab9fbd9 --- /dev/null +++ b/package/kernel/mac80211/patches/subsys/356-mac80211-set-NETIF_F_LLTX-when-using-intermediate-tx.patch @@ -0,0 +1,22 @@ +From: Felix Fietkau +Date: Sat, 16 Mar 2019 18:01:53 +0100 +Subject: [PATCH] mac80211: set NETIF_F_LLTX when using intermediate tx + queues + +When using iTXQ, tx sequence number allocation and statistics are run at +dequeue time. Because of that, it is safe to enable NETIF_F_LLTX, which +allows tx handlers to run on multiple CPUs in parallel. + +Signed-off-by: Felix Fietkau +--- + +--- a/net/mac80211/iface.c ++++ b/net/mac80211/iface.c +@@ -1301,6 +1301,7 @@ static void ieee80211_if_setup(struct ne + static void ieee80211_if_setup_no_queue(struct net_device *dev) + { + ieee80211_if_setup(dev); ++ dev->features |= NETIF_F_LLTX; + #if LINUX_VERSION_IS_GEQ(4,3,0) + dev->priv_flags |= IFF_NO_QUEUE; + #else -- 2.30.2