generic: 6.6: replace (broken) downstream patch with upstream solution
author     Daniel Golle <daniel@makrotopia.org>
Thu, 30 May 2024 11:33:37 +0000 (12:33 +0100)
committer  Christian Marangi <ansuelsmth@gmail.com>
Mon, 17 Jun 2024 11:31:20 +0000 (13:31 +0200)
Our downstream patch "net/core: add optional threading for backlog processing"
was broken by the switch to Linux 6.6.
Replace it by backporting the now-available upstream solution.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Link: https://github.com/openwrt/openwrt/pull/15592
Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
target/linux/generic/backport-6.6/600-v6.10-net-Remove-conditional-threaded-NAPI-wakeup-based-on.patch [new file with mode: 0644]
target/linux/generic/backport-6.6/601-v6.10-net-Allow-to-use-SMP-threads-for-backlog-NAPI.patch [new file with mode: 0644]
target/linux/generic/backport-6.6/602-v6.10-net-Use-backlog-NAPI-to-clean-up-the-defer_list.patch [new file with mode: 0644]
target/linux/generic/backport-6.6/603-v6.10-net-Rename-rps_lock-to-backlog_lock.patch [new file with mode: 0644]
target/linux/generic/backport-6.6/770-net-introduce-napi_is_scheduled-helper.patch
target/linux/generic/hack-6.6/721-net-add-packet-mangeling.patch
target/linux/generic/pending-6.6/760-net-core-add-optional-threading-for-backlog-processi.patch [deleted file]
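
Note on the user-visible change: the deleted downstream patch exposed a runtime
sysctl (net.core.backlog_threaded), while the backported upstream series makes
threaded backlog processing a boot-time opt-in ("thread_backlog_napi" on the
kernel command line; always on with PREEMPT_RT). A condensed sketch of that
opt-in, with all identifiers taken verbatim from
601-v6.10-net-Allow-to-use-SMP-threads-for-backlog-NAPI.patch below (an
illustrative excerpt of the patched net/core/dev.c, not a separate
implementation):

    static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key);

    /* Enabled only when "thread_backlog_napi" is passed on the command line. */
    static int __init setup_backlog_napi_threads(char *arg)
    {
            static_branch_enable(&use_backlog_threads_key);
            return 0;
    }
    early_param("thread_backlog_napi", setup_backlog_napi_threads);

    static bool use_backlog_threads(void)
    {
            return static_branch_unlikely(&use_backlog_threads_key);
    }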

diff --git a/target/linux/generic/backport-6.6/600-v6.10-net-Remove-conditional-threaded-NAPI-wakeup-based-on.patch b/target/linux/generic/backport-6.6/600-v6.10-net-Remove-conditional-threaded-NAPI-wakeup-based-on.patch
new file mode 100644 (file)
index 0000000..ef7963b
--- /dev/null
@@ -0,0 +1,75 @@
+From 56364c910691f6d10ba88c964c9041b9ab777bd6 Mon Sep 17 00:00:00 2001
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Date: Mon, 25 Mar 2024 08:40:28 +0100
+Subject: [PATCH 1/4] net: Remove conditional threaded-NAPI wakeup based on
+ task state.
+
+A NAPI thread is scheduled by first setting the NAPI_STATE_SCHED bit. If
+that succeeds (the bit was not yet set), NAPI_STATE_SCHED_THREADED is
+set as well, but only if the thread's state is not TASK_INTERRUPTIBLE
+(i.e. it is TASK_RUNNING), followed by a task wakeup.
+
+If the task is idle (TASK_INTERRUPTIBLE) then the
+NAPI_STATE_SCHED_THREADED bit is not set. The thread is not relying on
+the bit but always leaves the wait loop after returning from schedule()
+because there must have been a wakeup.
+
+The smpboot-threads implementation for per-CPU threads requires an
+explicit condition and does not support "if we get out of schedule()
+then there must be something to do".
+
+Removing this optimisation simplifies the following integration.
+
+Set NAPI_STATE_SCHED_THREADED unconditionally on wakeup and rely on it
+in the wait path by removing the `woken' condition.
+
+Acked-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+---
+ net/core/dev.c | 14 ++------------
+ 1 file changed, 2 insertions(+), 12 deletions(-)
+
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -4473,13 +4473,7 @@ static inline void ____napi_schedule(str
+                */
+               thread = READ_ONCE(napi->thread);
+               if (thread) {
+-                      /* Avoid doing set_bit() if the thread is in
+-                       * INTERRUPTIBLE state, cause napi_thread_wait()
+-                       * makes sure to proceed with napi polling
+-                       * if the thread is explicitly woken from here.
+-                       */
+-                      if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE)
+-                              set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
++                      set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
+                       wake_up_process(thread);
+                       return;
+               }
+@@ -6635,8 +6629,6 @@ static int napi_poll(struct napi_struct
+ static int napi_thread_wait(struct napi_struct *napi)
+ {
+-      bool woken = false;
+-
+       set_current_state(TASK_INTERRUPTIBLE);
+       while (!kthread_should_stop()) {
+@@ -6645,15 +6637,13 @@ static int napi_thread_wait(struct napi_
+                * Testing SCHED bit is not enough because SCHED bit might be
+                * set by some other busy poll thread or by napi_disable().
+                */
+-              if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) {
++              if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) {
+                       WARN_ON(!list_empty(&napi->poll_list));
+                       __set_current_state(TASK_RUNNING);
+                       return 0;
+               }
+               schedule();
+-              /* woken being true indicates this thread owns this napi. */
+-              woken = true;
+               set_current_state(TASK_INTERRUPTIBLE);
+       }
+       __set_current_state(TASK_RUNNING);
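
For readability, the wakeup/wait pairing that results from this patch,
condensed from the hunks above (all identifiers are from net/core/dev.c as
patched; an illustrative excerpt, not standalone runnable code):

    /* ____napi_schedule(): always mark the threaded NAPI as scheduled
     * before waking the thread, regardless of the thread's task state.
     */
    set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
    wake_up_process(thread);

    /* napi_thread_wait(): the SCHED_THREADED bit is now the only wait
     * condition; the `woken' bookkeeping is gone.
     */
    while (!kthread_should_stop()) {
            if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) {
                    WARN_ON(!list_empty(&napi->poll_list));
                    __set_current_state(TASK_RUNNING);
                    return 0;
            }
            schedule();
            set_current_state(TASK_INTERRUPTIBLE);
    }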
diff --git a/target/linux/generic/backport-6.6/601-v6.10-net-Allow-to-use-SMP-threads-for-backlog-NAPI.patch b/target/linux/generic/backport-6.6/601-v6.10-net-Allow-to-use-SMP-threads-for-backlog-NAPI.patch
new file mode 100644 (file)
index 0000000..3a7962f
--- /dev/null
@@ -0,0 +1,330 @@
+From dad6b97702639fba27a2bd3e986982ad6f0db3a7 Mon Sep 17 00:00:00 2001
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Date: Mon, 25 Mar 2024 08:40:29 +0100
+Subject: [PATCH 2/4] net: Allow to use SMP threads for backlog NAPI.
+
+Backlog NAPI is a per-CPU NAPI struct only (with no device behind it)
+used by drivers which don't do NAPI themselves, by RPS, and by parts of
+the stack which need to avoid recursive deadlocks while processing a packet.
+
+Non-NAPI drivers use the CPU-local backlog NAPI. If RPS is enabled,
+a flow for the skb is computed and, based on the flow, the skb can be
+enqueued on a remote CPU. Scheduling/raising the softirq (for the backlog
+NAPI) on the remote CPU isn't trivial because a softirq is only
+scheduled on the local CPU and performed after the hardirq is done.
+In order to schedule a softirq on the remote CPU, an IPI is sent to the
+remote CPU, which then schedules the backlog NAPI on its (now local) CPU.
+
+On PREEMPT_RT interrupts are force-threaded. The soft interrupts are
+raised within the interrupt thread and processed after the interrupt
+handler has completed, still within the context of the interrupt thread.
+The softirq is handled in the context where it originated.
+
+With force-threaded interrupts enabled, ksoftirqd is woken up if a
+softirq is raised from hardirq context. This is the case if it is raised
+from an IPI. Additionally there is a warning on PREEMPT_RT if the
+softirq is raised from the idle thread.
+This was done for two reasons:
+- With threaded interrupts the processing should happen in thread
+  context (where it originated) and ksoftirqd is the only thread for
+  this context if raised from hardirq. Using the currently running task
+  instead would "punish" a random task.
+- Once ksoftirqd is active it consumes all further softirqs until it
+  stops running. This changed recently and is no longer the case.
+
+Instead of keeping the backlog NAPI in ksoftirqd (in force-threaded/
+PREEMPT_RT setups) I am proposing NAPI threads for the backlog.
+The "proper" setup with threaded NAPI is not doable because the threads
+are not pinned to an individual CPU and can be modified by the user.
+Additionally, a dummy network device would have to be assigned, and
+CPU hotplug would have to be considered if additional CPUs show up.
+All of this could probably be done/solved, but the smpboot threads
+already provide this infrastructure.
+
+Sending UDP packets over loopback expects that the packet is processed
+within the call. Delaying it by handing it over to the thread hurts
+performance. It makes no difference to the outcome whether the context
+switch happens immediately after enqueue or after a while, processing a
+few packets in a batch.
+There is no need to always use the thread if the backlog NAPI is
+requested on the local CPU. This restores the loopback throughput. The
+performance drops to mostly the same value after enabling RPS on the
+loopback when comparing the IPI and the thread results.
+
+Create NAPI threads for the backlog if requested during boot. The thread
+runs the inner loop from napi_threaded_poll(); the wait part is different.
+It checks for NAPI_STATE_SCHED (the backlog NAPI cannot be disabled).
+
+The NAPI threads for backlog are optional; they have to be enabled via the
+boot argument "thread_backlog_napi". They are mandatory on PREEMPT_RT to
+avoid waking ksoftirqd from the IPI.
+
+Acked-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+---
+ net/core/dev.c | 148 +++++++++++++++++++++++++++++++++++++------------
+ 1 file changed, 113 insertions(+), 35 deletions(-)
+
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -78,6 +78,7 @@
+ #include <linux/slab.h>
+ #include <linux/sched.h>
+ #include <linux/sched/mm.h>
++#include <linux/smpboot.h>
+ #include <linux/mutex.h>
+ #include <linux/rwsem.h>
+ #include <linux/string.h>
+@@ -217,6 +218,31 @@ static inline struct hlist_head *dev_ind
+       return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
+ }
++#ifndef CONFIG_PREEMPT_RT
++
++static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key);
++
++static int __init setup_backlog_napi_threads(char *arg)
++{
++      static_branch_enable(&use_backlog_threads_key);
++      return 0;
++}
++early_param("thread_backlog_napi", setup_backlog_napi_threads);
++
++static bool use_backlog_threads(void)
++{
++      return static_branch_unlikely(&use_backlog_threads_key);
++}
++
++#else
++
++static bool use_backlog_threads(void)
++{
++      return true;
++}
++
++#endif
++
+ static inline void rps_lock_irqsave(struct softnet_data *sd,
+                                   unsigned long *flags)
+ {
+@@ -4441,6 +4467,7 @@ EXPORT_SYMBOL(__dev_direct_xmit);
+ /*************************************************************************
+  *                    Receiver routines
+  *************************************************************************/
++static DEFINE_PER_CPU(struct task_struct *, backlog_napi);
+ int netdev_max_backlog __read_mostly = 1000;
+ EXPORT_SYMBOL(netdev_max_backlog);
+@@ -4473,12 +4500,16 @@ static inline void ____napi_schedule(str
+                */
+               thread = READ_ONCE(napi->thread);
+               if (thread) {
++                      if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi))
++                              goto use_local_napi;
++
+                       set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
+                       wake_up_process(thread);
+                       return;
+               }
+       }
++use_local_napi:
+       list_add_tail(&napi->poll_list, &sd->poll_list);
+       WRITE_ONCE(napi->list_owner, smp_processor_id());
+       /* If not called from net_rx_action()
+@@ -4724,6 +4755,11 @@ static void napi_schedule_rps(struct sof
+ #ifdef CONFIG_RPS
+       if (sd != mysd) {
++              if (use_backlog_threads()) {
++                      __napi_schedule_irqoff(&sd->backlog);
++                      return;
++              }
++
+               sd->rps_ipi_next = mysd->rps_ipi_list;
+               mysd->rps_ipi_list = sd;
+@@ -5947,7 +5983,7 @@ static void net_rps_action_and_irq_enabl
+ #ifdef CONFIG_RPS
+       struct softnet_data *remsd = sd->rps_ipi_list;
+-      if (remsd) {
++      if (!use_backlog_threads() && remsd) {
+               sd->rps_ipi_list = NULL;
+               local_irq_enable();
+@@ -5962,7 +5998,7 @@ static void net_rps_action_and_irq_enabl
+ static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
+ {
+ #ifdef CONFIG_RPS
+-      return sd->rps_ipi_list != NULL;
++      return !use_backlog_threads() && sd->rps_ipi_list;
+ #else
+       return false;
+ #endif
+@@ -6006,7 +6042,7 @@ static int process_backlog(struct napi_s
+                        * We can use a plain write instead of clear_bit(),
+                        * and we dont need an smp_mb() memory barrier.
+                        */
+-                      napi->state = 0;
++                      napi->state &= NAPIF_STATE_THREADED;
+                       again = false;
+               } else {
+                       skb_queue_splice_tail_init(&sd->input_pkt_queue,
+@@ -6672,43 +6708,48 @@ static void skb_defer_free_flush(struct
+       }
+ }
+-static int napi_threaded_poll(void *data)
++static void napi_threaded_poll_loop(struct napi_struct *napi)
+ {
+-      struct napi_struct *napi = data;
+       struct softnet_data *sd;
+-      void *have;
++      unsigned long last_qs = jiffies;
+-      while (!napi_thread_wait(napi)) {
+-              unsigned long last_qs = jiffies;
++      for (;;) {
++              bool repoll = false;
++              void *have;
+-              for (;;) {
+-                      bool repoll = false;
++              local_bh_disable();
++              sd = this_cpu_ptr(&softnet_data);
++              sd->in_napi_threaded_poll = true;
+-                      local_bh_disable();
+-                      sd = this_cpu_ptr(&softnet_data);
+-                      sd->in_napi_threaded_poll = true;
+-
+-                      have = netpoll_poll_lock(napi);
+-                      __napi_poll(napi, &repoll);
+-                      netpoll_poll_unlock(have);
+-
+-                      sd->in_napi_threaded_poll = false;
+-                      barrier();
+-
+-                      if (sd_has_rps_ipi_waiting(sd)) {
+-                              local_irq_disable();
+-                              net_rps_action_and_irq_enable(sd);
+-                      }
+-                      skb_defer_free_flush(sd);
+-                      local_bh_enable();
++              have = netpoll_poll_lock(napi);
++              __napi_poll(napi, &repoll);
++              netpoll_poll_unlock(have);
++
++              sd->in_napi_threaded_poll = false;
++              barrier();
++
++              if (sd_has_rps_ipi_waiting(sd)) {
++                      local_irq_disable();
++                      net_rps_action_and_irq_enable(sd);
++              }
++              skb_defer_free_flush(sd);
++              local_bh_enable();
+-                      if (!repoll)
+-                              break;
++              if (!repoll)
++                      break;
+-                      rcu_softirq_qs_periodic(last_qs);
+-                      cond_resched();
+-              }
++              rcu_softirq_qs_periodic(last_qs);
++              cond_resched();
+       }
++}
++
++static int napi_threaded_poll(void *data)
++{
++      struct napi_struct *napi = data;
++
++      while (!napi_thread_wait(napi))
++              napi_threaded_poll_loop(napi);
++
+       return 0;
+ }
+@@ -11288,7 +11329,7 @@ static int dev_cpu_dead(unsigned int old
+               list_del_init(&napi->poll_list);
+               if (napi->poll == process_backlog)
+-                      napi->state = 0;
++                      napi->state &= NAPIF_STATE_THREADED;
+               else
+                       ____napi_schedule(sd, napi);
+       }
+@@ -11296,12 +11337,14 @@ static int dev_cpu_dead(unsigned int old
+       raise_softirq_irqoff(NET_TX_SOFTIRQ);
+       local_irq_enable();
++      if (!use_backlog_threads()) {
+ #ifdef CONFIG_RPS
+-      remsd = oldsd->rps_ipi_list;
+-      oldsd->rps_ipi_list = NULL;
++              remsd = oldsd->rps_ipi_list;
++              oldsd->rps_ipi_list = NULL;
+ #endif
+-      /* send out pending IPI's on offline CPU */
+-      net_rps_send_ipi(remsd);
++              /* send out pending IPI's on offline CPU */
++              net_rps_send_ipi(remsd);
++      }
+       /* Process offline CPU's input_pkt_queue */
+       while ((skb = __skb_dequeue(&oldsd->process_queue))) {
+@@ -11564,6 +11607,38 @@ static struct pernet_operations __net_in
+  *
+  */
++static int backlog_napi_should_run(unsigned int cpu)
++{
++      struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
++      struct napi_struct *napi = &sd->backlog;
++
++      return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
++}
++
++static void run_backlog_napi(unsigned int cpu)
++{
++      struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
++
++      napi_threaded_poll_loop(&sd->backlog);
++}
++
++static void backlog_napi_setup(unsigned int cpu)
++{
++      struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
++      struct napi_struct *napi = &sd->backlog;
++
++      napi->thread = this_cpu_read(backlog_napi);
++      set_bit(NAPI_STATE_THREADED, &napi->state);
++}
++
++static struct smp_hotplug_thread backlog_threads = {
++      .store                  = &backlog_napi,
++      .thread_should_run      = backlog_napi_should_run,
++      .thread_fn              = run_backlog_napi,
++      .thread_comm            = "backlog_napi/%u",
++      .setup                  = backlog_napi_setup,
++};
++
+ /*
+  *       This is called single threaded during boot, so no need
+  *       to take the rtnl semaphore.
+@@ -11614,7 +11689,10 @@ static int __init net_dev_init(void)
+               init_gro_hash(&sd->backlog);
+               sd->backlog.poll = process_backlog;
+               sd->backlog.weight = weight_p;
++              INIT_LIST_HEAD(&sd->backlog.poll_list);
+       }
++      if (use_backlog_threads())
++              smpboot_register_percpu_thread(&backlog_threads);
+       dev_boot_phase = 0;
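
For readability, the per-CPU thread wiring introduced above, condensed from
the hunks in this patch (all identifiers are from net/core/dev.c as patched;
an illustrative excerpt, not standalone code): one "backlog_napi/<cpu>"
smpboot thread per CPU runs the backlog NAPI poll loop whenever
NAPI_STATE_SCHED_THREADED is set on that CPU's backlog NAPI.

    static struct smp_hotplug_thread backlog_threads = {
            .store                  = &backlog_napi,           /* per-CPU task_struct * */
            .thread_should_run      = backlog_napi_should_run, /* tests NAPI_STATE_SCHED_THREADED */
            .thread_fn              = run_backlog_napi,        /* runs napi_threaded_poll_loop() */
            .thread_comm            = "backlog_napi/%u",
            .setup                  = backlog_napi_setup,      /* sets napi->thread and NAPI_STATE_THREADED */
    };

    /* Registered from net_dev_init() only when use_backlog_threads() is true: */
    if (use_backlog_threads())
            smpboot_register_percpu_thread(&backlog_threads);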
diff --git a/target/linux/generic/backport-6.6/602-v6.10-net-Use-backlog-NAPI-to-clean-up-the-defer_list.patch b/target/linux/generic/backport-6.6/602-v6.10-net-Use-backlog-NAPI-to-clean-up-the-defer_list.patch
new file mode 100644 (file)
index 0000000..6a9c113
--- /dev/null
@@ -0,0 +1,121 @@
+From 80d2eefcb4c84aa9018b2a997ab3a4c567bc821a Mon Sep 17 00:00:00 2001
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Date: Mon, 25 Mar 2024 08:40:30 +0100
+Subject: [PATCH 3/4] net: Use backlog-NAPI to clean up the defer_list.
+
+The defer_list is a per-CPU list which is used to free skbs outside of
+the socket lock and on the CPU on which they have been allocated.
+The list is processed during NAPI callbacks, so ideally the list is
+cleaned up there.
+Should the number of skbs on the list exceed a certain watermark, the
+softirq is triggered remotely on the target CPU by invoking a remote
+function call. Raising the softirq via a remote function call
+leads to waking ksoftirqd on PREEMPT_RT, which is undesired.
+The backlog-NAPI threads already provide the infrastructure which can be
+utilized to perform the cleanup of the defer_list.
+
+The NAPI state is updated with the input_pkt_queue.lock acquired. In
+order not to break the state, the backlog-NAPI thread also needs to be
+woken with the lock held. This requires taking the lock in
+rps_lock_irq*() whenever the backlog-NAPI threads are used, even with
+RPS disabled.
+
+Move the logic of remotely starting softirqs to clean up the defer_list
+into kick_defer_list_purge(). Make sure a lock is held in
+rps_lock_irq*() if backlog-NAPI threads are used. Schedule backlog-NAPI
+for defer_list cleanup if backlog-NAPI is available.
+
+Acked-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+---
+ include/linux/netdevice.h |  1 +
+ net/core/dev.c            | 25 +++++++++++++++++++++----
+ net/core/skbuff.c         |  4 ++--
+ 3 files changed, 24 insertions(+), 6 deletions(-)
+
+--- a/include/linux/netdevice.h
++++ b/include/linux/netdevice.h
+@@ -3300,6 +3300,7 @@ static inline void dev_xmit_recursion_de
+       __this_cpu_dec(softnet_data.xmit.recursion);
+ }
++void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu);
+ void __netif_schedule(struct Qdisc *q);
+ void netif_schedule_queue(struct netdev_queue *txq);
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -246,7 +246,7 @@ static bool use_backlog_threads(void)
+ static inline void rps_lock_irqsave(struct softnet_data *sd,
+                                   unsigned long *flags)
+ {
+-      if (IS_ENABLED(CONFIG_RPS))
++      if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
+               spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
+       else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+               local_irq_save(*flags);
+@@ -254,7 +254,7 @@ static inline void rps_lock_irqsave(stru
+ static inline void rps_lock_irq_disable(struct softnet_data *sd)
+ {
+-      if (IS_ENABLED(CONFIG_RPS))
++      if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
+               spin_lock_irq(&sd->input_pkt_queue.lock);
+       else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+               local_irq_disable();
+@@ -263,7 +263,7 @@ static inline void rps_lock_irq_disable(
+ static inline void rps_unlock_irq_restore(struct softnet_data *sd,
+                                         unsigned long *flags)
+ {
+-      if (IS_ENABLED(CONFIG_RPS))
++      if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
+               spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
+       else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+               local_irq_restore(*flags);
+@@ -271,7 +271,7 @@ static inline void rps_unlock_irq_restor
+ static inline void rps_unlock_irq_enable(struct softnet_data *sd)
+ {
+-      if (IS_ENABLED(CONFIG_RPS))
++      if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
+               spin_unlock_irq(&sd->input_pkt_queue.lock);
+       else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+               local_irq_enable();
+@@ -4774,6 +4774,23 @@ static void napi_schedule_rps(struct sof
+       __napi_schedule_irqoff(&mysd->backlog);
+ }
++void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu)
++{
++      unsigned long flags;
++
++      if (use_backlog_threads()) {
++              rps_lock_irqsave(sd, &flags);
++
++              if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
++                      __napi_schedule_irqoff(&sd->backlog);
++
++              rps_unlock_irq_restore(sd, &flags);
++
++      } else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) {
++              smp_call_function_single_async(cpu, &sd->defer_csd);
++      }
++}
++
+ #ifdef CONFIG_NET_FLOW_LIMIT
+ int netdev_flow_limit_table_len __read_mostly = (1 << 12);
+ #endif
+--- a/net/core/skbuff.c
++++ b/net/core/skbuff.c
+@@ -6863,8 +6863,8 @@ nodefer: __kfree_skb(skb);
+       /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
+        * if we are unlucky enough (this seems very unlikely).
+        */
+-      if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
+-              smp_call_function_single_async(cpu, &sd->defer_csd);
++      if (unlikely(kick))
++              kick_defer_list_purge(sd, cpu);
+ }
+ static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
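
For readability, the resulting dispatch in kick_defer_list_purge(), condensed
from the hunks above (all identifiers are from net/core/dev.c as patched at
this point in the series; the locking helpers are renamed in the following
patch; an illustrative excerpt, not standalone code): with backlog threads the
purge is scheduled on the target CPU's backlog NAPI under the backlog lock,
otherwise the pre-existing IPI path via defer_csd is kept.

    void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu)
    {
            unsigned long flags;

            if (use_backlog_threads()) {
                    /* Wake the remote CPU's backlog NAPI thread instead of
                     * raising NET_RX_SOFTIRQ via an IPI.
                     */
                    rps_lock_irqsave(sd, &flags);
                    if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
                            __napi_schedule_irqoff(&sd->backlog);
                    rps_unlock_irq_restore(sd, &flags);
            } else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) {
                    smp_call_function_single_async(cpu, &sd->defer_csd);
            }
    }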
diff --git a/target/linux/generic/backport-6.6/603-v6.10-net-Rename-rps_lock-to-backlog_lock.patch b/target/linux/generic/backport-6.6/603-v6.10-net-Rename-rps_lock-to-backlog_lock.patch
new file mode 100644 (file)
index 0000000..8010672
--- /dev/null
@@ -0,0 +1,164 @@
+From 765b11f8f4e20b7433e4ba4a3e9106a0d59501ed Mon Sep 17 00:00:00 2001
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Date: Mon, 25 Mar 2024 08:40:31 +0100
+Subject: [PATCH 4/4] net: Rename rps_lock to backlog_lock.
+
+The rps_lock.*() functions use the inner lock of a sk_buff_head for
+locking. This lock is used if RPS is enabled; otherwise the list is
+accessed locklessly and disabling interrupts is enough for
+synchronisation because it is only accessed CPU-locally. Not only the
+list is protected, but also the NAPI state.
+With the addition of backlog threads, the lock is also needed because of
+the cross-CPU access even without RPS. The cleanup of the defer_list
+is also done via backlog threads (if enabled).
+
+It has been suggested to rename the locking functions since they are no
+longer RPS-specific.
+
+Rename the rps_lock*() functions to backlog_lock*().
+
+Suggested-by: Jakub Kicinski <kuba@kernel.org>
+Acked-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+---
+ net/core/dev.c | 34 +++++++++++++++++-----------------
+ 1 file changed, 17 insertions(+), 17 deletions(-)
+
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -243,8 +243,8 @@ static bool use_backlog_threads(void)
+ #endif
+-static inline void rps_lock_irqsave(struct softnet_data *sd,
+-                                  unsigned long *flags)
++static inline void backlog_lock_irq_save(struct softnet_data *sd,
++                                       unsigned long *flags)
+ {
+       if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
+               spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
+@@ -252,7 +252,7 @@ static inline void rps_lock_irqsave(stru
+               local_irq_save(*flags);
+ }
+-static inline void rps_lock_irq_disable(struct softnet_data *sd)
++static inline void backlog_lock_irq_disable(struct softnet_data *sd)
+ {
+       if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
+               spin_lock_irq(&sd->input_pkt_queue.lock);
+@@ -260,8 +260,8 @@ static inline void rps_lock_irq_disable(
+               local_irq_disable();
+ }
+-static inline void rps_unlock_irq_restore(struct softnet_data *sd,
+-                                        unsigned long *flags)
++static inline void backlog_unlock_irq_restore(struct softnet_data *sd,
++                                            unsigned long *flags)
+ {
+       if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
+               spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
+@@ -269,7 +269,7 @@ static inline void rps_unlock_irq_restor
+               local_irq_restore(*flags);
+ }
+-static inline void rps_unlock_irq_enable(struct softnet_data *sd)
++static inline void backlog_unlock_irq_enable(struct softnet_data *sd)
+ {
+       if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
+               spin_unlock_irq(&sd->input_pkt_queue.lock);
+@@ -4779,12 +4779,12 @@ void kick_defer_list_purge(struct softne
+       unsigned long flags;
+       if (use_backlog_threads()) {
+-              rps_lock_irqsave(sd, &flags);
++              backlog_lock_irq_save(sd, &flags);
+               if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
+                       __napi_schedule_irqoff(&sd->backlog);
+-              rps_unlock_irq_restore(sd, &flags);
++              backlog_unlock_irq_restore(sd, &flags);
+       } else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) {
+               smp_call_function_single_async(cpu, &sd->defer_csd);
+@@ -4846,7 +4846,7 @@ static int enqueue_to_backlog(struct sk_
+       reason = SKB_DROP_REASON_NOT_SPECIFIED;
+       sd = &per_cpu(softnet_data, cpu);
+-      rps_lock_irqsave(sd, &flags);
++      backlog_lock_irq_save(sd, &flags);
+       if (!netif_running(skb->dev))
+               goto drop;
+       qlen = skb_queue_len(&sd->input_pkt_queue);
+@@ -4855,7 +4855,7 @@ static int enqueue_to_backlog(struct sk_
+ enqueue:
+                       __skb_queue_tail(&sd->input_pkt_queue, skb);
+                       input_queue_tail_incr_save(sd, qtail);
+-                      rps_unlock_irq_restore(sd, &flags);
++                      backlog_unlock_irq_restore(sd, &flags);
+                       return NET_RX_SUCCESS;
+               }
+@@ -4870,7 +4870,7 @@ enqueue:
+ drop:
+       sd->dropped++;
+-      rps_unlock_irq_restore(sd, &flags);
++      backlog_unlock_irq_restore(sd, &flags);
+       dev_core_stats_rx_dropped_inc(skb->dev);
+       kfree_skb_reason(skb, reason);
+@@ -5901,7 +5901,7 @@ static void flush_backlog(struct work_st
+       local_bh_disable();
+       sd = this_cpu_ptr(&softnet_data);
+-      rps_lock_irq_disable(sd);
++      backlog_lock_irq_disable(sd);
+       skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
+               if (skb->dev->reg_state == NETREG_UNREGISTERING) {
+                       __skb_unlink(skb, &sd->input_pkt_queue);
+@@ -5909,7 +5909,7 @@ static void flush_backlog(struct work_st
+                       input_queue_head_incr(sd);
+               }
+       }
+-      rps_unlock_irq_enable(sd);
++      backlog_unlock_irq_enable(sd);
+       skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
+               if (skb->dev->reg_state == NETREG_UNREGISTERING) {
+@@ -5927,14 +5927,14 @@ static bool flush_required(int cpu)
+       struct softnet_data *sd = &per_cpu(softnet_data, cpu);
+       bool do_flush;
+-      rps_lock_irq_disable(sd);
++      backlog_lock_irq_disable(sd);
+       /* as insertion into process_queue happens with the rps lock held,
+        * process_queue access may race only with dequeue
+        */
+       do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
+                  !skb_queue_empty_lockless(&sd->process_queue);
+-      rps_unlock_irq_enable(sd);
++      backlog_unlock_irq_enable(sd);
+       return do_flush;
+ #endif
+@@ -6049,7 +6049,7 @@ static int process_backlog(struct napi_s
+               }
+-              rps_lock_irq_disable(sd);
++              backlog_lock_irq_disable(sd);
+               if (skb_queue_empty(&sd->input_pkt_queue)) {
+                       /*
+                        * Inline a custom version of __napi_complete().
+@@ -6065,7 +6065,7 @@ static int process_backlog(struct napi_s
+                       skb_queue_splice_tail_init(&sd->input_pkt_queue,
+                                                  &sd->process_queue);
+               }
+-              rps_unlock_irq_enable(sd);
++              backlog_unlock_irq_enable(sd);
+       }
+       return work;
diff --git a/target/linux/generic/backport-6.6/770-net-introduce-napi_is_scheduled-helper.patch b/target/linux/generic/backport-6.6/770-net-introduce-napi_is_scheduled-helper.patch
index 821fd60a2d2c975230425774d792cbdc75215081..6449cd6a3a19e176b5cd36cb6450785e46b0c976 100644 (file)
@@ -85,7 +85,7 @@ Signed-off-by: Paolo Abeni <pabeni@redhat.com>
  /**
 --- a/net/core/dev.c
 +++ b/net/core/dev.c
-@@ -6555,7 +6555,7 @@ static int __napi_poll(struct napi_struc
+@@ -6602,7 +6602,7 @@ static int __napi_poll(struct napi_struc
         * accidentally calling ->poll() when NAPI is not scheduled.
         */
        work = 0;
diff --git a/target/linux/generic/hack-6.6/721-net-add-packet-mangeling.patch b/target/linux/generic/hack-6.6/721-net-add-packet-mangeling.patch
index b51a324027f659d5ea6a3b1153390ab92efaab51..e1d4367a8f54936f3aa47c775bf1c1b0216259fe 100644 (file)
@@ -19,7 +19,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 
 --- a/include/linux/netdevice.h
 +++ b/include/linux/netdevice.h
-@@ -1759,6 +1759,7 @@ enum netdev_priv_flags {
+@@ -1758,6 +1758,7 @@ enum netdev_priv_flags {
        IFF_TX_SKB_NO_LINEAR            = BIT_ULL(31),
        IFF_CHANGE_PROTO_DOWN           = BIT_ULL(32),
        IFF_SEE_ALL_HWTSTAMP_REQUESTS   = BIT_ULL(33),
@@ -27,7 +27,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
  };
  
  #define IFF_802_1Q_VLAN                       IFF_802_1Q_VLAN
-@@ -1792,6 +1793,7 @@ enum netdev_priv_flags {
+@@ -1791,6 +1792,7 @@ enum netdev_priv_flags {
  #define IFF_FAILOVER_SLAVE            IFF_FAILOVER_SLAVE
  #define IFF_L3MDEV_RX_HANDLER         IFF_L3MDEV_RX_HANDLER
  #define IFF_TX_SKB_NO_LINEAR          IFF_TX_SKB_NO_LINEAR
@@ -35,7 +35,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
  
  /* Specifies the type of the struct net_device::ml_priv pointer */
  enum netdev_ml_priv_type {
-@@ -2184,6 +2186,11 @@ struct net_device {
+@@ -2183,6 +2185,11 @@ struct net_device {
        const struct tlsdev_ops *tlsdev_ops;
  #endif
  
@@ -47,7 +47,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
        const struct header_ops *header_ops;
  
        unsigned char           operstate;
-@@ -2257,6 +2264,10 @@ struct net_device {
+@@ -2256,6 +2263,10 @@ struct net_device {
        struct mctp_dev __rcu   *mctp_ptr;
  #endif
  
@@ -105,7 +105,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
        help
 --- a/net/core/dev.c
 +++ b/net/core/dev.c
-@@ -3571,6 +3571,11 @@ static int xmit_one(struct sk_buff *skb,
+@@ -3597,6 +3597,11 @@ static int xmit_one(struct sk_buff *skb,
        if (dev_nit_active(dev))
                dev_queue_xmit_nit(skb, dev);
  
diff --git a/target/linux/generic/pending-6.6/760-net-core-add-optional-threading-for-backlog-processi.patch b/target/linux/generic/pending-6.6/760-net-core-add-optional-threading-for-backlog-processi.patch
deleted file mode 100644 (file)
index 8a9066b..0000000
+++ /dev/null
@@ -1,227 +0,0 @@
-From: Felix Fietkau <nbd@nbd.name>
-Date: Thu, 16 Feb 2023 18:39:04 +0100
-Subject: [PATCH] net/core: add optional threading for backlog processing
-
-When dealing with few flows or an imbalance on CPU utilization, static RPS
-CPU assignment can be too inflexible. Add support for enabling threaded NAPI
-for backlog processing in order to allow the scheduler to better balance
-processing. This helps better spread the load across idle CPUs.
-
-Signed-off-by: Felix Fietkau <nbd@nbd.name>
----
-
---- a/include/linux/netdevice.h
-+++ b/include/linux/netdevice.h
-@@ -558,6 +558,7 @@ static inline bool napi_complete(struct
- }
- int dev_set_threaded(struct net_device *dev, bool threaded);
-+int backlog_set_threaded(bool threaded);
- /**
-  *    napi_disable - prevent NAPI from scheduling
-@@ -3236,6 +3237,7 @@ struct softnet_data {
-       /* stats */
-       unsigned int            processed;
-       unsigned int            time_squeeze;
-+      unsigned int            process_queue_empty;
- #ifdef CONFIG_RPS
-       struct softnet_data     *rps_ipi_list;
- #endif
---- a/net/core/dev.c
-+++ b/net/core/dev.c
-@@ -4729,7 +4729,7 @@ static void napi_schedule_rps(struct sof
-       struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
- #ifdef CONFIG_RPS
--      if (sd != mysd) {
-+      if (sd != mysd && !test_bit(NAPI_STATE_THREADED, &sd->backlog.state)) {
-               sd->rps_ipi_next = mysd->rps_ipi_list;
-               mysd->rps_ipi_list = sd;
-@@ -5848,6 +5848,8 @@ static DEFINE_PER_CPU(struct work_struct
- /* Network device is going away, flush any packets still pending */
- static void flush_backlog(struct work_struct *work)
- {
-+      unsigned int process_queue_empty;
-+      bool threaded, flush_processq;
-       struct sk_buff *skb, *tmp;
-       struct softnet_data *sd;
-@@ -5862,8 +5864,17 @@ static void flush_backlog(struct work_st
-                       input_queue_head_incr(sd);
-               }
-       }
-+
-+      threaded = test_bit(NAPI_STATE_THREADED, &sd->backlog.state);
-+      flush_processq = threaded &&
-+                       !skb_queue_empty_lockless(&sd->process_queue);
-+      if (flush_processq)
-+              process_queue_empty = sd->process_queue_empty;
-       rps_unlock_irq_enable(sd);
-+      if (threaded)
-+              goto out;
-+
-       skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
-               if (skb->dev->reg_state == NETREG_UNREGISTERING) {
-                       __skb_unlink(skb, &sd->process_queue);
-@@ -5871,7 +5882,16 @@ static void flush_backlog(struct work_st
-                       input_queue_head_incr(sd);
-               }
-       }
-+
-+out:
-       local_bh_enable();
-+
-+      while (flush_processq) {
-+              msleep(1);
-+              rps_lock_irq_disable(sd);
-+              flush_processq = process_queue_empty == sd->process_queue_empty;
-+              rps_unlock_irq_enable(sd);
-+      }
- }
- static bool flush_required(int cpu)
-@@ -6003,6 +6023,7 @@ static int process_backlog(struct napi_s
-               }
-               rps_lock_irq_disable(sd);
-+              sd->process_queue_empty++;
-               if (skb_queue_empty(&sd->input_pkt_queue)) {
-                       /*
-                        * Inline a custom version of __napi_complete().
-@@ -6012,7 +6033,8 @@ static int process_backlog(struct napi_s
-                        * We can use a plain write instead of clear_bit(),
-                        * and we dont need an smp_mb() memory barrier.
-                        */
--                      napi->state = 0;
-+                      napi->state &= ~(NAPIF_STATE_SCHED |
-+                                       NAPIF_STATE_SCHED_THREADED);
-                       again = false;
-               } else {
-                       skb_queue_splice_tail_init(&sd->input_pkt_queue,
-@@ -6426,6 +6448,55 @@ int dev_set_threaded(struct net_device *
- }
- EXPORT_SYMBOL(dev_set_threaded);
-+int backlog_set_threaded(bool threaded)
-+{
-+      static bool backlog_threaded;
-+      int err = 0;
-+      int i;
-+
-+      if (backlog_threaded == threaded)
-+              return 0;
-+
-+      for_each_possible_cpu(i) {
-+              struct softnet_data *sd = &per_cpu(softnet_data, i);
-+              struct napi_struct *n = &sd->backlog;
-+
-+              if (n->thread)
-+                      continue;
-+              n->thread = kthread_run(napi_threaded_poll, n, "napi/backlog-%d", i);
-+              if (IS_ERR(n->thread)) {
-+                      err = PTR_ERR(n->thread);
-+                      pr_err("kthread_run failed with err %d\n", err);
-+                      n->thread = NULL;
-+                      threaded = false;
-+                      break;
-+              }
-+
-+      }
-+
-+      backlog_threaded = threaded;
-+
-+      /* Make sure kthread is created before THREADED bit
-+       * is set.
-+       */
-+      smp_mb__before_atomic();
-+
-+      for_each_possible_cpu(i) {
-+              struct softnet_data *sd = &per_cpu(softnet_data, i);
-+              struct napi_struct *n = &sd->backlog;
-+              unsigned long flags;
-+
-+              rps_lock_irqsave(sd, &flags);
-+              if (threaded)
-+                      n->state |= NAPIF_STATE_THREADED;
-+              else
-+                      n->state &= ~NAPIF_STATE_THREADED;
-+              rps_unlock_irq_restore(sd, &flags);
-+      }
-+
-+      return err;
-+}
-+
- void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
-                          int (*poll)(struct napi_struct *, int), int weight)
- {
-@@ -11307,6 +11378,9 @@ static int dev_cpu_dead(unsigned int old
-       raise_softirq_irqoff(NET_TX_SOFTIRQ);
-       local_irq_enable();
-+      if (test_bit(NAPI_STATE_THREADED, &oldsd->backlog.state))
-+              return 0;
-+
- #ifdef CONFIG_RPS
-       remsd = oldsd->rps_ipi_list;
-       oldsd->rps_ipi_list = NULL;
-@@ -11622,6 +11696,7 @@ static int __init net_dev_init(void)
-               INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
-               spin_lock_init(&sd->defer_lock);
-+              INIT_LIST_HEAD(&sd->backlog.poll_list);
-               init_gro_hash(&sd->backlog);
-               sd->backlog.poll = process_backlog;
-               sd->backlog.weight = weight_p;
---- a/net/core/sysctl_net_core.c
-+++ b/net/core/sysctl_net_core.c
-@@ -30,6 +30,7 @@ static int int_3600 = 3600;
- static int min_sndbuf = SOCK_MIN_SNDBUF;
- static int min_rcvbuf = SOCK_MIN_RCVBUF;
- static int max_skb_frags = MAX_SKB_FRAGS;
-+static int backlog_threaded;
- static int min_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE;
- static int net_msg_warn;      /* Unused, but still a sysctl */
-@@ -189,6 +190,23 @@ static int rps_sock_flow_sysctl(struct c
- }
- #endif /* CONFIG_RPS */
-+static int backlog_threaded_sysctl(struct ctl_table *table, int write,
-+                             void *buffer, size_t *lenp, loff_t *ppos)
-+{
-+      static DEFINE_MUTEX(backlog_threaded_mutex);
-+      int ret;
-+
-+      mutex_lock(&backlog_threaded_mutex);
-+
-+      ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-+      if (write && !ret)
-+              ret = backlog_set_threaded(backlog_threaded);
-+
-+      mutex_unlock(&backlog_threaded_mutex);
-+
-+      return ret;
-+}
-+
- #ifdef CONFIG_NET_FLOW_LIMIT
- static DEFINE_MUTEX(flow_limit_update_mutex);
-@@ -541,6 +559,15 @@ static struct ctl_table net_core_table[]
-               .proc_handler   = rps_sock_flow_sysctl
-       },
- #endif
-+      {
-+              .procname       = "backlog_threaded",
-+              .data           = &backlog_threaded,
-+              .maxlen         = sizeof(unsigned int),
-+              .mode           = 0644,
-+              .proc_handler   = backlog_threaded_sysctl,
-+              .extra1         = SYSCTL_ZERO,
-+              .extra2         = SYSCTL_ONE
-+      },
- #ifdef CONFIG_NET_FLOW_LIMIT
-       {
-               .procname       = "flow_limit_cpu_bitmap",