--- /dev/null
+From 78ed0a9bc6db76f8e5f5f4cb0d2b2f0d1bb21b24 Mon Sep 17 00:00:00 2001
+From: Roi Dayan <roid@nvidia.com>
+Date: Tue, 13 Apr 2021 11:06:05 +0300
+Subject: [PATCH] netfilter: flowtable: Add FLOW_OFFLOAD_XMIT_UNSPEC xmit type
+
+The xmit type may be left unset, in which case it defaults to
+FLOW_OFFLOAD_XMIT_NEIGH, and for this type the gc expects to have route info.
+Fix that by adding FLOW_OFFLOAD_XMIT_UNSPEC, which is the new default (0).
+
+Fixes: 8b9229d15877 ("netfilter: flowtable: dst_check() from garbage collector path")
+Signed-off-by: Roi Dayan <roid@nvidia.com>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+---
+ include/net/netfilter/nf_flow_table.h | 3 ++-
+ net/netfilter/nf_flow_table_core.c | 3 +++
+ 2 files changed, 5 insertions(+), 1 deletion(-)
+
+--- a/include/net/netfilter/nf_flow_table.h
++++ b/include/net/netfilter/nf_flow_table.h
+@@ -90,7 +90,8 @@ enum flow_offload_tuple_dir {
+ #define FLOW_OFFLOAD_DIR_MAX IP_CT_DIR_MAX
+
+ enum flow_offload_xmit_type {
+- FLOW_OFFLOAD_XMIT_NEIGH = 0,
++ FLOW_OFFLOAD_XMIT_UNSPEC = 0,
++ FLOW_OFFLOAD_XMIT_NEIGH,
+ FLOW_OFFLOAD_XMIT_XFRM,
+ FLOW_OFFLOAD_XMIT_DIRECT,
+ };
+--- a/net/netfilter/nf_flow_table_core.c
++++ b/net/netfilter/nf_flow_table_core.c
+@@ -130,6 +130,9 @@ static int flow_offload_fill_route(struc
+ flow_tuple->dst_cache = dst;
+ flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple);
+ break;
++ default:
++ WARN_ON_ONCE(1);
++ break;
+ }
+ flow_tuple->xmit_type = route->tuple[dir].xmit_type;
+
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
-@@ -328,7 +328,11 @@ EXPORT_SYMBOL_GPL(flow_offload_add);
+@@ -331,7 +331,11 @@ EXPORT_SYMBOL_GPL(flow_offload_add);
void flow_offload_refresh(struct nf_flowtable *flow_table,
struct flow_offload *flow)
{
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
-@@ -433,33 +433,12 @@ nf_flow_table_iterate(struct nf_flowtabl
+@@ -436,33 +436,12 @@ nf_flow_table_iterate(struct nf_flowtabl
return err;
}
--- /dev/null
+From 92fb15513edc6ae1eb51f717e70d4d3d538c2d09 Mon Sep 17 00:00:00 2001
+From: Pablo Neira Ayuso <pablo@netfilter.org>
+Date: Mon, 19 Jul 2021 18:04:01 +0200
+Subject: [PATCH] netfilter: flowtable: remove nf_ct_l4proto_find() call
+
+TCP and UDP are built-in conntrack protocol trackers and the flowtable
+only supports TCP and UDP, so remove this call.
+
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+---
+ net/netfilter/nf_flow_table_core.c | 10 ----------
+ 1 file changed, 10 deletions(-)
+
+--- a/net/netfilter/nf_flow_table_core.c
++++ b/net/netfilter/nf_flow_table_core.c
+@@ -180,15 +180,10 @@ static void flow_offload_fixup_tcp(struc
+
+ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
+ {
+- const struct nf_conntrack_l4proto *l4proto;
+ struct net *net = nf_ct_net(ct);
+ int l4num = nf_ct_protonum(ct);
+ unsigned int timeout;
+
+- l4proto = nf_ct_l4proto_find(l4num);
+- if (!l4proto)
+- return;
+-
+ if (l4num == IPPROTO_TCP) {
+ struct nf_tcp_net *tn = nf_tcp_pernet(net);
+
+@@ -273,15 +268,10 @@ static const struct rhashtable_params nf
+
+ unsigned long flow_offload_get_timeout(struct flow_offload *flow)
+ {
+- const struct nf_conntrack_l4proto *l4proto;
+ unsigned long timeout = NF_FLOW_TIMEOUT;
+ struct net *net = nf_ct_net(flow->ct);
+ int l4num = nf_ct_protonum(flow->ct);
+
+- l4proto = nf_ct_l4proto_find(l4num);
+- if (!l4proto)
+- return timeout;
+-
+ if (l4num == IPPROTO_TCP) {
+ struct nf_tcp_net *tn = nf_tcp_pernet(net);
+
--- /dev/null
+From 4592ee7f525c4683ec9e290381601fdee50ae110 Mon Sep 17 00:00:00 2001
+From: Florian Westphal <fw@strlen.de>
+Date: Wed, 4 Aug 2021 15:02:15 +0200
+Subject: [PATCH] netfilter: conntrack: remove offload_pickup sysctl again
+
+These two sysctls were added because the hardcoded defaults (2 minutes,
+tcp, 30 seconds, udp) turned out to be too low for some setups.
+
+They appeared in 5.14-rc1 so it should be fine to remove them again.
+
+Marcelo convinced me that there should be no difference between a flow
+that was offloaded vs. a flow that was not wrt. timeout handling.
+Thus the default is changed to those for TCP established and UDP stream,
+5 days and 120 seconds, respectively.
+
+Marcelo also suggested to account for the timeout value used for the
+offloading, this avoids increase beyond the value in the conntrack-sysctl
+and will also instantly expire the conntrack entry with altered sysctls.
+
+Example:
+ nf_conntrack_udp_timeout_stream=60
+ nf_flowtable_udp_timeout=60
+
+This will remove offloaded udp flows after one minute, rather than two.
+
+An earlier version of this patch also cleared the ASSURED bit to
+allow nf_conntrack to evict the entry via early_drop (i.e., table full).
+However, it looks like we can safely assume that connection timed out
+via HW is still in established state, so this isn't needed.
+
+Quoting Oz:
+ [..] the hardware sends all packets with a set FIN flags to sw.
+ [..] Connections that are aged in hardware are expected to be in the
+ established state.
+
+In case it turns out that back-to-sw-path transition can occur for
+'dodgy' connections too (e.g., one side disappeared while software-path
+would have been in RETRANS timeout), we can adjust this later.
+
+Cc: Oz Shlomo <ozsh@nvidia.com>
+Cc: Paul Blakey <paulb@nvidia.com>
+Suggested-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Reviewed-by: Oz Shlomo <ozsh@nvidia.com>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+---
+ Documentation/networking/nf_conntrack-sysctl.rst | 10 ----------
+ include/net/netns/conntrack.h | 2 --
+ net/netfilter/nf_conntrack_proto_tcp.c | 1 -
+ net/netfilter/nf_conntrack_proto_udp.c | 1 -
+ net/netfilter/nf_conntrack_standalone.c | 16 ----------------
+ net/netfilter/nf_flow_table_core.c | 11 ++++++++---
+ 6 files changed, 8 insertions(+), 33 deletions(-)
+
+--- a/include/net/netns/conntrack.h
++++ b/include/net/netns/conntrack.h
+@@ -29,7 +29,6 @@ struct nf_tcp_net {
+ int tcp_max_retrans;
+ #if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ unsigned int offload_timeout;
+- unsigned int offload_pickup;
+ #endif
+ };
+
+@@ -43,7 +42,6 @@ struct nf_udp_net {
+ unsigned int timeouts[UDP_CT_MAX];
+ #if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ unsigned int offload_timeout;
+- unsigned int offload_pickup;
+ #endif
+ };
+
+--- a/net/netfilter/nf_conntrack_proto_tcp.c
++++ b/net/netfilter/nf_conntrack_proto_tcp.c
+@@ -1450,7 +1450,6 @@ void nf_conntrack_tcp_init_net(struct ne
+
+ #if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ tn->offload_timeout = 30 * HZ;
+- tn->offload_pickup = 120 * HZ;
+ #endif
+ }
+
+--- a/net/netfilter/nf_conntrack_proto_udp.c
++++ b/net/netfilter/nf_conntrack_proto_udp.c
+@@ -276,7 +276,6 @@ void nf_conntrack_udp_init_net(struct ne
+
+ #if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ un->offload_timeout = 30 * HZ;
+- un->offload_pickup = 30 * HZ;
+ #endif
+ }
+
+--- a/net/netfilter/nf_conntrack_standalone.c
++++ b/net/netfilter/nf_conntrack_standalone.c
+@@ -569,7 +569,6 @@ enum nf_ct_sysctl_index {
+ NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_UNACK,
+ #if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD,
+- NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD_PICKUP,
+ #endif
+ NF_SYSCTL_CT_PROTO_TCP_LOOSE,
+ NF_SYSCTL_CT_PROTO_TCP_LIBERAL,
+@@ -578,7 +577,6 @@ enum nf_ct_sysctl_index {
+ NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM,
+ #if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD,
+- NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD_PICKUP,
+ #endif
+ NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP,
+ NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6,
+@@ -773,12 +771,6 @@ static struct ctl_table nf_ct_sysctl_tab
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+- [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD_PICKUP] = {
+- .procname = "nf_flowtable_tcp_pickup",
+- .maxlen = sizeof(unsigned int),
+- .mode = 0644,
+- .proc_handler = proc_dointvec_jiffies,
+- },
+ #endif
+ [NF_SYSCTL_CT_PROTO_TCP_LOOSE] = {
+ .procname = "nf_conntrack_tcp_loose",
+@@ -821,12 +813,6 @@ static struct ctl_table nf_ct_sysctl_tab
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+- [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD_PICKUP] = {
+- .procname = "nf_flowtable_udp_pickup",
+- .maxlen = sizeof(unsigned int),
+- .mode = 0644,
+- .proc_handler = proc_dointvec_jiffies,
+- },
+ #endif
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP] = {
+ .procname = "nf_conntrack_icmp_timeout",
+@@ -1006,7 +992,6 @@ static void nf_conntrack_standalone_init
+
+ #if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ table[NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD].data = &tn->offload_timeout;
+- table[NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD_PICKUP].data = &tn->offload_pickup;
+ #endif
+
+ }
+@@ -1098,7 +1083,6 @@ static int nf_conntrack_standalone_init_
+ table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM].data = &un->timeouts[UDP_CT_REPLIED];
+ #if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD].data = &un->offload_timeout;
+- table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD_PICKUP].data = &un->offload_pickup;
+ #endif
+
+ nf_conntrack_standalone_init_tcp_sysctl(net, table);
+--- a/net/netfilter/nf_flow_table_core.c
++++ b/net/netfilter/nf_flow_table_core.c
+@@ -182,20 +182,25 @@ static void flow_offload_fixup_ct_timeou
+ {
+ struct net *net = nf_ct_net(ct);
+ int l4num = nf_ct_protonum(ct);
+- unsigned int timeout;
++ s32 timeout;
+
+ if (l4num == IPPROTO_TCP) {
+ struct nf_tcp_net *tn = nf_tcp_pernet(net);
+
+- timeout = tn->offload_pickup;
++ timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
++ timeout -= tn->offload_timeout;
+ } else if (l4num == IPPROTO_UDP) {
+ struct nf_udp_net *tn = nf_udp_pernet(net);
+
+- timeout = tn->offload_pickup;
++ timeout = tn->timeouts[UDP_CT_REPLIED];
++ timeout -= tn->offload_timeout;
+ } else {
+ return;
+ }
+
++ if (timeout < 0)
++ timeout = 0;
++
+ if (nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout)
+ WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout);
+ }
--- /dev/null
+From b8835ba8c029b5c9ada5666754526c2b00f7ea80 Mon Sep 17 00:00:00 2001
+From: Pablo Neira Ayuso <pablo@netfilter.org>
+Date: Tue, 17 May 2022 10:44:14 +0200
+Subject: netfilter: flowtable: fix TCP flow teardown
+
+[ Upstream commit e5eaac2beb54f0a16ff851125082d9faeb475572 ]
+
+This patch addresses three possible problems:
+
+1. ct gc may race to undo the timeout adjustment of the packet path, leaving
+ the conntrack entry in place with the internal offload timeout (one day).
+
+2. ct gc removes the ct because the IPS_OFFLOAD_BIT is not set and the CLOSE
+ timeout is reached before the flow offload del.
+
+3. tcp ct is always set to ESTABLISHED with a very long timeout
+ in flow offload teardown/delete even though the state might be already
+ CLOSED. Also as a remark we cannot assume that the FIN or RST packet
+ is hitting flow table teardown as the packet might get bumped to the
+ slow path in nftables.
+
+This patch resets IPS_OFFLOAD_BIT from flow_offload_teardown(), so
+conntrack handles the tcp rst/fin packet which triggers the CLOSE/FIN
+state transition.
+
+Moreover, return the connection's ownership to conntrack upon teardown
+by clearing the offload flag and fixing the established timeout value.
+The flow table GC thread will asynchronously free the flow table and
+hardware offload entries.
+
+Before this patch, the IPS_OFFLOAD_BIT remained set for expired flows,
+which is also misleading since the flow is back on the classic conntrack
+path.
+
+If nf_ct_delete() removes the entry from the conntrack table, then it
+calls nf_ct_put() which decrements the refcnt. This is not a problem
+because the flowtable holds a reference to the conntrack object from
+flow_offload_alloc() path which is released via flow_offload_free().
+
+This patch also updates nft_flow_offload to skip packets in SYN_RECV
+state. Since we might miss or bump packets to slow path, we do not know
+what will happen there while we are still in SYN_RECV, this patch
+postpones offload up to the next packet which also aligns to the
+existing behaviour in tc-ct.
+
+flow_offload_teardown() does not reset the existing tcp state from
+flow_offload_fixup_tcp() to ESTABLISHED anymore; packets bumped to the slow
+path might have already updated the state to CLOSE/FIN.
+
+Joint work with Oz and Sven.
+
+Fixes: 1e5b2471bcc4 ("netfilter: nf_flow_table: teardown flow timeout race")
+Signed-off-by: Oz Shlomo <ozsh@nvidia.com>
+Signed-off-by: Sven Auhagen <sven.auhagen@voleatech.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_flow_table_core.c | 33 +++++++-----------------------
+ net/netfilter/nft_flow_offload.c | 3 ++-
+ 2 files changed, 9 insertions(+), 27 deletions(-)
+
+--- a/net/netfilter/nf_flow_table_core.c
++++ b/net/netfilter/nf_flow_table_core.c
+@@ -173,12 +173,11 @@ EXPORT_SYMBOL_GPL(flow_offload_route_ini
+
+ static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
+ {
+- tcp->state = TCP_CONNTRACK_ESTABLISHED;
+ tcp->seen[0].td_maxwin = 0;
+ tcp->seen[1].td_maxwin = 0;
+ }
+
+-static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
++static void flow_offload_fixup_ct(struct nf_conn *ct)
+ {
+ struct net *net = nf_ct_net(ct);
+ int l4num = nf_ct_protonum(ct);
+@@ -187,7 +186,9 @@ static void flow_offload_fixup_ct_timeou
+ if (l4num == IPPROTO_TCP) {
+ struct nf_tcp_net *tn = nf_tcp_pernet(net);
+
+- timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
++ flow_offload_fixup_tcp(&ct->proto.tcp);
++
++ timeout = tn->timeouts[ct->proto.tcp.state];
+ timeout -= tn->offload_timeout;
+ } else if (l4num == IPPROTO_UDP) {
+ struct nf_udp_net *tn = nf_udp_pernet(net);
+@@ -205,18 +206,6 @@ static void flow_offload_fixup_ct_timeou
+ WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout);
+ }
+
+-static void flow_offload_fixup_ct_state(struct nf_conn *ct)
+-{
+- if (nf_ct_protonum(ct) == IPPROTO_TCP)
+- flow_offload_fixup_tcp(&ct->proto.tcp);
+-}
+-
+-static void flow_offload_fixup_ct(struct nf_conn *ct)
+-{
+- flow_offload_fixup_ct_state(ct);
+- flow_offload_fixup_ct_timeout(ct);
+-}
+-
+ static void flow_offload_route_release(struct flow_offload *flow)
+ {
+ nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
+@@ -353,22 +342,14 @@ static void flow_offload_del(struct nf_f
+ rhashtable_remove_fast(&flow_table->rhashtable,
+ &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
+ nf_flow_offload_rhash_params);
+-
+- clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
+-
+- if (nf_flow_has_expired(flow))
+- flow_offload_fixup_ct(flow->ct);
+- else
+- flow_offload_fixup_ct_timeout(flow->ct);
+-
+ flow_offload_free(flow);
+ }
+
+ void flow_offload_teardown(struct flow_offload *flow)
+ {
++ clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
+ set_bit(NF_FLOW_TEARDOWN, &flow->flags);
+-
+- flow_offload_fixup_ct_state(flow->ct);
++ flow_offload_fixup_ct(flow->ct);
+ }
+ EXPORT_SYMBOL_GPL(flow_offload_teardown);
+
+@@ -437,7 +418,7 @@ static void nf_flow_offload_gc_step(stru
+
+ if (nf_flow_has_expired(flow) ||
+ nf_ct_is_dying(flow->ct))
+- set_bit(NF_FLOW_TEARDOWN, &flow->flags);
++ flow_offload_teardown(flow);
+
+ if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) {
+ if (test_bit(NF_FLOW_HW, &flow->flags)) {
+--- a/net/netfilter/nft_flow_offload.c
++++ b/net/netfilter/nft_flow_offload.c
+@@ -268,6 +268,12 @@ static bool nft_flow_offload_skip(struct
+ return false;
+ }
+
++static bool nf_conntrack_tcp_established(const struct nf_conn *ct)
++{
++ return ct->proto.tcp.state == TCP_CONNTRACK_ESTABLISHED &&
++ test_bit(IPS_ASSURED_BIT, &ct->status);
++}
++
+ static void nft_flow_offload_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+@@ -293,7 +299,8 @@ static void nft_flow_offload_eval(const
+ case IPPROTO_TCP:
+ tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff,
+ sizeof(_tcph), &_tcph);
+- if (unlikely(!tcph || tcph->fin || tcph->rst))
++ if (unlikely(!tcph || tcph->fin || tcph->rst ||
++ !nf_conntrack_tcp_established(ct)))
+ goto out;
+ break;
+ case IPPROTO_UDP:
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
-@@ -401,8 +400,7 @@ flow_offload_lookup(struct nf_flowtable
+@@ -380,8 +379,7 @@ flow_offload_lookup(struct nf_flowtable
}
EXPORT_SYMBOL_GPL(flow_offload_lookup);
void (*iter)(struct flow_offload *flow, void *data),
void *data)
{
-@@ -434,6 +432,7 @@ nf_flow_table_iterate(struct nf_flowtabl
+@@ -413,6 +411,7 @@ nf_flow_table_iterate(struct nf_flowtabl
return err;
}
+#endif /* _XT_FLOWOFFLOAD_H */
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
-@@ -270,6 +270,10 @@ void nf_flow_table_free(struct nf_flowta
+@@ -271,6 +271,10 @@ void nf_flow_table_free(struct nf_flowta
void flow_offload_teardown(struct flow_offload *flow);
static bool enable_hooks __read_mostly;
MODULE_PARM_DESC(enable_hooks, "Always enable conntrack hooks");
module_param(enable_hooks, bool, 0000);
-@@ -660,6 +663,7 @@ enum nf_ct_sysctl_index {
+@@ -658,6 +661,7 @@ enum nf_ct_sysctl_index {
NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM,
#endif
__NF_SYSCTL_CT_LAST_SYSCTL,
};
-@@ -1014,6 +1018,13 @@ static struct ctl_table nf_ct_sysctl_tab
+@@ -1000,6 +1004,13 @@ static struct ctl_table nf_ct_sysctl_tab
.proc_handler = proc_dointvec_jiffies,
},
#endif
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
-@@ -331,8 +331,10 @@ void flow_offload_refresh(struct nf_flow
+@@ -318,8 +318,10 @@ void flow_offload_refresh(struct nf_flow
u32 timeout;
timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow);