--- /dev/null
+From 21d81d05787908b13a4079f42a63a5b3254b7ab4 Mon Sep 17 00:00:00 2001
+From: Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>
+Date: Wed, 13 Mar 2019 20:54:49 +0000
+Subject: [PATCH] net: sched: Introduce act_ctinfo action
+
+ctinfo is a new tc filter action module. It is designed to restore DSCPs
+stored in conntrack marks into the ipv4/v6 diffserv field.
+
+The feature is intended for use and has been found useful for restoring
+ingress classifications based on egress classifications across links
+that bleach or otherwise change DSCP, typically home ISP Internet links.
+Restoring DSCP on ingress on the WAN link allows qdiscs such as CAKE to
+shape inbound packets according to policies that are easier to implement
+on egress.
+
+Ingress classification is traditionally a challenging task since
+iptables rules haven't yet run and tc filter/eBPF programs are pre-NAT
+lookups, hence are unable to see internal IPv4 addresses as used on the
+typical home masquerading gateway.
+
+ctinfo understands the following parameters:
+
+dscp mask[/statemask]
+
+mask - a 32 bit mask of exactly 6 contiguous bits indicating where ctinfo
+will find the DSCP in the conntrack mark. The DSCP is recovered by
+right-shifting the masked mark by the number of unset lower bits of the
+mask.
+
+statemask - a 32 bit mask of (usually) 1 bit length, outside the area
+specified by mask. This represents a conditional operation flag: the
+DSCP is only restored if the flag is set. This is useful to implement a
+'one shot' iptables based classification where the 'complicated'
+iptables rules are only run once to classify the connection on initial
+(egress) packet and subsequent packets are all marked/restored with the
+same DSCP. A mask of zero disables the conditional behaviour.
+
+optional parameters:
+
+zone - conntrack zone
+
+control - action related control (reclassify | pipe | drop | continue |
+ok | goto chain <CHAIN_INDEX>)
+
+Signed-off-by: Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>
+---
+ include/net/tc_act/tc_ctinfo.h | 28 ++
+ include/uapi/linux/pkt_cls.h | 3 +-
+ include/uapi/linux/tc_act/tc_ctinfo.h | 34 +++
+ net/sched/Kconfig | 13 +
+ net/sched/Makefile | 1 +
+ net/sched/act_ctinfo.c | 394 ++++++++++++++++++++++++++
+ 6 files changed, 472 insertions(+), 1 deletion(-)
+ create mode 100644 include/net/tc_act/tc_ctinfo.h
+ create mode 100644 include/uapi/linux/tc_act/tc_ctinfo.h
+ create mode 100644 net/sched/act_ctinfo.c
+
+diff --git a/include/net/tc_act/tc_ctinfo.h b/include/net/tc_act/tc_ctinfo.h
+new file mode 100644
+index 000000000000..d6a688571672
+--- /dev/null
++++ b/include/net/tc_act/tc_ctinfo.h
+@@ -0,0 +1,28 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++#ifndef __NET_TC_CTINFO_H
++#define __NET_TC_CTINFO_H
++
++#include <net/act_api.h>
++
++struct tcf_ctinfo_params {
++ struct rcu_head rcu;
++ struct net *net;
++ u32 dscpmask;
++ u32 dscpstatemask;
++ u32 cpmarkmask;
++ u16 zone;
++ u8 mode;
++ u8 dscpmaskshift;
++};
++
++struct tcf_ctinfo {
++ struct tc_action common;
++ struct tcf_ctinfo_params __rcu *params;
++ u64 stats_dscp_set;
++ u64 stats_dscp_error;
++ u64 stats_cpmark_set;
++};
++
++#define to_ctinfo(a) ((struct tcf_ctinfo *)a)
++
++#endif /* __NET_TC_CTINFO_H */
+diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
+index 46c506615f4a..408b02fbb34a 100644
+--- a/include/uapi/linux/pkt_cls.h
++++ b/include/uapi/linux/pkt_cls.h
+@@ -66,7 +66,8 @@ enum {
+ TCA_ID_UNSPEC=0,
+ TCA_ID_POLICE=1,
+ /* other actions go here */
+- __TCA_ID_MAX=255
++ TCA_ID_CTINFO=27,
++ __TCA_ID_MAX = 255
+ };
+
+ #define TCA_ID_MAX __TCA_ID_MAX
+diff --git a/include/uapi/linux/tc_act/tc_ctinfo.h b/include/uapi/linux/tc_act/tc_ctinfo.h
+new file mode 100644
+index 000000000000..da803e05a89b
+--- /dev/null
++++ b/include/uapi/linux/tc_act/tc_ctinfo.h
+@@ -0,0 +1,34 @@
++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
++#ifndef __UAPI_TC_CTINFO_H
++#define __UAPI_TC_CTINFO_H
++
++#include <linux/types.h>
++#include <linux/pkt_cls.h>
++
++struct tc_ctinfo {
++ tc_gen;
++};
++
++enum {
++ TCA_CTINFO_UNSPEC,
++ TCA_CTINFO_PAD,
++ TCA_CTINFO_TM,
++ TCA_CTINFO_ACT,
++ TCA_CTINFO_ZONE,
++ TCA_CTINFO_PARMS_DSCP_MASK,
++ TCA_CTINFO_PARMS_DSCP_STATEMASK,
++ TCA_CTINFO_PARMS_CPMARK_MASK,
++ TCA_CTINFO_STATS_DSCP_SET,
++ TCA_CTINFO_STATS_DSCP_ERROR,
++ TCA_CTINFO_STATS_CPMARK_SET,
++ __TCA_CTINFO_MAX
++};
++
++#define TCA_CTINFO_MAX (__TCA_CTINFO_MAX - 1)
++
++enum { /* plain shifts: BIT() is not defined for userspace users of this header */
++	CTINFO_MODE_DSCP = (1 << 0), /* restore DSCP from the conntrack mark */
++	CTINFO_MODE_CPMARK = (1 << 1) /* copy the masked conntrack mark to skb->mark */
++};
++
++#endif
+diff --git a/net/sched/Kconfig b/net/sched/Kconfig
+index e70ed26485a2..962d90f72f54 100644
+--- a/net/sched/Kconfig
++++ b/net/sched/Kconfig
+@@ -808,6 +808,19 @@ config NET_ACT_CONNMARK
+ To compile this code as a module, choose M here: the
+ module will be called act_connmark.
+
++config NET_ACT_CTINFO
++ tristate "Netfilter Connmark to DSCP Retriever"
++ depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
++ depends on NF_CONNTRACK && NF_CONNTRACK_MARK
++ help
++ Say Y here to allow transfer of a connmark stored DSCP into
++ ipv4/v6 diffserv
++
++ If unsure, say N.
++
++ To compile this code as a module, choose M here: the
++ module will be called act_ctinfo.
++
+ config NET_ACT_SKBMOD
+ tristate "skb data modification action"
+ depends on NET_CLS_ACT
+diff --git a/net/sched/Makefile b/net/sched/Makefile
+index 9e43a4721ef8..44ee5b87b895 100644
+--- a/net/sched/Makefile
++++ b/net/sched/Makefile
+@@ -21,6 +21,7 @@ obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o
+ obj-$(CONFIG_NET_ACT_VLAN) += act_vlan.o
+ obj-$(CONFIG_NET_ACT_BPF) += act_bpf.o
+ obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o
++obj-$(CONFIG_NET_ACT_CTINFO) += act_ctinfo.o
+ obj-$(CONFIG_NET_ACT_SKBMOD) += act_skbmod.o
+ obj-$(CONFIG_NET_ACT_IFE) += act_ife.o
+ obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o
+diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c
+new file mode 100644
+index 000000000000..e65344e32801
+--- /dev/null
++++ b/net/sched/act_ctinfo.c
+@@ -0,0 +1,394 @@
++// SPDX-License-Identifier: GPL-2.0+
++/* net/sched/act_ctinfo.c netfilter ctinfo connmark actions
++ *
++ * Copyright (c) 2019 Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>
++ */
++
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/skbuff.h>
++#include <linux/rtnetlink.h>
++#include <linux/pkt_cls.h>
++#include <linux/ip.h>
++#include <linux/ipv6.h>
++#include <net/netlink.h>
++#include <net/pkt_sched.h>
++#include <net/act_api.h>
++#include <net/pkt_cls.h>
++#include <uapi/linux/tc_act/tc_ctinfo.h>
++#include <net/tc_act/tc_ctinfo.h>
++
++#include <net/netfilter/nf_conntrack.h>
++#include <net/netfilter/nf_conntrack_core.h>
++#include <net/netfilter/nf_conntrack_ecache.h>
++#include <net/netfilter/nf_conntrack_zones.h>
++
++static struct tc_action_ops act_ctinfo_ops;
++static unsigned int ctinfo_net_id;
++
++static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca,
++ struct tcf_ctinfo_params *cp,
++ struct sk_buff *skb, int wlen, int proto)
++{
++ u8 dscp,newdscp;
++
++ newdscp = (((ct->mark & cp->dscpmask) >> cp->dscpmaskshift) << 2) &
++ ~INET_ECN_MASK;
++
++ /* mark contains DSCP so restore DSCP bits from ct->mark into diffserv */
++ /* stats_dscp_set counts rewrites, stats_dscp_error unwritable skbs */
++ switch (proto) {
++ case NFPROTO_IPV4:
++ dscp = ipv4_get_dsfield(ip_hdr(skb)) & ~INET_ECN_MASK;
++ if (dscp != newdscp) {
++ if (likely(!skb_try_make_writable(skb, wlen))) {
++ ipv4_change_dsfield(ip_hdr(skb),
++ INET_ECN_MASK,
++ newdscp);
++ ca->stats_dscp_set++;
++ } else {
++ ca->stats_dscp_error++;
++ }
++ }
++ break;
++ case NFPROTO_IPV6:
++ dscp = ipv6_get_dsfield(ipv6_hdr(skb)) & ~INET_ECN_MASK;
++ if (dscp != newdscp) {
++ if (likely(!skb_try_make_writable(skb, wlen))) {
++ ipv6_change_dsfield(ipv6_hdr(skb),
++ INET_ECN_MASK,
++ newdscp);
++ ca->stats_dscp_set++;
++ } else {
++ ca->stats_dscp_error++;
++ }
++ }
++ break;
++ default:
++ break;
++ }
++}
++
++static void tcf_ctinfo_cpmark_set(struct nf_conn *ct, struct tcf_ctinfo *ca,
++ struct tcf_ctinfo_params *cp,
++ struct sk_buff *skb)
++{
++ ca->stats_cpmark_set++;
++ skb->mark = ct->mark & cp->cpmarkmask;
++}
++
++static int tcf_ctinfo_act(struct sk_buff *skb, const struct tc_action *a,
++ struct tcf_result *res)
++{
++ const struct nf_conntrack_tuple_hash *thash = NULL;
++ struct tcf_ctinfo *ca = to_ctinfo(a);
++ struct nf_conntrack_tuple tuple;
++ struct nf_conntrack_zone zone;
++ enum ip_conntrack_info ctinfo;
++ struct tcf_ctinfo_params *cp;
++ struct nf_conn *ct;
++ int proto, wlen;
++ int action;
++
++ cp = rcu_dereference_bh(ca->params);
++
++ tcf_lastuse_update(&ca->tcf_tm);
++ bstats_update(&ca->tcf_bstats, skb);
++ action = READ_ONCE(ca->tcf_action);
++ /* wlen = network header offset plus the fixed IPv4/IPv6 header size */
++ wlen = skb_network_offset(skb);
++ if (tc_skb_protocol(skb) == htons(ETH_P_IP)) {
++ wlen += sizeof(struct iphdr);
++ if (!pskb_may_pull(skb, wlen))
++ goto out;
++
++ proto = NFPROTO_IPV4;
++ } else if (tc_skb_protocol(skb) == htons(ETH_P_IPV6)) {
++ wlen += sizeof(struct ipv6hdr);
++ if (!pskb_may_pull(skb, wlen))
++ goto out;
++
++ proto = NFPROTO_IPV6;
++ } else {
++ goto out;
++ }
++ /* use the conntrack entry attached to the skb when there is one */
++ ct = nf_ct_get(skb, &ctinfo);
++ if (!ct) { /* look harder, usually ingress */
++ if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
++ proto, cp->net, &tuple))
++ goto out;
++ zone.id = cp->zone;
++ zone.dir = NF_CT_DEFAULT_ZONE_DIR;
++
++ thash = nf_conntrack_find_get(cp->net, &zone, &tuple);
++ if (!thash)
++ goto out;
++
++ ct = nf_ct_tuplehash_to_ctrack(thash);
++ }
++ /* statemask, when set, gates the DSCP restore on a flag in the mark */
++ if (cp->mode & CTINFO_MODE_DSCP)
++ if (!cp->dscpstatemask || (ct->mark & cp->dscpstatemask))
++ tcf_ctinfo_dscp_set(ct, ca, cp, skb, wlen, proto);
++
++ if (cp->mode & CTINFO_MODE_CPMARK)
++ tcf_ctinfo_cpmark_set(ct, ca, cp, skb);
++ /* thash is only set when the entry came from the lookup above */
++ if (thash)
++ nf_ct_put(ct);
++out:
++ return action;
++}
++
++static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = {
++ [TCA_CTINFO_ACT] = { .len = sizeof(struct
++ tc_ctinfo) },
++ [TCA_CTINFO_ZONE] = { .type = NLA_U16 },
++ [TCA_CTINFO_PARMS_DSCP_MASK] = { .type = NLA_U32 },
++ [TCA_CTINFO_PARMS_DSCP_STATEMASK] = { .type = NLA_U32 },
++ [TCA_CTINFO_PARMS_CPMARK_MASK] = { .type = NLA_U32 },
++};
++
++/* Validate the netlink attributes, create or look up the action, then
++ * publish a newly allocated parameter block under tcf_lock.
++ */
++static int tcf_ctinfo_init(struct net *net, struct nlattr *nla,
++			   struct nlattr *est, struct tc_action **a,
++			   int ovr, int bind)
++{
++	struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
++	struct nlattr *tb[TCA_CTINFO_MAX + 1];
++	struct tcf_ctinfo_params *cp_new;
++	u32 dscpmask = 0, dscpstatemask = 0;
++	struct tc_ctinfo *actparm;
++	struct tcf_ctinfo *ci;
++	u8 dscpmaskshift = 0;
++	int ret = 0, err;
++
++	if (!nla)
++		return -EINVAL;
++
++	err = nla_parse_nested(tb, TCA_CTINFO_MAX, nla, ctinfo_policy, NULL);
++	if (err < 0)
++		return err;
++
++	if (!tb[TCA_CTINFO_ACT])
++		return -EINVAL;
++	actparm = nla_data(tb[TCA_CTINFO_ACT]);
++
++	/* validate before allocating anything we would have to clean up */
++	if (tb[TCA_CTINFO_PARMS_DSCP_MASK]) {
++		dscpmask = nla_get_u32(tb[TCA_CTINFO_PARMS_DSCP_MASK]);
++		/* need contiguous 6 bit mask */
++		dscpmaskshift = dscpmask ? __ffs(dscpmask) : 0;
++		if ((dscpmask >> dscpmaskshift) != 0x3f)
++			return -EINVAL;
++		dscpstatemask = tb[TCA_CTINFO_PARMS_DSCP_STATEMASK] ?
++			nla_get_u32(tb[TCA_CTINFO_PARMS_DSCP_STATEMASK]) : 0;
++		/* mask & statemask must not overlap */
++		if (dscpmask & dscpstatemask)
++			return -EINVAL;
++	}
++
++	/* done the validation: now to the actual action allocation */
++	err = tcf_idr_check(tn, actparm->index, a, bind);
++	if (!err) {
++		ret = tcf_idr_create(tn, actparm->index, est, a,
++				     &act_ctinfo_ops, bind, false);
++		if (ret)
++			return ret;
++		ret = ACT_P_CREATED;
++	} else if (err > 0) {
++		if (bind) /* don't override defaults */
++			return 0;
++		if (!ovr) {
++			tcf_idr_release(*a, bind);
++			return -EEXIST;
++		}
++	} else {
++		return err;
++	}
++
++	ci = to_ctinfo(*a);
++	cp_new = kzalloc(sizeof(*cp_new), GFP_KERNEL);
++	if (unlikely(!cp_new)) {
++		err = -ENOMEM;
++		goto put_chain;
++	}
++
++	cp_new->net = net;
++	cp_new->zone = tb[TCA_CTINFO_ZONE] ?
++			nla_get_u16(tb[TCA_CTINFO_ZONE]) : 0;
++	if (dscpmask) {
++		cp_new->dscpmask = dscpmask;
++		cp_new->dscpmaskshift = dscpmaskshift;
++		cp_new->dscpstatemask = dscpstatemask;
++		cp_new->mode |= CTINFO_MODE_DSCP;
++	}
++
++	if (tb[TCA_CTINFO_PARMS_CPMARK_MASK]) {
++		cp_new->cpmarkmask =
++			nla_get_u32(tb[TCA_CTINFO_PARMS_CPMARK_MASK]);
++		cp_new->mode |= CTINFO_MODE_CPMARK;
++	}
++
++	spin_lock_bh(&ci->tcf_lock);
++	ci->tcf_action = actparm->action;
++	rcu_swap_protected(ci->params, cp_new,
++			   lockdep_is_held(&ci->tcf_lock));
++	spin_unlock_bh(&ci->tcf_lock);
++
++	if (cp_new) /* old params, if any, after the swap */
++		kfree_rcu(cp_new, rcu);
++
++	if (ret == ACT_P_CREATED)
++		tcf_idr_insert(tn, *a);
++
++	return ret;
++
++put_chain:
++	tcf_idr_release(*a, bind);
++	return err;
++}
++
++static int tcf_ctinfo_dump(struct sk_buff *skb, struct tc_action *a,
++			   int bind, int ref)
++{
++	struct tcf_ctinfo *ci = to_ctinfo(a);
++	struct tc_ctinfo opt = {
++		.index   = ci->tcf_index,
++		.refcnt  = ci->tcf_refcnt - ref,
++		.bindcnt = ci->tcf_bindcnt - bind,
++	};
++	unsigned char *b = skb_tail_pointer(skb);
++	struct tcf_ctinfo_params *cp;
++	struct tcf_t t;
++
++	spin_lock_bh(&ci->tcf_lock);
++	cp = rcu_dereference_protected(ci->params,
++				       lockdep_is_held(&ci->tcf_lock));
++
++	tcf_tm_dump(&t, &ci->tcf_tm);
++	if (nla_put_64bit(skb, TCA_CTINFO_TM, sizeof(t), &t, TCA_CTINFO_PAD))
++		goto nla_put_failure;
++
++	opt.action = ci->tcf_action;
++	if (nla_put(skb, TCA_CTINFO_ACT, sizeof(opt), &opt))
++		goto nla_put_failure;
++
++	if (nla_put_u16(skb, TCA_CTINFO_ZONE, cp->zone))
++		goto nla_put_failure;
++
++	if (cp->mode & CTINFO_MODE_DSCP) {
++		if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_MASK,
++				cp->dscpmask))
++			goto nla_put_failure;
++		if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_STATEMASK,
++				cp->dscpstatemask))
++			goto nla_put_failure;
++	}
++
++	if (cp->mode & CTINFO_MODE_CPMARK) {
++		if (nla_put_u32(skb, TCA_CTINFO_PARMS_CPMARK_MASK,
++				cp->cpmarkmask))
++			goto nla_put_failure;
++	}
++	/* counters updated by the datapath helpers */
++	if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_SET,
++			      ci->stats_dscp_set, TCA_CTINFO_PAD))
++		goto nla_put_failure;
++
++	if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_ERROR,
++			      ci->stats_dscp_error, TCA_CTINFO_PAD))
++		goto nla_put_failure;
++
++	if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_CPMARK_SET,
++			      ci->stats_cpmark_set, TCA_CTINFO_PAD))
++		goto nla_put_failure;
++
++	spin_unlock_bh(&ci->tcf_lock);
++	return skb->len;
++
++nla_put_failure:
++	spin_unlock_bh(&ci->tcf_lock);
++	nlmsg_trim(skb, b);
++	return -1;
++}
++
++static int tcf_ctinfo_walker(struct net *net, struct sk_buff *skb,
++			     struct netlink_callback *cb, int type,
++			     const struct tc_action_ops *ops)
++{
++	struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
++
++	return tcf_generic_walker(tn, skb, cb, type, ops);
++}
++
++static int tcf_ctinfo_search(struct net *net, struct tc_action **a, u32 index)
++{
++	struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
++
++	return tcf_idr_search(tn, a, index);
++}
++
++/* Free the parameter block when the action itself is destroyed. */
++static void tcf_ctinfo_cleanup(struct tc_action *a, int bind)
++{
++	struct tcf_ctinfo_params *cp;
++
++	cp = rcu_dereference_protected(to_ctinfo(a)->params, 1);
++	if (cp)
++		kfree_rcu(cp, rcu);
++}
++
++static struct tc_action_ops act_ctinfo_ops = {
++	.kind = "ctinfo",
++	.type = TCA_ID_CTINFO,
++	.owner = THIS_MODULE,
++	.act = tcf_ctinfo_act,
++	.dump = tcf_ctinfo_dump,
++	.init = tcf_ctinfo_init,
++	.cleanup = tcf_ctinfo_cleanup,
++	.walk = tcf_ctinfo_walker,
++	.lookup = tcf_ctinfo_search,
++	.size = sizeof(struct tcf_ctinfo),
++};
++
++static __net_init int ctinfo_init_net(struct net *net)
++{
++ struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
++
++ return tc_action_net_init(tn, &act_ctinfo_ops);
++}
++
++static void __net_exit ctinfo_exit_net(struct net *net)
++{
++ struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
++
++ tc_action_net_exit(tn);
++}
++
++static struct pernet_operations ctinfo_net_ops = {
++ .init = ctinfo_init_net,
++ .exit = ctinfo_exit_net,
++ .id = &ctinfo_net_id,
++ .size = sizeof(struct tc_action_net),
++};
++
++static int __init ctinfo_init_module(void)
++{
++ return tcf_register_action(&act_ctinfo_ops, &ctinfo_net_ops);
++}
++
++static void __exit ctinfo_cleanup_module(void)
++{
++ tcf_unregister_action(&act_ctinfo_ops, &ctinfo_net_ops);
++}
++
++module_init(ctinfo_init_module);
++module_exit(ctinfo_cleanup_module);
++MODULE_AUTHOR("Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>");
++MODULE_DESCRIPTION("Conntrack mark to DSCP restoring");
++MODULE_LICENSE("GPL");
+--
+2.20.1 (Apple Git-117)
+
--- /dev/null
+From 402c8d61d2f27060be14849fcb30682f75f3bf3b Mon Sep 17 00:00:00 2001
+From: Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>
+Date: Wed, 13 Mar 2019 20:54:49 +0000
+Subject: [PATCH] net: sched: Introduce act_ctinfo action
+
+ctinfo is a new tc filter action module. It is designed to restore DSCPs
+stored in conntrack marks into the ipv4/v6 diffserv field.
+
+The feature is intended for use and has been found useful for restoring
+ingress classifications based on egress classifications across links
+that bleach or otherwise change DSCP, typically home ISP Internet links.
+Restoring DSCP on ingress on the WAN link allows qdiscs such as CAKE to
+shape inbound packets according to policies that are easier to indicate
+on egress.
+
+Ingress classification is traditionally a challenging task since
+iptables rules haven't yet run and tc filter/eBPF programs are pre-NAT
+lookups, hence are unable to see internal IPv4 addresses as used on the
+typical home masquerading gateway.
+
+ctinfo understands the following parameters:
+
+dscp dscpmask[/statemask]
+
+dscpmask - a 32 bit mask of exactly 6 contiguous bits which indicates
+where ctinfo will find the DSCP bits stored in the conntrack mark.
+
+statemask - a 32 bit mask of (usually) 1 bit length, outside the area
+specified by dscpmask. This represents a conditional operation flag
+whereby the DSCP is only restored if the flag is set. This is useful to
+implement a 'one shot' iptables based classification where the
+'complicated' iptables rules are only run once to classify the
+connection on initial (egress) packet and subsequent packets are all
+marked/restored with the same DSCP. A mask of zero disables the
+conditional behaviour, i.e. the conntrack mark DSCP bits are always
+restored to the ip diffserv field (assuming the conntrack entry is found
+& the skb is an ipv4/ipv6 type).
+
+optional parameters:
+
+zone - conntrack zone
+
+control - action related control (reclassify | pipe | drop | continue |
+ok | goto chain <CHAIN_INDEX>)
+
+e.g. dscp 0xfc000000/0x01000000
+
+|----0xFC----conntrack mark----000000---|
+| Bits 31-26 | bit 25 | bit24 |~~~ Bit 0|
+| DSCP       | unused | flag  |unused   |
+|-----------------------0x01---000000---|
+      |                   |
+      |                   |
+      ---|             Conditional flag
+         v             only restore if set
+|-ip diffserv-|
+| 6 bits      |
+|-------------|
+
+Signed-off-by: Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>
+---
+ include/net/tc_act/tc_ctinfo.h | 28 ++
+ include/uapi/linux/pkt_cls.h | 3 +-
+ include/uapi/linux/tc_act/tc_ctinfo.h | 34 ++
+ net/sched/Kconfig | 17 +
+ net/sched/Makefile | 1 +
+ net/sched/act_ctinfo.c | 395 ++++++++++++++++++++++
+ tools/testing/selftests/tc-testing/config | 1 +
+ 7 files changed, 478 insertions(+), 1 deletion(-)
+ create mode 100644 include/net/tc_act/tc_ctinfo.h
+ create mode 100644 include/uapi/linux/tc_act/tc_ctinfo.h
+ create mode 100644 net/sched/act_ctinfo.c
+
+diff --git a/include/net/tc_act/tc_ctinfo.h b/include/net/tc_act/tc_ctinfo.h
+new file mode 100644
+index 000000000000..d6a688571672
+--- /dev/null
++++ b/include/net/tc_act/tc_ctinfo.h
+@@ -0,0 +1,28 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++#ifndef __NET_TC_CTINFO_H
++#define __NET_TC_CTINFO_H
++
++#include <net/act_api.h>
++
++struct tcf_ctinfo_params {
++ struct rcu_head rcu;
++ struct net *net;
++ u32 dscpmask;
++ u32 dscpstatemask;
++ u32 cpmarkmask;
++ u16 zone;
++ u8 mode;
++ u8 dscpmaskshift;
++};
++
++struct tcf_ctinfo {
++ struct tc_action common;
++ struct tcf_ctinfo_params __rcu *params;
++ u64 stats_dscp_set;
++ u64 stats_dscp_error;
++ u64 stats_cpmark_set;
++};
++
++#define to_ctinfo(a) ((struct tcf_ctinfo *)a)
++
++#endif /* __NET_TC_CTINFO_H */
+diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
+index be382fb0592d..71e10c5a96a0 100644
+--- a/include/uapi/linux/pkt_cls.h
++++ b/include/uapi/linux/pkt_cls.h
+@@ -68,7 +68,8 @@ enum {
+ TCA_ID_UNSPEC=0,
+ TCA_ID_POLICE=1,
+ /* other actions go here */
+- __TCA_ID_MAX=255
++	TCA_ID_CTINFO=27,
++ __TCA_ID_MAX = 255
+ };
+
+ #define TCA_ID_MAX __TCA_ID_MAX
+diff --git a/include/uapi/linux/tc_act/tc_ctinfo.h b/include/uapi/linux/tc_act/tc_ctinfo.h
+new file mode 100644
+index 000000000000..da803e05a89b
+--- /dev/null
++++ b/include/uapi/linux/tc_act/tc_ctinfo.h
+@@ -0,0 +1,34 @@
++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
++#ifndef __UAPI_TC_CTINFO_H
++#define __UAPI_TC_CTINFO_H
++
++#include <linux/types.h>
++#include <linux/pkt_cls.h>
++
++struct tc_ctinfo {
++ tc_gen;
++};
++
++enum {
++ TCA_CTINFO_UNSPEC,
++ TCA_CTINFO_PAD,
++ TCA_CTINFO_TM,
++ TCA_CTINFO_ACT,
++ TCA_CTINFO_ZONE,
++ TCA_CTINFO_PARMS_DSCP_MASK,
++ TCA_CTINFO_PARMS_DSCP_STATEMASK,
++ TCA_CTINFO_PARMS_CPMARK_MASK,
++ TCA_CTINFO_STATS_DSCP_SET,
++ TCA_CTINFO_STATS_DSCP_ERROR,
++ TCA_CTINFO_STATS_CPMARK_SET,
++ __TCA_CTINFO_MAX
++};
++
++#define TCA_CTINFO_MAX (__TCA_CTINFO_MAX - 1)
++
++enum { /* plain shifts: BIT() is not defined for userspace users of this header */
++	CTINFO_MODE_DSCP = (1 << 0), /* restore DSCP from the conntrack mark */
++	CTINFO_MODE_CPMARK = (1 << 1) /* copy the masked conntrack mark to skb->mark */
++};
++
++#endif
+diff --git a/net/sched/Kconfig b/net/sched/Kconfig
+index e95741388311..1d79d5dba6e4 100644
+--- a/net/sched/Kconfig
++++ b/net/sched/Kconfig
+@@ -866,6 +866,23 @@ config NET_ACT_CONNMARK
+ To compile this code as a module, choose M here: the
+ module will be called act_connmark.
+
++config NET_ACT_CTINFO
++ tristate "Netfilter Connection Mark Actions"
++ depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
++ depends on NF_CONNTRACK && NF_CONNTRACK_MARK
++ help
++ Say Y here to allow transfer of connmark stored information.
++ Current actions transfer connmark stored DSCP into
++ ipv4/v6 diffserv and/or to transfer connmark to packet
++ mark. Both are useful for restoring egress based marks
++ back onto ingress connections for qdisc priority mapping
++ purposes.
++
++ If unsure, say N.
++
++ To compile this code as a module, choose M here: the
++ module will be called act_ctinfo.
++
+ config NET_ACT_SKBMOD
+ tristate "skb data modification action"
+ depends on NET_CLS_ACT
+diff --git a/net/sched/Makefile b/net/sched/Makefile
+index f0403f49edcb..bb3c2bc44af7 100644
+--- a/net/sched/Makefile
++++ b/net/sched/Makefile
+@@ -21,6 +21,7 @@ obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o
+ obj-$(CONFIG_NET_ACT_VLAN) += act_vlan.o
+ obj-$(CONFIG_NET_ACT_BPF) += act_bpf.o
+ obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o
++obj-$(CONFIG_NET_ACT_CTINFO) += act_ctinfo.o
+ obj-$(CONFIG_NET_ACT_SKBMOD) += act_skbmod.o
+ obj-$(CONFIG_NET_ACT_IFE) += act_ife.o
+ obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o
+diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c
+new file mode 100644
+index 000000000000..8975cb4976aa
+--- /dev/null
++++ b/net/sched/act_ctinfo.c
+@@ -0,0 +1,395 @@
++// SPDX-License-Identifier: GPL-2.0+
++/* net/sched/act_ctinfo.c netfilter ctinfo connmark actions
++ *
++ * Copyright (c) 2019 Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>
++ */
++
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/skbuff.h>
++#include <linux/rtnetlink.h>
++#include <linux/pkt_cls.h>
++#include <linux/ip.h>
++#include <linux/ipv6.h>
++#include <net/netlink.h>
++#include <net/pkt_sched.h>
++#include <net/act_api.h>
++#include <net/pkt_cls.h>
++#include <uapi/linux/tc_act/tc_ctinfo.h>
++#include <net/tc_act/tc_ctinfo.h>
++
++#include <net/netfilter/nf_conntrack.h>
++#include <net/netfilter/nf_conntrack_core.h>
++#include <net/netfilter/nf_conntrack_ecache.h>
++#include <net/netfilter/nf_conntrack_zones.h>
++
++static struct tc_action_ops act_ctinfo_ops;
++static unsigned int ctinfo_net_id;
++
++static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca,
++ struct tcf_ctinfo_params *cp,
++ struct sk_buff *skb, int wlen, int proto)
++{
++ u8 dscp, newdscp;
++
++ newdscp = (((ct->mark & cp->dscpmask) >> cp->dscpmaskshift) << 2) &
++ ~INET_ECN_MASK;
++ /* rewrite only when the DSCP differs; ECN bits are masked out of the compare */
++ switch (proto) {
++ case NFPROTO_IPV4:
++ dscp = ipv4_get_dsfield(ip_hdr(skb)) & ~INET_ECN_MASK;
++ if (dscp != newdscp) {
++ if (likely(!skb_try_make_writable(skb, wlen))) {
++ ipv4_change_dsfield(ip_hdr(skb),
++ INET_ECN_MASK,
++ newdscp);
++ ca->stats_dscp_set++;
++ } else {
++ ca->stats_dscp_error++;
++ }
++ }
++ break;
++ case NFPROTO_IPV6:
++ dscp = ipv6_get_dsfield(ipv6_hdr(skb)) & ~INET_ECN_MASK;
++ if (dscp != newdscp) {
++ if (likely(!skb_try_make_writable(skb, wlen))) {
++ ipv6_change_dsfield(ipv6_hdr(skb),
++ INET_ECN_MASK,
++ newdscp);
++ ca->stats_dscp_set++;
++ } else {
++ ca->stats_dscp_error++;
++ }
++ }
++ break;
++ default:
++ break;
++ }
++}
++
++static void tcf_ctinfo_cpmark_set(struct nf_conn *ct, struct tcf_ctinfo *ca,
++ struct tcf_ctinfo_params *cp,
++ struct sk_buff *skb)
++{
++ ca->stats_cpmark_set++;
++ skb->mark = ct->mark & cp->cpmarkmask;
++}
++
++static int tcf_ctinfo_act(struct sk_buff *skb, const struct tc_action *a,
++ struct tcf_result *res)
++{
++ const struct nf_conntrack_tuple_hash *thash = NULL;
++ struct tcf_ctinfo *ca = to_ctinfo(a);
++ struct nf_conntrack_tuple tuple;
++ struct nf_conntrack_zone zone;
++ enum ip_conntrack_info ctinfo;
++ struct tcf_ctinfo_params *cp;
++ struct nf_conn *ct;
++ int proto, wlen;
++ int action;
++
++ cp = rcu_dereference_bh(ca->params);
++
++ tcf_lastuse_update(&ca->tcf_tm);
++ bstats_update(&ca->tcf_bstats, skb);
++ action = READ_ONCE(ca->tcf_action);
++ /* wlen = network header offset plus the fixed IPv4/IPv6 header size */
++ wlen = skb_network_offset(skb);
++ if (tc_skb_protocol(skb) == htons(ETH_P_IP)) {
++ wlen += sizeof(struct iphdr);
++ if (!pskb_may_pull(skb, wlen))
++ goto out;
++
++ proto = NFPROTO_IPV4;
++ } else if (tc_skb_protocol(skb) == htons(ETH_P_IPV6)) {
++ wlen += sizeof(struct ipv6hdr);
++ if (!pskb_may_pull(skb, wlen))
++ goto out;
++
++ proto = NFPROTO_IPV6;
++ } else {
++ goto out;
++ }
++ /* use the conntrack entry attached to the skb when there is one */
++ ct = nf_ct_get(skb, &ctinfo);
++ if (!ct) { /* look harder, usually ingress */
++ if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
++ proto, cp->net, &tuple))
++ goto out;
++ zone.id = cp->zone;
++ zone.dir = NF_CT_DEFAULT_ZONE_DIR;
++
++ thash = nf_conntrack_find_get(cp->net, &zone, &tuple);
++ if (!thash)
++ goto out;
++
++ ct = nf_ct_tuplehash_to_ctrack(thash);
++ }
++ /* statemask, when set, gates the DSCP restore on a flag in the mark */
++ if (cp->mode & CTINFO_MODE_DSCP)
++ if (!cp->dscpstatemask || (ct->mark & cp->dscpstatemask))
++ tcf_ctinfo_dscp_set(ct, ca, cp, skb, wlen, proto);
++
++ if (cp->mode & CTINFO_MODE_CPMARK)
++ tcf_ctinfo_cpmark_set(ct, ca, cp, skb);
++ /* thash is only set when the entry came from the lookup above */
++ if (thash)
++ nf_ct_put(ct);
++out:
++ return action;
++}
++
++static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = {
++ [TCA_CTINFO_ACT] = { .len = sizeof(struct
++ tc_ctinfo) },
++ [TCA_CTINFO_ZONE] = { .type = NLA_U16 },
++ [TCA_CTINFO_PARMS_DSCP_MASK] = { .type = NLA_U32 },
++ [TCA_CTINFO_PARMS_DSCP_STATEMASK] = { .type = NLA_U32 },
++ [TCA_CTINFO_PARMS_CPMARK_MASK] = { .type = NLA_U32 },
++};
++
++/* Validate the netlink attributes, create or look up the action, then
++ * publish a newly allocated parameter block under tcf_lock.
++ */
++static int tcf_ctinfo_init(struct net *net, struct nlattr *nla,
++			   struct nlattr *est, struct tc_action **a,
++			   int ovr, int bind, bool rtnl_held,
++			   struct netlink_ext_ack *extack)
++{
++	struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
++	struct nlattr *tb[TCA_CTINFO_MAX + 1];
++	struct tcf_ctinfo_params *cp_new;
++	u32 dscpmask = 0, dscpstatemask = 0;
++	struct tc_ctinfo *actparm;
++	struct tcf_ctinfo *ci;
++	u8 dscpmaskshift = 0;
++	int ret = 0, err;
++
++	if (!nla)
++		return -EINVAL;
++
++	err = nla_parse_nested(tb, TCA_CTINFO_MAX, nla, ctinfo_policy, extack);
++	if (err < 0)
++		return err;
++
++	if (!tb[TCA_CTINFO_ACT])
++		return -EINVAL;
++	actparm = nla_data(tb[TCA_CTINFO_ACT]);
++
++	/* validate before allocating anything we would have to clean up */
++	if (tb[TCA_CTINFO_PARMS_DSCP_MASK]) {
++		dscpmask = nla_get_u32(tb[TCA_CTINFO_PARMS_DSCP_MASK]);
++		/* need contiguous 6 bit mask */
++		dscpmaskshift = dscpmask ? __ffs(dscpmask) : 0;
++		if ((dscpmask >> dscpmaskshift) != 0x3f)
++			return -EINVAL;
++		dscpstatemask = tb[TCA_CTINFO_PARMS_DSCP_STATEMASK] ?
++			nla_get_u32(tb[TCA_CTINFO_PARMS_DSCP_STATEMASK]) : 0;
++		/* mask & statemask must not overlap */
++		if (dscpmask & dscpstatemask)
++			return -EINVAL;
++	}
++
++	/* done the validation: now to the actual action allocation */
++	err = tcf_idr_check_alloc(tn, &actparm->index, a, bind);
++	if (!err) {
++		ret = tcf_idr_create(tn, actparm->index, est, a,
++				     &act_ctinfo_ops, bind, false);
++		if (ret) {
++			tcf_idr_cleanup(tn, actparm->index);
++			return ret;
++		}
++		ret = ACT_P_CREATED;
++	} else if (err > 0) {
++		if (bind) /* don't override defaults */
++			return 0;
++		if (!ovr) {
++			tcf_idr_release(*a, bind);
++			return -EEXIST;
++		}
++	} else {
++		return err;
++	}
++
++	ci = to_ctinfo(*a);
++	cp_new = kzalloc(sizeof(*cp_new), GFP_KERNEL);
++	if (unlikely(!cp_new)) {
++		err = -ENOMEM;
++		goto put_chain;
++	}
++
++	cp_new->net = net;
++	cp_new->zone = tb[TCA_CTINFO_ZONE] ?
++			nla_get_u16(tb[TCA_CTINFO_ZONE]) : 0;
++	if (dscpmask) {
++		cp_new->dscpmask = dscpmask;
++		cp_new->dscpmaskshift = dscpmaskshift;
++		cp_new->dscpstatemask = dscpstatemask;
++		cp_new->mode |= CTINFO_MODE_DSCP;
++	}
++
++	if (tb[TCA_CTINFO_PARMS_CPMARK_MASK]) {
++		cp_new->cpmarkmask =
++			nla_get_u32(tb[TCA_CTINFO_PARMS_CPMARK_MASK]);
++		cp_new->mode |= CTINFO_MODE_CPMARK;
++	}
++
++	spin_lock_bh(&ci->tcf_lock);
++	ci->tcf_action = actparm->action;
++	rcu_swap_protected(ci->params, cp_new,
++			   lockdep_is_held(&ci->tcf_lock));
++	spin_unlock_bh(&ci->tcf_lock);
++
++	if (cp_new) /* old params, if any, after the swap */
++		kfree_rcu(cp_new, rcu);
++
++	if (ret == ACT_P_CREATED)
++		tcf_idr_insert(tn, *a);
++
++	return ret;
++
++put_chain:
++	tcf_idr_release(*a, bind);
++	return err;
++}
++
++static int tcf_ctinfo_dump(struct sk_buff *skb, struct tc_action *a,
++			   int bind, int ref)
++{
++	struct tcf_ctinfo *ci = to_ctinfo(a);
++	struct tc_ctinfo opt = {
++		.index   = ci->tcf_index,
++		.refcnt  = refcount_read(&ci->tcf_refcnt) - ref,
++		.bindcnt = atomic_read(&ci->tcf_bindcnt) - bind,
++	};
++	unsigned char *b = skb_tail_pointer(skb);
++	struct tcf_ctinfo_params *cp;
++	struct tcf_t t;
++
++	spin_lock_bh(&ci->tcf_lock);
++	cp = rcu_dereference_protected(ci->params,
++				       lockdep_is_held(&ci->tcf_lock));
++
++	tcf_tm_dump(&t, &ci->tcf_tm);
++	if (nla_put_64bit(skb, TCA_CTINFO_TM, sizeof(t), &t, TCA_CTINFO_PAD))
++		goto nla_put_failure;
++
++	opt.action = ci->tcf_action;
++	if (nla_put(skb, TCA_CTINFO_ACT, sizeof(opt), &opt))
++		goto nla_put_failure;
++
++	if (nla_put_u16(skb, TCA_CTINFO_ZONE, cp->zone))
++		goto nla_put_failure;
++
++	if (cp->mode & CTINFO_MODE_DSCP) {
++		if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_MASK,
++				cp->dscpmask))
++			goto nla_put_failure;
++		if (nla_put_u32(skb, TCA_CTINFO_PARMS_DSCP_STATEMASK,
++				cp->dscpstatemask))
++			goto nla_put_failure;
++	}
++
++	if (cp->mode & CTINFO_MODE_CPMARK) {
++		if (nla_put_u32(skb, TCA_CTINFO_PARMS_CPMARK_MASK,
++				cp->cpmarkmask))
++			goto nla_put_failure;
++	}
++	/* counters updated by the datapath helpers */
++	if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_SET,
++			      ci->stats_dscp_set, TCA_CTINFO_PAD))
++		goto nla_put_failure;
++
++	if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_ERROR,
++			      ci->stats_dscp_error, TCA_CTINFO_PAD))
++		goto nla_put_failure;
++
++	if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_CPMARK_SET,
++			      ci->stats_cpmark_set, TCA_CTINFO_PAD))
++		goto nla_put_failure;
++
++	spin_unlock_bh(&ci->tcf_lock);
++	return skb->len;
++
++nla_put_failure:
++	spin_unlock_bh(&ci->tcf_lock);
++	nlmsg_trim(skb, b);
++	return -1;
++}
++
++static int tcf_ctinfo_walker(struct net *net, struct sk_buff *skb,
++			     struct netlink_callback *cb, int type,
++			     const struct tc_action_ops *ops,
++			     struct netlink_ext_ack *extack)
++{
++	struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
++
++	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
++}
++
++static int tcf_ctinfo_search(struct net *net, struct tc_action **a, u32 index,
++			     struct netlink_ext_ack *extack)
++{
++	struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
++
++	return tcf_idr_search(tn, a, index);
++}
++
++/* Free the parameter block when the action itself is destroyed. */
++static void tcf_ctinfo_cleanup(struct tc_action *a)
++{
++	struct tcf_ctinfo_params *cp;
++
++	cp = rcu_dereference_protected(to_ctinfo(a)->params, 1);
++	if (cp)
++		kfree_rcu(cp, rcu);
++}
++
++static struct tc_action_ops act_ctinfo_ops = {
++	.kind = "ctinfo",
++	.type = TCA_ID_CTINFO,
++	.owner = THIS_MODULE,
++	.act = tcf_ctinfo_act,
++	.dump = tcf_ctinfo_dump,
++	.init = tcf_ctinfo_init,
++	.cleanup = tcf_ctinfo_cleanup,
++	.walk = tcf_ctinfo_walker,
++	.lookup = tcf_ctinfo_search,
++	.size = sizeof(struct tcf_ctinfo),
++};
++
++static __net_init int ctinfo_init_net(struct net *net)
++{
++ struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
++
++ return tc_action_net_init(tn, &act_ctinfo_ops);
++}
++
++static void __net_exit ctinfo_exit_net(struct list_head *net_list)
++{
++ tc_action_net_exit(net_list, ctinfo_net_id);
++}
++
++static struct pernet_operations ctinfo_net_ops = {
++ .init = ctinfo_init_net,
++ .exit_batch = ctinfo_exit_net,
++ .id = &ctinfo_net_id,
++ .size = sizeof(struct tc_action_net),
++};
++
++static int __init ctinfo_init_module(void)
++{
++ return tcf_register_action(&act_ctinfo_ops, &ctinfo_net_ops);
++}
++
++static void __exit ctinfo_cleanup_module(void)
++{
++ tcf_unregister_action(&act_ctinfo_ops, &ctinfo_net_ops);
++}
++
++module_init(ctinfo_init_module);
++module_exit(ctinfo_cleanup_module);
++MODULE_AUTHOR("Kevin Darbyshire-Bryant <ldir@darbyshire-bryant.me.uk>");
++MODULE_DESCRIPTION("Connection tracking mark actions");
++MODULE_LICENSE("GPL");
+diff --git a/tools/testing/selftests/tc-testing/config b/tools/testing/selftests/tc-testing/config
+index 203302065458..9d1fddcfb887 100644
+--- a/tools/testing/selftests/tc-testing/config
++++ b/tools/testing/selftests/tc-testing/config
+@@ -37,6 +37,7 @@ CONFIG_NET_ACT_SKBEDIT=m
+ CONFIG_NET_ACT_CSUM=m
+ CONFIG_NET_ACT_VLAN=m
+ CONFIG_NET_ACT_BPF=m
+CONFIG_NET_ACT_CTINFO=m
+ CONFIG_NET_ACT_CONNMARK=m
+ CONFIG_NET_ACT_SKBMOD=m
+ CONFIG_NET_ACT_IFE=m
+--
+2.20.1 (Apple Git-117)
+