const struct nft_set_elem *elem);
};
+/**
+ * struct nft_set_desc - description of set elements
+ *
+ * @klen: key length
+ * @dlen: data length
+ * @size: number of set elements
+ */
+struct nft_set_desc {
+ unsigned int klen;
+ unsigned int dlen;
+ unsigned int size;
+};
+
+/**
+ * enum nft_set_class - performance class
+ *
+ * @NFT_LOOKUP_O_1: constant, O(1)
+ * @NFT_LOOKUP_O_LOG_N: logarithmic, O(log N)
+ * @NFT_LOOKUP_O_N: linear, O(N)
+ */
+enum nft_set_class {
+ NFT_SET_CLASS_O_1,
+ NFT_SET_CLASS_O_LOG_N,
+ NFT_SET_CLASS_O_N,
+};
+
+/**
+ * struct nft_set_estimate - estimation of memory and performance
+ * characteristics
+ *
+ * @size: required memory
+ * @class: lookup performance class
+ */
+struct nft_set_estimate {
+ unsigned int size;
+ enum nft_set_class class;
+};
+
/**
* struct nft_set_ops - nf_tables set operations
*
struct nft_set_iter *iter);
unsigned int (*privsize)(const struct nlattr * const nla[]);
+ bool (*estimate)(const struct nft_set_desc *desc,
+ u32 features,
+ struct nft_set_estimate *est);
int (*init)(const struct nft_set *set,
+ const struct nft_set_desc *desc,
const struct nlattr * const nla[]);
void (*destroy)(const struct nft_set *set);
* @name: name of the set
* @ktype: key type (numeric type defined by userspace, not used in the kernel)
* @dtype: data type (verdict or numeric type defined by userspace)
+ * @size: maximum set size
+ * @nelems: number of elements
* @ops: set ops
* @flags: set flags
* @klen: key length
char name[IFNAMSIZ];
u32 ktype;
u32 dtype;
+ u32 size;
+ u32 nelems;
/* runtime data below here */
const struct nft_set_ops *ops ____cacheline_aligned;
u16 flags;
}
EXPORT_SYMBOL_GPL(nft_unregister_set);
-static const struct nft_set_ops *nft_select_set_ops(const struct nlattr * const nla[])
+/*
+ * Select a set implementation based on the data characteristics and the
+ * given policy. The total memory use might not be known if no size is
+ * given, in that case the amount of memory per element is used.
+ */
+static const struct nft_set_ops *
+nft_select_set_ops(const struct nlattr * const nla[],
+ const struct nft_set_desc *desc,
+ enum nft_set_policies policy)
{
- const struct nft_set_ops *ops;
+ const struct nft_set_ops *ops, *bops;
+ struct nft_set_estimate est, best;
u32 features;
#ifdef CONFIG_MODULES
features &= NFT_SET_INTERVAL | NFT_SET_MAP;
}
- // FIXME: implement selection properly
+ bops = NULL;
+ best.size = ~0;
+ best.class = ~0;
+
list_for_each_entry(ops, &nf_tables_set_ops, list) {
if ((ops->features & features) != features)
continue;
+ if (!ops->estimate(desc, features, &est))
+ continue;
+
+ switch (policy) {
+ case NFT_SET_POL_PERFORMANCE:
+ if (est.class < best.class)
+ break;
+ if (est.class == best.class && est.size < best.size)
+ break;
+ continue;
+ case NFT_SET_POL_MEMORY:
+ if (est.size < best.size)
+ break;
+ if (est.size == best.size && est.class < best.class)
+ break;
+ continue;
+ default:
+ break;
+ }
+
if (!try_module_get(ops->owner))
continue;
- return ops;
+ if (bops != NULL)
+ module_put(bops->owner);
+
+ bops = ops;
+ best = est;
}
+ if (bops != NULL)
+ return bops;
+
return ERR_PTR(-EOPNOTSUPP);
}
[NFTA_SET_KEY_LEN] = { .type = NLA_U32 },
[NFTA_SET_DATA_TYPE] = { .type = NLA_U32 },
[NFTA_SET_DATA_LEN] = { .type = NLA_U32 },
+ [NFTA_SET_POLICY] = { .type = NLA_U32 },
+ [NFTA_SET_DESC] = { .type = NLA_NESTED },
+};
+
+static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
+ [NFTA_SET_DESC_SIZE] = { .type = NLA_U32 },
};
static int nft_ctx_init_from_setattr(struct nft_ctx *ctx,
{
struct nfgenmsg *nfmsg;
struct nlmsghdr *nlh;
+ struct nlattr *desc;
u32 portid = NETLINK_CB(ctx->skb).portid;
u32 seq = ctx->nlh->nlmsg_seq;
goto nla_put_failure;
}
+ desc = nla_nest_start(skb, NFTA_SET_DESC);
+ if (desc == NULL)
+ goto nla_put_failure;
+ if (set->size &&
+ nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size)))
+ goto nla_put_failure;
+ nla_nest_end(skb, desc);
+
return nlmsg_end(skb, nlh);
nla_put_failure:
return err;
}
+static int nf_tables_set_desc_parse(const struct nft_ctx *ctx,
+ struct nft_set_desc *desc,
+ const struct nlattr *nla)
+{
+ struct nlattr *da[NFTA_SET_DESC_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(da, NFTA_SET_DESC_MAX, nla, nft_set_desc_policy);
+ if (err < 0)
+ return err;
+
+ if (da[NFTA_SET_DESC_SIZE] != NULL)
+ desc->size = ntohl(nla_get_be32(da[NFTA_SET_DESC_SIZE]));
+
+ return 0;
+}
+
static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
char name[IFNAMSIZ];
unsigned int size;
bool create;
- u32 ktype, klen, dlen, dtype, flags;
+ u32 ktype, dtype, flags, policy;
+ struct nft_set_desc desc;
int err;
if (nla[NFTA_SET_TABLE] == NULL ||
nla[NFTA_SET_KEY_LEN] == NULL)
return -EINVAL;
+ memset(&desc, 0, sizeof(desc));
+
ktype = NFT_DATA_VALUE;
if (nla[NFTA_SET_KEY_TYPE] != NULL) {
ktype = ntohl(nla_get_be32(nla[NFTA_SET_KEY_TYPE]));
return -EINVAL;
}
- klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN]));
- if (klen == 0 || klen > FIELD_SIZEOF(struct nft_data, data))
+ desc.klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN]));
+ if (desc.klen == 0 || desc.klen > FIELD_SIZEOF(struct nft_data, data))
return -EINVAL;
flags = 0;
}
dtype = 0;
- dlen = 0;
if (nla[NFTA_SET_DATA_TYPE] != NULL) {
if (!(flags & NFT_SET_MAP))
return -EINVAL;
if (dtype != NFT_DATA_VERDICT) {
if (nla[NFTA_SET_DATA_LEN] == NULL)
return -EINVAL;
- dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN]));
- if (dlen == 0 ||
- dlen > FIELD_SIZEOF(struct nft_data, data))
+ desc.dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN]));
+ if (desc.dlen == 0 ||
+ desc.dlen > FIELD_SIZEOF(struct nft_data, data))
return -EINVAL;
} else
- dlen = sizeof(struct nft_data);
+ desc.dlen = sizeof(struct nft_data);
} else if (flags & NFT_SET_MAP)
return -EINVAL;
+ policy = NFT_SET_POL_PERFORMANCE;
+ if (nla[NFTA_SET_POLICY] != NULL)
+ policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY]));
+
+ if (nla[NFTA_SET_DESC] != NULL) {
+ err = nf_tables_set_desc_parse(&ctx, &desc, nla[NFTA_SET_DESC]);
+ if (err < 0)
+ return err;
+ }
+
create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, create);
if (!(nlh->nlmsg_flags & NLM_F_CREATE))
return -ENOENT;
- ops = nft_select_set_ops(nla);
+ ops = nft_select_set_ops(nla, &desc, policy);
if (IS_ERR(ops))
return PTR_ERR(ops);
INIT_LIST_HEAD(&set->bindings);
set->ops = ops;
set->ktype = ktype;
- set->klen = klen;
+ set->klen = desc.klen;
set->dtype = dtype;
- set->dlen = dlen;
+ set->dlen = desc.dlen;
set->flags = flags;
+ set->size = desc.size;
- err = ops->init(set, nla);
+ err = ops->init(set, &desc, nla);
if (err < 0)
goto err2;
enum nft_registers dreg;
int err;
+ if (set->size && set->nelems == set->size)
+ return -ENFILE;
+
err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr,
nft_set_elem_policy);
if (err < 0)
err = set->ops->insert(set, &elem);
if (err < 0)
goto err3;
+ set->nelems++;
return 0;
goto err2;
set->ops->remove(set, &elem);
+ set->nelems--;
nft_data_uninit(&elem.key, NFT_DATA_VALUE);
if (set->flags & NFT_SET_MAP)
#include <linux/init.h>
#include <linux/module.h>
#include <linux/list.h>
+#include <linux/log2.h>
#include <linux/jhash.h>
#include <linux/netlink.h>
#include <linux/vmalloc.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables.h>
-#define NFT_HASH_MIN_SIZE 4
+#define NFT_HASH_MIN_SIZE 4UL
struct nft_hash {
struct nft_hash_table __rcu *tbl;
kfree(tbl);
}
+static unsigned int nft_hash_tbl_size(unsigned int nelem)
+{
+ return max(roundup_pow_of_two(nelem * 4 / 3), NFT_HASH_MIN_SIZE);
+}
+
static struct nft_hash_table *nft_hash_tbl_alloc(unsigned int nbuckets)
{
struct nft_hash_table *tbl;
}
static int nft_hash_init(const struct nft_set *set,
+ const struct nft_set_desc *desc,
const struct nlattr * const tb[])
{
struct nft_hash *priv = nft_set_priv(set);
struct nft_hash_table *tbl;
+ unsigned int size;
if (unlikely(!nft_hash_rnd_initted)) {
get_random_bytes(&nft_hash_rnd, 4);
nft_hash_rnd_initted = true;
}
- tbl = nft_hash_tbl_alloc(NFT_HASH_MIN_SIZE);
+ size = NFT_HASH_MIN_SIZE;
+ if (desc->size)
+ size = nft_hash_tbl_size(desc->size);
+
+ tbl = nft_hash_tbl_alloc(size);
if (tbl == NULL)
return -ENOMEM;
RCU_INIT_POINTER(priv->tbl, tbl);
kfree(tbl);
}
+static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
+ struct nft_set_estimate *est)
+{
+ unsigned int esize;
+
+ esize = sizeof(struct nft_hash_elem);
+ if (features & NFT_SET_MAP)
+ esize += FIELD_SIZEOF(struct nft_hash_elem, data[0]);
+
+ if (desc->size) {
+ est->size = sizeof(struct nft_hash) +
+ nft_hash_tbl_size(desc->size) *
+ sizeof(struct nft_hash_elem *) +
+ desc->size * esize;
+ } else {
+ /* Resizing happens when the load drops below 30% or goes
+ * above 75%. The average of 52.5% load (approximated by 50%)
+ * is used for the size estimation of the hash buckets,
+ * meaning we calculate two buckets per element.
+ */
+ est->size = esize + 2 * sizeof(struct nft_hash_elem *);
+ }
+
+ est->class = NFT_SET_CLASS_O_1;
+
+ return true;
+}
+
static struct nft_set_ops nft_hash_ops __read_mostly = {
.privsize = nft_hash_privsize,
+ .estimate = nft_hash_estimate,
.init = nft_hash_init,
.destroy = nft_hash_destroy,
.get = nft_hash_get,