__nf_conntrack_find(struct net *net, const struct nf_conntrack_tuple *tuple);
extern void nf_conntrack_hash_insert(struct nf_conn *ct);
+extern void nf_ct_delete_from_lists(struct nf_conn *ct);
+extern void nf_ct_insert_dying_list(struct nf_conn *ct);
extern void nf_conntrack_flush_report(struct net *net, u32 pid, int report);
struct nf_conntrack_ecache {
unsigned long cache; /* bitops want long */
+ unsigned long missed; /* missed events */
+ u32 pid; /* netlink pid of destroyer */
};
static inline struct nf_conntrack_ecache *
set_bit(event, &e->cache);
}
-static inline void
+static inline int
nf_conntrack_eventmask_report(unsigned int eventmask,
struct nf_conn *ct,
u32 pid,
int report)
{
+ int ret = 0;
struct net *net = nf_ct_net(ct);
struct nf_ct_event_notifier *notify;
+ struct nf_conntrack_ecache *e;
rcu_read_lock();
notify = rcu_dereference(nf_conntrack_event_cb);
if (!net->ct.sysctl_events)
goto out_unlock;
+ e = nf_ct_ecache_find(ct);
+ if (e == NULL)
+ goto out_unlock;
+
if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct)) {
struct nf_ct_event item = {
.ct = ct,
- .pid = pid,
+ .pid = e->pid ? e->pid : pid,
.report = report
};
- notify->fcn(eventmask, &item);
+ /* This is a resent of a destroy event? If so, skip missed */
+ unsigned long missed = e->pid ? 0 : e->missed;
+
+ ret = notify->fcn(eventmask | missed, &item);
+ if (unlikely(ret < 0 || missed)) {
+ spin_lock_bh(&ct->lock);
+ if (ret < 0) {
+ /* This is a destroy event that has been
+ * triggered by a process, we store the PID
+ * to include it in the retransmission. */
+ if (eventmask & (1 << IPCT_DESTROY) &&
+ e->pid == 0 && pid != 0)
+ e->pid = pid;
+ else
+ e->missed |= eventmask;
+ } else
+ e->missed &= ~missed;
+ spin_unlock_bh(&ct->lock);
+ }
}
out_unlock:
rcu_read_unlock();
+ return ret;
}
-static inline void
+static inline int
nf_conntrack_event_report(enum ip_conntrack_events event, struct nf_conn *ct,
u32 pid, int report)
{
- nf_conntrack_eventmask_report(1 << event, ct, pid, report);
+ return nf_conntrack_eventmask_report(1 << event, ct, pid, report);
}
-static inline void
+static inline int
nf_conntrack_event(enum ip_conntrack_events event, struct nf_conn *ct)
{
- nf_conntrack_eventmask_report(1 << event, ct, 0, 0);
+ return nf_conntrack_eventmask_report(1 << event, ct, 0, 0);
}
struct nf_exp_event {
static inline void nf_conntrack_event_cache(enum ip_conntrack_events event,
struct nf_conn *ct) {}
-static inline void nf_conntrack_eventmask_report(unsigned int eventmask,
- struct nf_conn *ct,
- u32 pid,
- int report) {}
-static inline void nf_conntrack_event(enum ip_conntrack_events event,
- struct nf_conn *ct) {}
-static inline void nf_conntrack_event_report(enum ip_conntrack_events event,
- struct nf_conn *ct,
- u32 pid,
- int report) {}
+static inline int nf_conntrack_eventmask_report(unsigned int eventmask,
+ struct nf_conn *ct,
+ u32 pid,
+ int report) { return 0; }
+static inline int nf_conntrack_event(enum ip_conntrack_events event,
+ struct nf_conn *ct) { return 0; }
+static inline int nf_conntrack_event_report(enum ip_conntrack_events event,
+ struct nf_conn *ct,
+ u32 pid,
+ int report) { return 0; }
static inline void nf_ct_deliver_cached_events(const struct nf_conn *ct) {}
static inline void nf_ct_expect_event(enum ip_conntrack_expect_events event,
struct nf_conntrack_expect *exp) {}
struct hlist_nulls_head *hash;
struct hlist_head *expect_hash;
struct hlist_nulls_head unconfirmed;
+ struct hlist_nulls_head dying;
struct ip_conntrack_stat *stat;
int sysctl_events;
+ unsigned int sysctl_events_retry_timeout;
int sysctl_acct;
int sysctl_checksum;
unsigned int sysctl_log_invalid; /* Log invalid packets */
NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
NF_CT_ASSERT(!timer_pending(&ct->timeout));
- if (!test_bit(IPS_DYING_BIT, &ct->status))
- nf_conntrack_event(IPCT_DESTROY, ct);
- set_bit(IPS_DYING_BIT, &ct->status);
-
/* To make sure we don't get any weird locking issues here:
* destroy_conntrack() MUST NOT be called with a write lock
* to nf_conntrack_lock!!! -HW */
nf_conntrack_free(ct);
}
-static void death_by_timeout(unsigned long ul_conntrack)
+void nf_ct_delete_from_lists(struct nf_conn *ct)
{
- struct nf_conn *ct = (void *)ul_conntrack;
struct net *net = nf_ct_net(ct);
nf_ct_helper_destroy(ct);
NF_CT_STAT_INC(net, delete_list);
clean_from_lists(ct);
spin_unlock_bh(&nf_conntrack_lock);
+}
+EXPORT_SYMBOL_GPL(nf_ct_delete_from_lists);
+
+static void death_by_event(unsigned long ul_conntrack)
+{
+ struct nf_conn *ct = (void *)ul_conntrack;
+ struct net *net = nf_ct_net(ct);
+
+ if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) {
+ /* bad luck, let's retry again */
+ ct->timeout.expires = jiffies +
+ (random32() % net->ct.sysctl_events_retry_timeout);
+ add_timer(&ct->timeout);
+ return;
+ }
+ /* we've got the event delivered, now it's dying */
+ set_bit(IPS_DYING_BIT, &ct->status);
+ spin_lock(&nf_conntrack_lock);
+ hlist_nulls_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+ spin_unlock(&nf_conntrack_lock);
+ nf_ct_put(ct);
+}
+
+void nf_ct_insert_dying_list(struct nf_conn *ct)
+{
+ struct net *net = nf_ct_net(ct);
+
+ /* add this conntrack to the dying list */
+ spin_lock_bh(&nf_conntrack_lock);
+ hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+ &net->ct.dying);
+ spin_unlock_bh(&nf_conntrack_lock);
+ /* set a new timer to retry event delivery */
+ setup_timer(&ct->timeout, death_by_event, (unsigned long)ct);
+ ct->timeout.expires = jiffies +
+ (random32() % net->ct.sysctl_events_retry_timeout);
+ add_timer(&ct->timeout);
+}
+EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list);
+
+static void death_by_timeout(unsigned long ul_conntrack)
+{
+ struct nf_conn *ct = (void *)ul_conntrack;
+
+ if (!test_bit(IPS_DYING_BIT, &ct->status) &&
+ unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) {
+ /* destroy event was not delivered */
+ nf_ct_delete_from_lists(ct);
+ nf_ct_insert_dying_list(ct);
+ return;
+ }
+ set_bit(IPS_DYING_BIT, &ct->status);
+ nf_ct_delete_from_lists(ct);
nf_ct_put(ct);
}
{
struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data;
- /* get_next_corpse sets the dying bit for us */
- nf_conntrack_event_report(IPCT_DESTROY,
- i,
- fr->pid,
- fr->report);
+ /* If we fail to deliver the event, death_by_timeout() will retry */
+ if (nf_conntrack_event_report(IPCT_DESTROY, i,
+ fr->pid, fr->report) < 0)
+ return 1;
+
+ /* Avoid the delivery of the destroy event in death_by_timeout(). */
+ set_bit(IPS_DYING_BIT, &i->status);
return 1;
}
}
EXPORT_SYMBOL_GPL(nf_conntrack_flush_report);
+static void nf_ct_release_dying_list(void)
+{
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conn *ct;
+ struct hlist_nulls_node *n;
+
+ spin_lock_bh(&nf_conntrack_lock);
+ hlist_nulls_for_each_entry(h, n, &init_net.ct.dying, hnnode) {
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ /* never fails to remove them, no listeners at this point */
+ nf_ct_kill(ct);
+ }
+ spin_unlock_bh(&nf_conntrack_lock);
+}
+
static void nf_conntrack_cleanup_init_net(void)
{
nf_conntrack_helper_fini();
{
i_see_dead_people:
nf_ct_iterate_cleanup(net, kill_all, NULL);
+ nf_ct_release_dying_list();
if (atomic_read(&net->ct.count) != 0) {
schedule();
goto i_see_dead_people;
atomic_set(&net->ct.count, 0);
INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, 0);
+ INIT_HLIST_NULLS_HEAD(&net->ct.dying, 0);
net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
if (!net->ct.stat) {
ret = -ENOMEM;
.pid = 0,
.report = 0
};
-
- notify->fcn(events, &item);
+ int ret;
+ /* We make a copy of the missed event cache without taking
+ * the lock, thus we may send missed events twice. However,
+ * this does not harm and it happens very rarely. */
+ unsigned long missed = e->missed;
+
+ ret = notify->fcn(events | missed, &item);
+ if (unlikely(ret < 0 || missed)) {
+ spin_lock_bh(&ct->lock);
+ if (ret < 0)
+ e->missed |= events;
+ else
+ e->missed &= ~missed;
+ spin_unlock_bh(&ct->lock);
+ }
}
out_unlock:
#define NF_CT_EVENTS_DEFAULT 1
static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT;
+static int nf_ct_events_retry_timeout __read_mostly = 15*HZ;
#ifdef CONFIG_SYSCTL
static struct ctl_table event_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "nf_conntrack_events_retry_timeout",
+ .data = &init_net.ct.sysctl_events_retry_timeout,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
{}
};
#endif /* CONFIG_SYSCTL */
goto out;
table[0].data = &net->ct.sysctl_events;
+ table[1].data = &net->ct.sysctl_events_retry_timeout;
net->ct.event_sysctl_header =
register_net_sysctl_table(net,
int ret;
net->ct.sysctl_events = nf_ct_events;
+ net->ct.sysctl_events_retry_timeout = nf_ct_events_retry_timeout;
if (net_eq(net, &init_net)) {
ret = nf_ct_extend_register(&event_extend);
struct sk_buff *skb;
unsigned int type;
unsigned int flags = 0, group;
+ int err;
/* ignore our fake conntrack entry */
if (ct == &nf_conntrack_untracked)
rcu_read_unlock();
nlmsg_end(skb, nlh);
- nfnetlink_send(skb, item->pid, group, item->report, GFP_ATOMIC);
+ err = nfnetlink_send(skb, item->pid, group, item->report, GFP_ATOMIC);
+ if (err == -ENOBUFS || err == -EAGAIN)
+ return -ENOBUFS;
+
return 0;
nla_put_failure:
}
}
- nf_conntrack_event_report(IPCT_DESTROY,
- ct,
- NETLINK_CB(skb).pid,
- nlmsg_report(nlh));
+ if (nf_conntrack_event_report(IPCT_DESTROY, ct,
+ NETLINK_CB(skb).pid,
+ nlmsg_report(nlh)) < 0) {
+ nf_ct_delete_from_lists(ct);
+ /* we failed to report the event, try later */
+ nf_ct_insert_dying_list(ct);
+ nf_ct_put(ct);
+ return 0;
+ }
/* death_by_timeout would report the event again */
set_bit(IPS_DYING_BIT, &ct->status);