--- include/net/netfilter/nf_conntrack.h | 7 - net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c | 4 net/netfilter/nf_conntrack_core.c | 119 +++++++++++------- net/netfilter/nf_conntrack_netlink.c | 17 +- net/netfilter/nf_conntrack_standalone.c | 4 net/netfilter/xt_connlimit.c | 4 6 files changed, 102 insertions(+), 53 deletions(-) Index: linux-2.6.23-rc6.quilt/net/netfilter/nf_conntrack_core.c =================================================================== --- linux-2.6.23-rc6.quilt.orig/net/netfilter/nf_conntrack_core.c 2007-09-16 15:22:06.000000000 +0200 +++ linux-2.6.23-rc6.quilt/net/netfilter/nf_conntrack_core.c 2007-09-23 01:06:08.000000000 +0200 @@ -167,8 +167,8 @@ static void clean_from_lists(struct nf_conn *ct) { pr_debug("clean_from_lists(%p)\n", ct); - hlist_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode); - hlist_del(&ct->tuplehash[IP_CT_DIR_REPLY].hnode); + hlist_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode); + hlist_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnode); /* Destroy all pending expectations */ /* nf_ct_remove_expectations(ct); */ @@ -217,7 +217,7 @@ destroy_conntrack(struct nf_conntrack *n /* We overload first tuple to link into unconfirmed list. */ if (!nf_ct_is_confirmed(ct)) { BUG_ON(hlist_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode)); - hlist_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode); + hlist_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode); } NF_CT_STAT_INC(delete); @@ -227,7 +227,7 @@ destroy_conntrack(struct nf_conntrack *n nf_ct_put(ct->master); pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct); - nf_conntrack_free(ct); + nf_conntrack_free_rcu(ct); } /* This is used to release any cached data before we suspend the conntrack @@ -273,7 +273,7 @@ __nf_conntrack_find(const struct nf_conn struct hlist_node *n; unsigned int hash = hash_conntrack(tuple); - hlist_for_each_entry(h, n, &nf_conntrack_hash[hash], hnode) { + hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash], hnode) { if (nf_ct_tuplehash_to_ctrack(h) != ignored_conntrack && nf_ct_tuple_equal(tuple, &h->tuple)) { NF_CT_STAT_INC(found); @@ -292,13 +292,18 @@ nf_conntrack_find_get(const struct nf_co { struct nf_conntrack_tuple_hash *h; - spin_lock_bh(&nf_conntrack_lock); + rcu_read_lock(); h = __nf_conntrack_find(tuple, NULL); - if (h) - atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use); - spin_unlock_bh(&nf_conntrack_lock); + /* We could possibly get rid of this atomic operation if we disable + preempt for the entire duration of all hooks but mingo would + probably have something to say about that :) -MJ */ + if (h && atomic_inc_not_zero(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use)) { + rcu_read_unlock(); + return h; + } + rcu_read_unlock(); - return h; + return NULL; } EXPORT_SYMBOL_GPL(nf_conntrack_find_get); @@ -307,9 +312,9 @@ static void __nf_conntrack_hash_insert(s unsigned int repl_hash) { ct->id = ++nf_conntrack_next_id; - hlist_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode, + hlist_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode, &nf_conntrack_hash[hash]); - hlist_add_head(&ct->tuplehash[IP_CT_DIR_REPLY].hnode, + hlist_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnode, &nf_conntrack_hash[repl_hash]); } @@ -364,17 +369,18 @@ __nf_conntrack_confirm(struct sk_buff ** /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. */ - hlist_for_each_entry(h, n, &nf_conntrack_hash[hash], hnode) + /* This is painful to do, especially under the lock -MJ */ + hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash], hnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, &h->tuple)) goto out; - hlist_for_each_entry(h, n, &nf_conntrack_hash[repl_hash], hnode) + hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[repl_hash], hnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, &h->tuple)) goto out; /* Remove from unconfirmed list */ - hlist_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode); + hlist_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode); __nf_conntrack_hash_insert(ct, hash, repl_hash); /* Timer relative to confirmation time, not original @@ -382,10 +388,15 @@ __nf_conntrack_confirm(struct sk_buff ** weird delay cases. */ ct->timeout.expires += jiffies; add_timer(&ct->timeout); - atomic_inc(&ct->ct_general.use); + if (!atomic_inc_not_zero(&ct->ct_general.use)) { + /* This shouldn't happen */ + BUG(); + } set_bit(IPS_CONFIRMED_BIT, &ct->status); NF_CT_STAT_INC(insert); + spin_unlock_bh(&nf_conntrack_lock); + help = nfct_help(ct); if (help && rcu_dereference(help->helper)) nf_conntrack_event_cache(IPCT_HELPER, ct); @@ -413,9 +424,9 @@ nf_conntrack_tuple_taken(const struct nf { struct nf_conntrack_tuple_hash *h; - spin_lock_bh(&nf_conntrack_lock); + rcu_read_lock(); h = __nf_conntrack_find(tuple, ignored_conntrack); - spin_unlock_bh(&nf_conntrack_lock); + rcu_read_unlock(); return h != NULL; } @@ -434,9 +445,9 @@ static int early_drop(unsigned int hash) unsigned int i, cnt = 0; int dropped = 0; - spin_lock_bh(&nf_conntrack_lock); + rcu_read_lock(); for (i = 0; i < nf_conntrack_htable_size; i++) { - hlist_for_each_entry(h, n, &nf_conntrack_hash[hash], hnode) { + hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash], hnode) { tmp = nf_ct_tuplehash_to_ctrack(h); if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) ct = tmp; @@ -446,9 +457,12 @@ static int early_drop(unsigned int hash) break; hash = (hash + 1) % nf_conntrack_htable_size; } - if (ct) - atomic_inc(&ct->ct_general.use); - spin_unlock_bh(&nf_conntrack_lock); + if (ct && !atomic_inc_not_zero(&ct->ct_general.use)) { + /* This entry is already dying */ + /* FIXME: This should continue to scan the hashtable */ + ct = NULL; + } + rcu_read_unlock(); if (!ct) return dropped; @@ -472,9 +486,11 @@ struct nf_conn *nf_conntrack_alloc(const nf_conntrack_hash_rnd_initted = 1; } + /* FIXME: make this per cpu or something like that */ /* We don't want any race condition at early drop stage */ atomic_inc(&nf_conntrack_count); + /* FIXME: Don't do an atomic read for each alloc */ if (nf_conntrack_max && atomic_read(&nf_conntrack_count) > nf_conntrack_max) { unsigned int hash = hash_conntrack(orig); @@ -495,9 +511,11 @@ struct nf_conn *nf_conntrack_alloc(const return ERR_PTR(-ENOMEM); } + /* FIXME: Absolutely no need to use an atomic set here */ atomic_set(&conntrack->ct_general.use, 1); conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; + /* Don't set timer yet: wait for confirmation */ setup_timer(&conntrack->timeout, death_by_timeout, (unsigned long)conntrack); @@ -506,13 +524,26 @@ struct nf_conn *nf_conntrack_alloc(const } EXPORT_SYMBOL_GPL(nf_conntrack_alloc); -void nf_conntrack_free(struct nf_conn *conntrack) +void nf_conntrack_free_now(struct nf_conn *conntrack) { nf_ct_ext_free(conntrack); kmem_cache_free(nf_conntrack_cachep, conntrack); + /* FIXME: This should be per cpu or something */ atomic_dec(&nf_conntrack_count); } -EXPORT_SYMBOL_GPL(nf_conntrack_free); +EXPORT_SYMBOL_GPL(nf_conntrack_free_now); + +static void nf_conntrack_free_rcu_callback(struct rcu_head *head) +{ + struct nf_conn *conntrack = container_of(head, struct nf_conn, rcu_head); + nf_conntrack_free_now(conntrack); +} + +void nf_conntrack_free_rcu(struct nf_conn *conntrack) +{ + call_rcu_bh(&conntrack->rcu_head, nf_conntrack_free_rcu_callback); +} +EXPORT_SYMBOL_GPL(nf_conntrack_free_rcu); /* Allocate a new conntrack: we return -ENOMEM if classification failed due to stress. Otherwise it really is unclassifiable. */ @@ -540,13 +571,13 @@ init_conntrack(const struct nf_conntrack } if (!l4proto->new(conntrack, skb, dataoff)) { - nf_conntrack_free(conntrack); + nf_conntrack_free_now(conntrack); pr_debug("init conntrack: can't track with proto module\n"); return NULL; } - spin_lock_bh(&nf_conntrack_lock); - /* exp = nf_ct_find_expectation(tuple); + /* spin_lock_bh(&nf_conntrack_lock); + exp = nf_ct_find_expectation(tuple); if (exp) { struct nf_conntrack_helper *helper; @@ -576,17 +607,21 @@ init_conntrack(const struct nf_conntrack } else */ { struct nf_conntrack_helper *helper; + rcu_read_lock(); helper = nf_ct_helper_find(&repl_tuple); if (helper) { help = nf_ct_helper_ext_add(conntrack, GFP_ATOMIC); if (help) rcu_assign_pointer(help->helper, helper); } + rcu_read_unlock(); NF_CT_STAT_INC(new); } + spin_lock_bh(&nf_conntrack_lock); + /* Overload tuple linked list to put us in unconfirmed list. */ - hlist_add_head(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].hnode, + hlist_add_head_rcu(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].hnode, &unconfirmed); spin_unlock_bh(&nf_conntrack_lock); @@ -728,6 +763,7 @@ nf_conntrack_in(int pf, unsigned int hoo return -ret; } + /* FIXME: Should test if bit is set or not before atomic operation */ if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_STATUS, ct); @@ -824,6 +860,7 @@ void __nf_ct_refresh_acct(struct nf_conn #ifdef CONFIG_NF_CT_ACCT if (do_acct) { + /* FIXME: global lock for this is painful */ spin_lock_bh(&nf_conntrack_lock); ct->counters[CTINFO2DIR(ctinfo)].packets++; ct->counters[CTINFO2DIR(ctinfo)].bytes += @@ -917,25 +954,24 @@ get_next_corpse(int (*iter)(struct nf_co struct nf_conn *ct; struct hlist_node *n; - spin_lock_bh(&nf_conntrack_lock); + rcu_read_lock(); for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { - hlist_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnode) { + hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[*bucket], hnode) { ct = nf_ct_tuplehash_to_ctrack(h); - if (iter(ct, data)) - goto found; + if (iter(ct, data) && + atomic_inc_not_zero(&ct->ct_general.use)) { + rcu_read_unlock(); + return ct; + } } } - hlist_for_each_entry(h, n, &unconfirmed, hnode) { + hlist_for_each_entry_rcu(h, n, &unconfirmed, hnode) { ct = nf_ct_tuplehash_to_ctrack(h); if (iter(ct, data)) set_bit(IPS_DYING_BIT, &ct->status); } - spin_unlock_bh(&nf_conntrack_lock); + rcu_read_unlock(); return NULL; -found: - atomic_inc(&ct->ct_general.use); - spin_unlock_bh(&nf_conntrack_lock); - return ct; } void @@ -950,6 +986,7 @@ nf_ct_iterate_cleanup(int (*iter)(struct death_by_timeout((unsigned long)ct); /* ... else the timer will get him soon. */ + /* get_next_corpse() increases the refcount */ nf_ct_put(ct); } } @@ -1063,8 +1100,10 @@ int set_hashsize(const char *val, struct while (!hlist_empty(&nf_conntrack_hash[i])) { h = hlist_entry(nf_conntrack_hash[i].first, struct nf_conntrack_tuple_hash, hnode); - hlist_del(&h->hnode); + hlist_del_rcu(&h->hnode); bucket = __hash_conntrack(&h->tuple, hashsize, rnd); + /* No need to use rcu alternatives here, noone can + access this new hashtable yet. */ hlist_add_head(&h->hnode, &hash[bucket]); } } Index: linux-2.6.23-rc6.quilt/net/netfilter/nf_conntrack_standalone.c =================================================================== --- linux-2.6.23-rc6.quilt.orig/net/netfilter/nf_conntrack_standalone.c 2007-09-16 15:22:06.000000000 +0200 +++ linux-2.6.23-rc6.quilt/net/netfilter/nf_conntrack_standalone.c 2007-09-16 15:22:06.000000000 +0200 @@ -93,7 +93,7 @@ static struct hlist_node *ct_get_idx(str static void *ct_seq_start(struct seq_file *seq, loff_t *pos) { - spin_lock_bh(&nf_conntrack_lock); + rcu_read_lock(); return ct_get_idx(seq, *pos); } @@ -105,7 +105,7 @@ static void *ct_seq_next(struct seq_file static void ct_seq_stop(struct seq_file *s, void *v) { - spin_unlock_bh(&nf_conntrack_lock); + rcu_read_unlock(); } /* return 0 on success, 1 in case of error */ Index: linux-2.6.23-rc6.quilt/net/netfilter/nf_conntrack_netlink.c =================================================================== --- linux-2.6.23-rc6.quilt.orig/net/netfilter/nf_conntrack_netlink.c 2007-09-16 15:22:06.000000000 +0200 +++ linux-2.6.23-rc6.quilt/net/netfilter/nf_conntrack_netlink.c 2007-09-23 00:55:01.000000000 +0200 @@ -432,11 +432,11 @@ ctnetlink_dump_table(struct sk_buff *skb struct nfgenmsg *nfmsg = NLMSG_DATA(cb->nlh); u_int8_t l3proto = nfmsg->nfgen_family; - spin_lock_bh(&nf_conntrack_lock); + rcu_read_lock(); last = (struct nf_conn *)cb->args[1]; for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) { restart: - hlist_for_each_entry(h, n, &nf_conntrack_hash[cb->args[0]], + hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[cb->args[0]], hnode) { if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; @@ -471,7 +471,7 @@ restart: } } out: - spin_unlock_bh(&nf_conntrack_lock); + rcu_read_unlock(); if (last) nf_ct_put(last); @@ -1001,7 +1001,7 @@ ctnetlink_create_conntrack(struct nfattr return 0; err: - nf_conntrack_free(ct); + nf_conntrack_free_now(ct); return err; } @@ -1030,21 +1030,24 @@ ctnetlink_new_conntrack(struct sock *ctn return err; } - spin_lock_bh(&nf_conntrack_lock); + rcu_read_lock(); if (cda[CTA_TUPLE_ORIG-1]) h = __nf_conntrack_find(&otuple, NULL); else if (cda[CTA_TUPLE_REPLY-1]) h = __nf_conntrack_find(&rtuple, NULL); if (h == NULL) { - spin_unlock_bh(&nf_conntrack_lock); + rcu_read_unlock(); err = -ENOENT; if (nlh->nlmsg_flags & NLM_F_CREATE) err = ctnetlink_create_conntrack(cda, &otuple, &rtuple); return err; } + rcu_read_unlock(); + /* implicit 'else' */ + spin_lock_bh(&nf_conntrack_lock); /* We manipulate the conntrack inside the global conntrack table lock, * so there's no need to increase the refcount */ err = -EEXIST; @@ -1244,6 +1247,7 @@ ctnetlink_exp_dump_table(struct sk_buff struct hlist_node *n; u_int8_t l3proto = nfmsg->nfgen_family; + /* FIXME: Expectations are disabled and should be rcuified */ spin_lock_bh(&nf_conntrack_lock); last = (struct nf_conntrack_expect *)cb->args[1]; for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) { @@ -1390,6 +1394,7 @@ ctnetlink_del_expect(struct sock *ctnl, char *name = NFA_DATA(cda[CTA_EXPECT_HELP_NAME-1]); struct nf_conn_help *m_help; + /* FIXME: Why do we have all this duplicate code? */ /* delete all expectations for this helper */ spin_lock_bh(&nf_conntrack_lock); h = nf_conntrack_helper_find_byname(name); Index: linux-2.6.23-rc6.quilt/net/netfilter/xt_connlimit.c =================================================================== --- linux-2.6.23-rc6.quilt.orig/net/netfilter/xt_connlimit.c 2007-09-16 15:22:06.000000000 +0200 +++ linux-2.6.23-rc6.quilt/net/netfilter/xt_connlimit.c 2007-09-16 15:22:06.000000000 +0200 @@ -119,7 +119,7 @@ static int count_them(struct xt_connlimi else hash = &data->iphash[connlimit_iphash(addr->ip & mask->ip)]; - spin_lock_bh(&nf_conntrack_lock); + rcu_read_lock(); /* check the saved connections */ list_for_each_entry_safe(conn, tmp, hash, list) { @@ -162,7 +162,7 @@ static int count_them(struct xt_connlimi ++matches; } - spin_unlock_bh(&nf_conntrack_lock); + rcu_read_unlock(); if (addit) { /* save the new connection in our list */ Index: linux-2.6.23-rc6.quilt/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c =================================================================== --- linux-2.6.23-rc6.quilt.orig/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c 2007-09-16 15:22:06.000000000 +0200 +++ linux-2.6.23-rc6.quilt/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c 2007-09-16 15:22:06.000000000 +0200 @@ -74,7 +74,7 @@ static struct hlist_node *ct_get_idx(str static void *ct_seq_start(struct seq_file *seq, loff_t *pos) { - spin_lock_bh(&nf_conntrack_lock); + rcu_read_lock(); return ct_get_idx(seq, *pos); } @@ -86,7 +86,7 @@ static void *ct_seq_next(struct seq_file static void ct_seq_stop(struct seq_file *s, void *v) { - spin_unlock_bh(&nf_conntrack_lock); + rcu_read_unlock(); } static int ct_seq_show(struct seq_file *s, void *v) Index: linux-2.6.23-rc6.quilt/include/net/netfilter/nf_conntrack.h =================================================================== --- linux-2.6.23-rc6.quilt.orig/include/net/netfilter/nf_conntrack.h 2007-09-23 00:46:00.000000000 +0200 +++ linux-2.6.23-rc6.quilt/include/net/netfilter/nf_conntrack.h 2007-09-23 00:56:38.000000000 +0200 @@ -17,6 +17,7 @@ #ifdef __KERNEL__ #include #include +#include #include #include @@ -98,6 +99,9 @@ struct nf_conn plus 1 for any connection(s) we are `master' for */ struct nf_conntrack ct_general; + /* Used for deferred freeing */ + struct rcu_head rcu_head; + /* XXX should I move this to the tail ? - Y.K */ /* These are my tuples; original and reply */ struct nf_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX]; @@ -231,7 +235,8 @@ extern int nf_ct_no_defrag; /* Iterate over all conntracks: if iter returns true, it's deleted. */ extern void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data); -extern void nf_conntrack_free(struct nf_conn *ct); +extern void nf_conntrack_free_now(struct nf_conn *ct); +extern void nf_conntrack_free_rcu(struct nf_conn *ct); extern struct nf_conn * nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, const struct nf_conntrack_tuple *repl);