Use RCU to protect the l3/l4 protocol arrays. This patch adds pointers to the l3/l4 protocol handlers to 'struct nf_conn' in order to ensure that the same handlers are used for the entire lifetime of the conntrack entry. It also reworks nf_ct_iterate_cleanup() to wait until the entries scheduled for deletion really are gone; this is needed for safe protocol unregistration and removes the need for protocol handler refcounts. Finally, it adds rcu_read_lock()/rcu_read_unlock() around each hook function so that no grace period can complete while a hook is running on -rt kernels. Signed-off-by: Martin Josefsson --- include/net/netfilter/nf_conntrack.h | 4 include/net/netfilter/nf_conntrack_core.h | 2 include/net/netfilter/nf_conntrack_l3proto.h | 12 - include/net/netfilter/nf_conntrack_l4proto.h | 7 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 90 ++++++++-- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 2 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 87 +++++++--- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 2 net/netfilter/nf_conntrack_core.c | 181 +++++++++++++++------ net/netfilter/nf_conntrack_expect.c | 4 net/netfilter/nf_conntrack_netlink.c | 212 ++++++++++++++++--- net/netfilter/nf_conntrack_proto.c | 98 ++++------- net/netfilter/nf_conntrack_standalone.c | 101 ++++++++--- net/netfilter/xt_connmark.c | 3 net/netfilter/xt_conntrack.c | 3 net/netfilter/xt_helper.c | 3 net/netfilter/xt_state.c | 3 17 files changed, 537 insertions(+), 277 deletions(-) Index: linux-2.6.19-rc3-git4.quilt/include/net/netfilter/nf_conntrack_l3proto.h =================================================================== --- linux-2.6.19-rc3-git4.quilt.orig/include/net/netfilter/nf_conntrack_l3proto.h 2006-11-02 19:14:48.000000000 +0100 +++ linux-2.6.19-rc3-git4.quilt/include/net/netfilter/nf_conntrack_l3proto.h 2006-11-02 19:15:46.000000000 +0100 @@ -89,21 +89,11 @@ extern int nf_conntrack_l3proto_register extern int nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto); extern struct nf_conntrack_l3proto * -nf_ct_l3proto_find_get(u_int16_t l3proto); - -extern void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p); +nf_ct_l3proto_find(u_int16_t l3proto); /* Existing built-in protocols */ extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4; extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6; extern struct nf_conntrack_l3proto nf_conntrack_l3proto_generic; -static inline struct nf_conntrack_l3proto * -__nf_ct_l3proto_find(u_int16_t l3proto) -{ - if (unlikely(l3proto >= AF_MAX)) - return &nf_conntrack_l3proto_generic; - return nf_ct_l3protos[l3proto]; -} - #endif /*_NF_CONNTRACK_L3PROTO_H*/ Index: linux-2.6.19-rc3-git4.quilt/net/netfilter/nf_conntrack_core.c =================================================================== --- linux-2.6.19-rc3-git4.quilt.orig/net/netfilter/nf_conntrack_core.c 2006-11-02 19:15:20.000000000 +0100 +++ linux-2.6.19-rc3-git4.quilt/net/netfilter/nf_conntrack_core.c 2006-11-02 19:15:46.000000000 +0100 @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -81,6 +82,8 @@ static unsigned int nf_conntrack_next_id DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat); EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat); +LIST_HEAD(nf_deleted); + /* * This scheme offers various size of "struct nf_conn" dependent on * features(helper, nat, ...)
@@ -253,10 +256,13 @@ nf_ct_get_tuple(const struct sk_buff *sk const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto) { + int ret; + NF_CT_TUPLE_U_BLANK(tuple); tuple->src.l3num = l3num; - if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0) + ret = l3proto->pkt_to_tuple(skb, nhoff, tuple); + if (unlikely(ret == 0)) return 0; tuple->dst.protonum = protonum; @@ -271,10 +277,13 @@ nf_ct_invert_tuple(struct nf_conntrack_t const struct nf_conntrack_l3proto *l3proto, const struct nf_conntrack_l4proto *l4proto) { + int ret; + NF_CT_TUPLE_U_BLANK(inverse); inverse->src.l3num = orig->src.l3num; - if (l3proto->invert_tuple(inverse, orig) == 0) + ret = l3proto->invert_tuple(inverse, orig); + if (unlikely(ret == 0)) return 0; inverse->dst.dir = !orig->dst.dir; @@ -284,12 +293,19 @@ nf_ct_invert_tuple(struct nf_conntrack_t } static void -clean_from_lists(struct nf_conn *ct) +clean_from_lists(struct nf_conn *ct, struct list_head *new_list) { DEBUGP("clean_from_lists(%p)\n", ct); + + NF_CT_STAT_INC(delete_list); + list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); list_del(&ct->tuplehash[IP_CT_DIR_REPLY].list); + /* Add conntrack to another list for later tracking */ + if (unlikely(new_list)) + list_add_tail(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list, new_list); + /* Destroy all pending expectations */ nf_ct_remove_expectations(ct); } @@ -298,8 +314,6 @@ static void destroy_conntrack(struct nf_conntrack *nfct) { struct nf_conn *ct = (struct nf_conn *)nfct; - struct nf_conntrack_l3proto *l3proto; - struct nf_conntrack_l4proto *l4proto; DEBUGP("destroy_conntrack(%p)\n", ct); NF_CT_ASSERT(atomic_read(&nfct->use) == 0); @@ -313,18 +327,14 @@ destroy_conntrack(struct nf_conntrack *n nf_ct_deliver_cached_events(ct, 0); local_bh_enable(); - set_bit(IPS_DYING_BIT, &ct->status); - /* To make sure we don't get any weird locking issues here: * destroy_conntrack() MUST NOT be called with a write lock * to nf_conntrack_lock!!! -HW */ - l3proto = __nf_ct_l3proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num); - if (l3proto && l3proto->destroy) - l3proto->destroy(ct); - - l4proto = __nf_ct_l4proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num, ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); - if (l4proto && l4proto->destroy) - l4proto->destroy(ct); + if (unlikely(ct->l3proto->destroy)) + ct->l3proto->destroy(ct); + + if (unlikely(ct->l4proto->destroy)) + ct->l4proto->destroy(ct); if (nf_conntrack_destroyed) nf_conntrack_destroyed(ct); @@ -336,11 +346,11 @@ destroy_conntrack(struct nf_conntrack *n * too. */ nf_ct_remove_expectations(ct); - /* We overload first tuple to link into unconfirmed list. */ - if (!nf_ct_is_confirmed(ct)) { - BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list)); + /* We overload first tuple to link into unconfirmed or deleted list. + list_empty() only works on the list head so instead we use this + ugly ugly hack */ + if (ct->tuplehash[IP_CT_DIR_ORIGINAL].list.prev != LIST_POISON2) list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); - } NF_CT_STAT_INC(delete); spin_unlock_bh(&nf_conntrack_lock); @@ -370,8 +380,8 @@ static void death_by_timeout(unsigned lo spin_lock_bh(&nf_conntrack_lock); /* Inside lock so preempt is disabled on module removal path. * Otherwise we can get spurious warnings. 
*/ - NF_CT_STAT_INC(delete_list); - clean_from_lists(ct); + set_bit(IPS_DYING_BIT, &ct->status); + clean_from_lists(ct, NULL); spin_unlock_bh(&nf_conntrack_lock); nf_ct_put(ct); } @@ -468,6 +478,12 @@ __nf_conntrack_confirm(struct sk_buff ** spin_lock_bh(&nf_conntrack_lock); + /* Abort if this entry is dying, manual removal can cause this. */ + if (unlikely(test_bit(IPS_DYING_BIT, &ct->status))) { + spin_unlock_bh(&nf_conntrack_lock); + return NF_DROP; + } + /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. */ @@ -561,7 +577,8 @@ static int early_drop(struct list_head * static struct nf_conn * __nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, const struct nf_conntrack_tuple *repl, - const struct nf_conntrack_l3proto *l3proto) + struct nf_conntrack_l3proto *l3proto, + struct nf_conntrack_l4proto *l4proto) { struct nf_conn *conntrack = NULL; u_int32_t features = 0; @@ -632,6 +649,14 @@ __nf_conntrack_alloc(const struct nf_con init_timer(&conntrack->timeout); conntrack->timeout.data = (unsigned long)conntrack; conntrack->timeout.function = death_by_timeout; + + /* These are the l3/l4 handlers this entry is going to use, + we are not allowed to look them up again due to possible + synchronization issues with registration / unregistration + of protocol handlers */ + conntrack->l3proto = l3proto; + conntrack->l4proto = l4proto; + read_unlock_bh(&nf_ct_cache_lock); return conntrack; @@ -645,9 +670,12 @@ struct nf_conn *nf_conntrack_alloc(const const struct nf_conntrack_tuple *repl) { struct nf_conntrack_l3proto *l3proto; + struct nf_conntrack_l4proto *l4proto; + + l3proto = nf_ct_l3proto_find(orig->src.l3num); + l4proto = nf_ct_l4proto_find(orig->src.l3num, orig->dst.protonum); - l3proto = __nf_ct_l3proto_find(orig->src.l3num); - return __nf_conntrack_alloc(orig, repl, l3proto); + return __nf_conntrack_alloc(orig, repl, l3proto, l4proto); } void nf_conntrack_free(struct nf_conn *conntrack) @@ -678,13 +706,13 @@ init_conntrack(const struct nf_conntrack return NULL; } - conntrack = __nf_conntrack_alloc(tuple, &repl_tuple, l3proto); + conntrack = __nf_conntrack_alloc(tuple, &repl_tuple, l3proto, l4proto); if (conntrack == NULL || IS_ERR(conntrack)) { DEBUGP("Can't allocate conntrack.\n"); return (struct nf_conntrack_tuple_hash *)conntrack; } - if (!l4proto->new(conntrack, skb, dataoff)) { + if (!conntrack->l4proto->new(conntrack, skb, dataoff)) { nf_conntrack_free(conntrack); DEBUGP("init conntrack: can't track with proto module\n"); return NULL; @@ -799,22 +827,28 @@ nf_conntrack_in(int pf, unsigned int hoo return NF_ACCEPT; } - l3proto = __nf_ct_l3proto_find((u_int16_t)pf); - if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) { + /* nf_ct_l3proto_find() always returns a valid l3proto */ + l3proto = nf_ct_l3proto_find((u_int16_t)pf); + + ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum); + if (unlikely(ret <= 0)) { DEBUGP("not prepared to track yet or error occured\n"); return -ret; } - l4proto = __nf_ct_l4proto_find((u_int16_t)pf, protonum); + /* nf_ct_l4proto_find() always returns a valid l4proto */ + l4proto = nf_ct_l4proto_find((u_int16_t)pf, protonum); /* It may be an special packet, error, unclean... * inverse of the return code tells to the netfilter * core what to do with the packet. 
*/ - if (l4proto->error != NULL && - (ret = l4proto->error(*pskb, dataoff, &ctinfo, pf, hooknum)) <= 0) { - NF_CT_STAT_INC(error); - NF_CT_STAT_INC(invalid); - return -ret; + if (likely(l4proto->error)) { + ret = l4proto->error(*pskb, dataoff, &ctinfo, pf, hooknum); + if (unlikely(ret <= 0)) { + NF_CT_STAT_INC(error); + NF_CT_STAT_INC(invalid); + return -ret; + } } ct = resolve_normal_ct(*pskb, dataoff, pf, protonum, l3proto, l4proto, @@ -833,7 +867,7 @@ nf_conntrack_in(int pf, unsigned int hoo NF_CT_ASSERT((*pskb)->nfct); - ret = l4proto->packet(ct, *pskb, dataoff, ctinfo, pf, hooknum); + ret = ct->l4proto->packet(ct, *pskb, dataoff, ctinfo, pf, hooknum); if (ret < 0) { /* Invalid: inverse of the return code tells * the netfilter core what to do */ @@ -853,10 +887,18 @@ nf_conntrack_in(int pf, unsigned int hoo int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse, const struct nf_conntrack_tuple *orig) { - return nf_ct_invert_tuple(inverse, orig, - __nf_ct_l3proto_find(orig->src.l3num), - __nf_ct_l4proto_find(orig->src.l3num, - orig->dst.protonum)); + struct nf_conntrack_l3proto *l3proto; + struct nf_conntrack_l4proto *l4proto; + int ret; + + rcu_read_lock(); + l3proto = nf_ct_l3proto_find(orig->src.l3num); + l4proto = nf_ct_l4proto_find(orig->src.l3num, orig->dst.protonum); + + ret = nf_ct_invert_tuple(inverse, orig, l3proto, l4proto); + rcu_read_unlock(); + + return ret; } /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ @@ -1013,6 +1055,8 @@ found: return ct; } +/* Loop over all entries matching iter() and slay them, wait until they are + dead */ void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data) { @@ -1020,13 +1064,47 @@ nf_ct_iterate_cleanup(int (*iter)(struct unsigned int bucket = 0; while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) { - /* Time to push up daises... */ - if (del_timer(&ct->timeout)) - death_by_timeout((unsigned long)ct); - /* ... else the timer will get him soon. */ + if (nf_ct_is_confirmed(ct)) { + /* Time to push up daises... and move to the delete list */ + if (del_timer(&ct->timeout)) { + spin_lock_bh(&nf_conntrack_lock); + set_bit(IPS_DYING_BIT, &ct->status); + clean_from_lists(ct, &nf_deleted); + spin_unlock_bh(&nf_conntrack_lock); + nf_ct_put(ct); + } + /* We lost the race, the refcount should get it soon. */ + } else { + /* Move the unconfirmed conntracks from the unconfirmed + list to the delete list */ + spin_lock_bh(&nf_conntrack_lock); + set_bit(IPS_DYING_BIT, &ct->status); + list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); + list_add_tail(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list, &nf_deleted); + spin_unlock_bh(&nf_conntrack_lock); + } + /* get_next_corpse() increases the refcount */ nf_ct_put(ct); } + +i_see_dead_people: + /* Wait for all entries in the delete list to vanish due to refcount + dropping to zero and ->destroy() beeing called. This should also + catch the ones that we lost the del_timer() race for. */ + synchronize_net(); + + /* We wait here for all the entries to go away because it's better to + be safe than sorry. The synchronize_net() above should make sure + that noone is poking around in whatever we are unregistering / + doing funky stuff with, but wait anyway. 
*/ + spin_lock_bh(&nf_conntrack_lock); + if (!list_empty(&nf_deleted)) { + spin_unlock_bh(&nf_conntrack_lock); + schedule(); + goto i_see_dead_people; + } + spin_unlock_bh(&nf_conntrack_lock); } static int kill_all(struct nf_conn *i, void *data) @@ -1057,16 +1135,18 @@ void nf_conntrack_cleanup(void) ip_ct_attach = NULL; /* This makes sure all current packets have passed through - netfilter framework. Roll on, two-stage module + netfilter framework. Roll on, multi-stage module delete... */ synchronize_net(); - i_see_dead_people: +i_see_very_dead_people: nf_conntrack_flush(); + if (atomic_read(&nf_conntrack_count) != 0) { schedule(); - goto i_see_dead_people; + goto i_see_very_dead_people; } + /* wait until all references to nf_conntrack_untracked are dropped */ while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1) schedule(); @@ -1085,9 +1165,11 @@ void nf_conntrack_cleanup(void) /* free l3proto protocol tables */ for (i = 0; i < PF_MAX; i++) - if (nf_ct_protos[i]) { - kfree(nf_ct_protos[i]); - nf_ct_protos[i] = NULL; + /* rcu_dereference() / rcu_assign_pointer() aren't really + needed here but use them for the sake of clarity. */ + if (rcu_dereference(nf_ct_protos[i])) { + kfree(rcu_dereference(nf_ct_protos[i])); + rcu_assign_pointer(nf_ct_protos[i], NULL); } } @@ -1211,7 +1293,8 @@ int __init nf_conntrack_init(void) /* Don't NEED lock here, but good form anyway. */ spin_lock_bh(&nf_conntrack_lock); for (i = 0; i < AF_MAX; i++) - nf_ct_l3protos[i] = &nf_conntrack_l3proto_generic; + rcu_assign_pointer(nf_ct_l3protos[i], + &nf_conntrack_l3proto_generic); spin_unlock_bh(&nf_conntrack_lock); /* For use by REJECT target */ Index: linux-2.6.19-rc3-git4.quilt/net/netfilter/nf_conntrack_standalone.c =================================================================== --- linux-2.6.19-rc3-git4.quilt.orig/net/netfilter/nf_conntrack_standalone.c 2006-11-02 19:14:56.000000000 +0100 +++ linux-2.6.19-rc3-git4.quilt/net/netfilter/nf_conntrack_standalone.c 2006-11-02 19:15:46.000000000 +0100 @@ -131,8 +131,6 @@ static int ct_seq_show(struct seq_file * { const struct nf_conntrack_tuple_hash *hash = v; const struct nf_conn *conntrack = nf_ct_tuplehash_to_ctrack(hash); - struct nf_conntrack_l3proto *l3proto; - struct nf_conntrack_l4proto *l4proto; NF_CT_ASSERT(conntrack); @@ -140,33 +138,23 @@ static int ct_seq_show(struct seq_file * if (NF_CT_DIRECTION(hash)) return 0; - l3proto = __nf_ct_l3proto_find(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.src.l3num); - - NF_CT_ASSERT(l3proto); - l4proto = __nf_ct_l4proto_find(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.src.l3num, - conntrack->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.dst.protonum); - NF_CT_ASSERT(proto); - if (seq_printf(s, "%-8s %u %-8s %u %ld ", - l3proto->name, + conntrack->l3proto->name, conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num, - l4proto->name, + conntrack->l4proto->name, conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum, timer_pending(&conntrack->timeout) ? 
(long)(conntrack->timeout.expires - jiffies)/HZ : 0) != 0) return -ENOSPC; - if (l3proto->print_conntrack(s, conntrack)) + if (conntrack->l3proto->print_conntrack(s, conntrack)) return -ENOSPC; - if (l4proto->print_conntrack(s, conntrack)) + if (conntrack->l4proto->print_conntrack(s, conntrack)) return -ENOSPC; if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - l3proto, l4proto)) + conntrack->l3proto, conntrack->l4proto)) return -ENOSPC; if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_ORIGINAL])) @@ -177,7 +165,7 @@ static int ct_seq_show(struct seq_file * return -ENOSPC; if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple, - l3proto, l4proto)) + conntrack->l3proto, conntrack->l4proto)) return -ENOSPC; if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_REPLY])) @@ -239,6 +227,64 @@ static struct file_operations ct_file_op .release = seq_release_private, }; +static void *deleted_seq_start(struct seq_file *s, loff_t *pos) +{ + struct list_head *e = &nf_deleted; + loff_t i; + + /* strange seq_file api calls stop even if we fail, + * thus we need to grab lock since stop unlocks */ + spin_lock_bh(&nf_conntrack_lock); + + if (list_empty(e)) + return NULL; + + for (i = 0; i <= *pos; i++) { + e = e->next; + if (e == &nf_deleted) + return NULL; + } + return e; +} + +static void *deleted_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct list_head *e = v; + + ++*pos; + e = e->next; + + if (e == &nf_deleted) + return NULL; + + return e; +} + +static void deleted_seq_stop(struct seq_file *s, void *v) +{ + spin_unlock_bh(&nf_conntrack_lock); +} + +static struct seq_operations deleted_seq_ops = { + .start = deleted_seq_start, + .next = deleted_seq_next, + .stop = deleted_seq_stop, + .show = ct_seq_show +}; + +static int deleted_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &deleted_seq_ops); +} + +struct file_operations deleted_file_ops = { + .owner = THIS_MODULE, + .open = deleted_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) { int cpu; @@ -568,7 +614,7 @@ EXPORT_SYMBOL(nf_ct_log_invalid); static int __init nf_conntrack_standalone_init(void) { #ifdef CONFIG_PROC_FS - struct proc_dir_entry *proc, *proc_exp, *proc_stat; + struct proc_dir_entry *proc, *proc_exp, *proc_stat, *proc_deleted; #endif int ret = 0; @@ -584,9 +630,13 @@ static int __init nf_conntrack_standalon &exp_file_ops); if (!proc_exp) goto cleanup_proc; + proc_deleted = proc_net_fops_create("nf_conntrack_deleted", 0440, + &deleted_file_ops); + if (!proc_deleted) goto cleanup_proc_exp; + proc_stat = create_proc_entry("nf_conntrack", S_IRUGO, proc_net_stat); if (!proc_stat) - goto cleanup_proc_exp; + goto cleanup_proc_deleted; proc_stat->proc_fops = &ct_cpu_seq_fops; proc_stat->owner = THIS_MODULE; @@ -606,6 +656,8 @@ static int __init nf_conntrack_standalon #endif #ifdef CONFIG_PROC_FS remove_proc_entry("nf_conntrack", proc_net_stat); + cleanup_proc_deleted: + proc_net_remove("nf_conntrack_deleted"); cleanup_proc_exp: proc_net_remove("nf_conntrack_expect"); cleanup_proc: @@ -623,6 +675,7 @@ static void __exit nf_conntrack_standalo #endif #ifdef CONFIG_PROC_FS remove_proc_entry("nf_conntrack", proc_net_stat); + proc_net_remove("nf_conntrack_deleted"); proc_net_remove("nf_conntrack_expect"); proc_net_remove("nf_conntrack"); #endif /* CNFIG_PROC_FS */ @@ -647,7 +700,6 @@ EXPORT_PER_CPU_SYMBOL_GPL(nf_conntrack_e EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events); #endif 
EXPORT_SYMBOL(nf_ct_l3proto_try_module_get); -EXPORT_SYMBOL(nf_ct_l3proto_module_put); EXPORT_SYMBOL(nf_conntrack_l3proto_register); EXPORT_SYMBOL(nf_conntrack_l3proto_unregister); EXPORT_SYMBOL(nf_conntrack_l4proto_register); @@ -660,11 +712,8 @@ EXPORT_SYMBOL(nf_conntrack_helper_unregi EXPORT_SYMBOL(nf_ct_iterate_cleanup); EXPORT_SYMBOL(__nf_ct_refresh_acct); EXPORT_SYMBOL(nf_ct_protos); -EXPORT_SYMBOL(__nf_ct_l4proto_find); -EXPORT_SYMBOL(nf_ct_l4proto_find_get); -EXPORT_SYMBOL(nf_ct_l4proto_put); -EXPORT_SYMBOL(nf_ct_l3proto_find_get); -EXPORT_SYMBOL(nf_ct_l3proto_put); +EXPORT_SYMBOL(nf_ct_l4proto_find); +EXPORT_SYMBOL(nf_ct_l3proto_find); EXPORT_SYMBOL(nf_ct_l3protos); EXPORT_SYMBOL_GPL(nf_conntrack_checksum); EXPORT_SYMBOL(nf_conntrack_expect_alloc); Index: linux-2.6.19-rc3-git4.quilt/include/net/netfilter/nf_conntrack.h =================================================================== --- linux-2.6.19-rc3-git4.quilt.orig/include/net/netfilter/nf_conntrack.h 2006-11-02 19:14:32.000000000 +0100 +++ linux-2.6.19-rc3-git4.quilt/include/net/netfilter/nf_conntrack.h 2006-11-02 19:15:46.000000000 +0100 @@ -103,6 +103,10 @@ struct nf_conn struct ip_conntrack_counter counters[IP_CT_DIR_MAX]; #endif + /* Protocol handlers for this entry */ + struct nf_conntrack_l3proto *l3proto; + struct nf_conntrack_l4proto *l4proto; + /* Unique ID that identifies this conntrack*/ unsigned int id; Index: linux-2.6.19-rc3-git4.quilt/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c =================================================================== --- linux-2.6.19-rc3-git4.quilt.orig/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c 2006-11-02 19:14:44.000000000 +0100 +++ linux-2.6.19-rc3-git4.quilt/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c 2006-11-02 19:15:46.000000000 +0100 @@ -128,8 +128,17 @@ static unsigned int ipv4_confirm(unsigne const struct net_device *out, int (*okfn)(struct sk_buff *)) { + unsigned int ret; + + /* Prevent grace periods during execution of this hook function */ + rcu_read_lock(); + /* We've seen it coming out the other side: confirm it */ - return nf_conntrack_confirm(pskb); + ret = nf_conntrack_confirm(pskb); + + rcu_read_unlock(); + + return ret; } static unsigned int ipv4_conntrack_help(unsigned int hooknum, @@ -141,20 +150,33 @@ static unsigned int ipv4_conntrack_help( struct nf_conn *ct; enum ip_conntrack_info ctinfo; struct nf_conn_help *help; + unsigned int ret; + + /* Prevent grace periods during execution of this hook function */ + rcu_read_lock(); /* This is where we call the helper: as the packet goes out. 
*/ ct = nf_ct_get(*pskb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY) - return NF_ACCEPT; + if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY) { + ret = NF_ACCEPT; + goto out; + } help = nfct_help(ct); - if (!help || !help->helper) - return NF_ACCEPT; + if (!help || !help->helper) { + ret = NF_ACCEPT; + goto out; + } + + ret = help->helper->help(pskb, + (*pskb)->nh.raw - (*pskb)->data + + (*pskb)->nh.iph->ihl*4, + ct, ctinfo); - return help->helper->help(pskb, - (*pskb)->nh.raw - (*pskb)->data - + (*pskb)->nh.iph->ihl*4, - ct, ctinfo); +out: + rcu_read_unlock(); + + return ret; } static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, @@ -163,11 +185,18 @@ static unsigned int ipv4_conntrack_defra const struct net_device *out, int (*okfn)(struct sk_buff *)) { + unsigned int ret = NF_ACCEPT; + + /* Prevent grace periods during execution of this hook function */ + rcu_read_lock(); + #if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE) /* Previously seen (loopback)? Ignore. Do this before fragment check. */ - if ((*pskb)->nfct) - return NF_ACCEPT; + if ((*pskb)->nfct) { + ret = NF_ACCEPT; + goto out; + } #endif /* Gather fragments. */ @@ -176,10 +205,16 @@ static unsigned int ipv4_conntrack_defra hooknum == NF_IP_PRE_ROUTING ? IP_DEFRAG_CONNTRACK_IN : IP_DEFRAG_CONNTRACK_OUT); - if (!*pskb) - return NF_STOLEN; + if (!*pskb) { + ret = NF_STOLEN; + goto out; + } } - return NF_ACCEPT; + +out: + rcu_read_unlock(); + + return ret; } static unsigned int ipv4_conntrack_in(unsigned int hooknum, @@ -188,7 +223,16 @@ static unsigned int ipv4_conntrack_in(un const struct net_device *out, int (*okfn)(struct sk_buff *)) { - return nf_conntrack_in(PF_INET, hooknum, pskb); + unsigned int ret; + + /* Prevent grace periods during execution of this hook function */ + rcu_read_lock(); + + ret = nf_conntrack_in(PF_INET, hooknum, pskb); + + rcu_read_unlock(); + + return ret; } static unsigned int ipv4_conntrack_local(unsigned int hooknum, @@ -197,14 +241,26 @@ static unsigned int ipv4_conntrack_local const struct net_device *out, int (*okfn)(struct sk_buff *)) { + unsigned int ret; + + /* Prevent grace periods during execution of this hook function */ + rcu_read_lock(); + /* root is playing with raw sockets. 
*/ if ((*pskb)->len < sizeof(struct iphdr) || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { if (net_ratelimit()) printk("ipt_hook: happy cracking.\n"); - return NF_ACCEPT; + ret = NF_ACCEPT; + goto out; } - return nf_conntrack_in(PF_INET, hooknum, pskb); + + ret = nf_conntrack_in(PF_INET, hooknum, pskb); + +out: + rcu_read_unlock(); + + return ret; } /* Connection tracking may drop packets, but never alters them, so Index: linux-2.6.19-rc3-git4.quilt/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c =================================================================== --- linux-2.6.19-rc3-git4.quilt.orig/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c 2006-11-02 19:14:44.000000000 +0100 +++ linux-2.6.19-rc3-git4.quilt/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c 2006-11-02 19:15:46.000000000 +0100 @@ -180,35 +180,47 @@ static unsigned int ipv6_confirm(unsigne struct nf_conn *ct; struct nf_conn_help *help; enum ip_conntrack_info ctinfo; - unsigned int ret, protoff; + unsigned int ret = NF_ACCEPT, protoff; unsigned int extoff = (u8*)((*pskb)->nh.ipv6h + 1) - (*pskb)->data; unsigned char pnum = (*pskb)->nh.ipv6h->nexthdr; + /* Prevent grace periods during execution of this hook function */ + rcu_read_lock(); + /* This is where we call the helper: as the packet goes out. */ ct = nf_ct_get(*pskb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY) + if (unlikely(!ct)) goto out; + if (ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY) + goto confirm; help = nfct_help(ct); if (!help || !help->helper) - goto out; + goto confirm; protoff = nf_ct_ipv6_skip_exthdr(*pskb, extoff, &pnum, (*pskb)->len - extoff); if (protoff < 0 || protoff > (*pskb)->len || pnum == NEXTHDR_FRAGMENT) { DEBUGP("proto header not found\n"); - return NF_ACCEPT; + ret = NF_ACCEPT; + goto out; } ret = help->helper->help(pskb, protoff, ct, ctinfo); if (ret != NF_ACCEPT) - return ret; -out: + goto out; + +confirm: /* We've seen it coming out the other side: confirm it */ - return nf_conntrack_confirm(pskb); + ret = nf_conntrack_confirm(pskb); + +out: + rcu_read_unlock(); + + return ret; } extern struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb); @@ -223,25 +235,38 @@ static unsigned int ipv6_defrag(unsigned int (*okfn)(struct sk_buff *)) { struct sk_buff *reasm; + unsigned int ret = NF_STOLEN; + + /* Prevent grace periods during execution of this hook function */ + rcu_read_lock(); /* Previously seen (loopback)? */ - if ((*pskb)->nfct) - return NF_ACCEPT; + if ((*pskb)->nfct) { + ret = NF_ACCEPT; + goto out; + } reasm = nf_ct_frag6_gather(*pskb); /* queued */ - if (reasm == NULL) - return NF_STOLEN; + if (reasm == NULL) { + ret = NF_STOLEN; + goto out; + } /* error occured or not fragmented */ - if (reasm == *pskb) - return NF_ACCEPT; + if (reasm == *pskb) { + ret = NF_ACCEPT; + goto out; + } nf_ct_frag6_output(hooknum, reasm, (struct net_device *)in, (struct net_device *)out, okfn); - return NF_STOLEN; +out: + rcu_read_unlock(); + + return ret; } static unsigned int ipv6_conntrack_in(unsigned int hooknum, @@ -251,23 +276,31 @@ static unsigned int ipv6_conntrack_in(un int (*okfn)(struct sk_buff *)) { struct sk_buff *reasm = (*pskb)->nfct_reasm; + unsigned int ret; + + /* Prevent grace periods during execution of this hook function */ + rcu_read_lock(); /* This packet is fragmented and has reassembled packet. */ if (reasm) { /* Reassembled packet isn't parsed yet ? 
*/ if (!reasm->nfct) { - unsigned int ret; - ret = nf_conntrack_in(PF_INET6, hooknum, &reasm); if (ret != NF_ACCEPT) - return ret; + goto out; } nf_conntrack_get(reasm->nfct); (*pskb)->nfct = reasm->nfct; - return NF_ACCEPT; + ret = NF_ACCEPT; + goto out; } - return nf_conntrack_in(PF_INET6, hooknum, pskb); + ret = nf_conntrack_in(PF_INET6, hooknum, pskb); + +out: + rcu_read_unlock(); + + return ret; } static unsigned int ipv6_conntrack_local(unsigned int hooknum, @@ -276,13 +309,25 @@ static unsigned int ipv6_conntrack_local const struct net_device *out, int (*okfn)(struct sk_buff *)) { + unsigned int ret; + + /* Prevent grace periods during execution of this hook function */ + rcu_read_lock(); + /* root is playing with raw sockets. */ if ((*pskb)->len < sizeof(struct ipv6hdr)) { if (net_ratelimit()) printk("ipv6_conntrack_local: packet too short\n"); - return NF_ACCEPT; + ret = NF_ACCEPT; + goto out; } - return ipv6_conntrack_in(hooknum, pskb, in, out, okfn); + + ret = ipv6_conntrack_in(hooknum, pskb, in, out, okfn); + +out: + rcu_read_unlock(); + + return ret; } static struct nf_hook_ops ipv6_conntrack_ops[] = { Index: linux-2.6.19-rc3-git4.quilt/net/netfilter/nf_conntrack_proto.c =================================================================== --- linux-2.6.19-rc3-git4.quilt.orig/net/netfilter/nf_conntrack_proto.c 2006-11-02 19:14:56.000000000 +0100 +++ linux-2.6.19-rc3-git4.quilt/net/netfilter/nf_conntrack_proto.c 2006-11-02 19:15:47.000000000 +0100 @@ -30,53 +30,41 @@ struct nf_conntrack_l4proto **nf_ct_protos[PF_MAX] __read_mostly; struct nf_conntrack_l3proto *nf_ct_l3protos[AF_MAX] __read_mostly; +/* Must hold rcu_read_lock() when calling this function. This is guaranteed + to always return a valid protocol helper, since it falls back to + generic_protocol */ struct nf_conntrack_l4proto * -__nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto) +nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto) { - if (unlikely(l3proto >= AF_MAX || nf_ct_protos[l3proto] == NULL)) - return &nf_conntrack_l4proto_generic; + struct nf_conntrack_l4proto **arrp; + struct nf_conntrack_l4proto *l4p = &nf_conntrack_l4proto_generic; - return nf_ct_protos[l3proto][l4proto]; -} - -/* this is guaranteed to always return a valid protocol helper, since - * it falls back to generic_protocol */ -struct nf_conntrack_l4proto * -nf_ct_l4proto_find_get(u_int16_t l3proto, u_int8_t l4proto) -{ - struct nf_conntrack_l4proto *p; + if (unlikely(l3proto >= AF_MAX)) + goto out; - preempt_disable(); - p = __nf_ct_l4proto_find(l3proto, l4proto); - if (!try_module_get(p->me)) - p = &nf_conntrack_l4proto_generic; - preempt_enable(); + arrp = rcu_dereference(nf_ct_protos[l3proto]); + if (unlikely(arrp == NULL)) + goto out; - return p; -} + l4p = rcu_dereference(arrp[l4proto]); -void nf_ct_l4proto_put(struct nf_conntrack_l4proto *p) -{ - module_put(p->me); +out: + return l4p; } +/* Must hold rcu_read_lock() when calling this function */ struct nf_conntrack_l3proto * -nf_ct_l3proto_find_get(u_int16_t l3proto) +nf_ct_l3proto_find(u_int16_t l3proto) { - struct nf_conntrack_l3proto *p; + struct nf_conntrack_l3proto *p = &nf_conntrack_l3proto_generic; - preempt_disable(); - p = __nf_ct_l3proto_find(l3proto); - if (!try_module_get(p->me)) - p = &nf_conntrack_l3proto_generic; - preempt_enable(); + if (unlikely(l3proto >= AF_MAX)) + goto out; - return p; -} + p = rcu_dereference(nf_ct_l3protos[l3proto]); -void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p) -{ - module_put(p->me); +out: + return p; } int @@ -85,7 +73,7 @@ 
nf_ct_l3proto_try_module_get(unsigned sh int ret; struct nf_conntrack_l3proto *p; -retry: p = nf_ct_l3proto_find_get(l3proto); +retry: p = nf_ct_l3proto_find(l3proto); if (p == &nf_conntrack_l3proto_generic) { ret = request_module("nf_conntrack-%d", l3proto); if (!ret) @@ -97,17 +85,6 @@ retry: p = nf_ct_l3proto_find_get(l3prot return 0; } -void nf_ct_l3proto_module_put(unsigned short l3proto) -{ - struct nf_conntrack_l3proto *p; - - preempt_disable(); - p = __nf_ct_l3proto_find(l3proto); - preempt_enable(); - - module_put(p->me); -} - static int kill_l3proto(struct nf_conn *i, void *data) { return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num == @@ -134,11 +111,12 @@ int nf_conntrack_l3proto_register(struct } spin_lock_bh(&nf_conntrack_lock); - if (nf_ct_l3protos[proto->l3proto] != &nf_conntrack_l3proto_generic) { + if (rcu_dereference(nf_ct_l3protos[proto->l3proto]) + != &nf_conntrack_l3proto_generic) { ret = -EBUSY; goto out_unlock; } - nf_ct_l3protos[proto->l3proto] = proto; + rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], proto); out_unlock: spin_unlock_bh(&nf_conntrack_lock); @@ -156,13 +134,14 @@ int nf_conntrack_l3proto_unregister(stru } spin_lock_bh(&nf_conntrack_lock); - if (nf_ct_l3protos[proto->l3proto] != proto) { + if (rcu_dereference(nf_ct_l3protos[proto->l3proto]) != proto) { spin_unlock_bh(&nf_conntrack_lock); ret = -EBUSY; goto out; } - nf_ct_l3protos[proto->l3proto] = &nf_conntrack_l3proto_generic; + rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], + &nf_conntrack_l3proto_generic); spin_unlock_bh(&nf_conntrack_lock); /* Somebody could be still looking at the proto in bh. */ @@ -188,8 +167,8 @@ int nf_conntrack_l4proto_register(struct retry: spin_lock_bh(&nf_conntrack_lock); - if (nf_ct_protos[l4proto->l3proto]) { - if (nf_ct_protos[l4proto->l3proto][l4proto->l4proto] + if (rcu_dereference(nf_ct_protos[l4proto->l3proto])) { + if (rcu_dereference(nf_ct_protos[l4proto->l3proto][l4proto->l4proto]) != &nf_conntrack_l4proto_generic) { ret = -EBUSY; goto out_unlock; @@ -210,7 +189,8 @@ retry: goto out; } for (i = 0; i < MAX_NF_CT_PROTO; i++) - proto_array[i] = &nf_conntrack_l4proto_generic; + rcu_assign_pointer(proto_array[i], + &nf_conntrack_l4proto_generic); spin_lock_bh(&nf_conntrack_lock); if (nf_ct_protos[l4proto->l3proto]) { @@ -218,7 +198,8 @@ retry: spin_unlock_bh(&nf_conntrack_lock); kfree(proto_array); } else { - nf_ct_protos[l4proto->l3proto] = proto_array; + rcu_assign_pointer(nf_ct_protos[l4proto->l3proto], + proto_array); spin_unlock_bh(&nf_conntrack_lock); } @@ -229,7 +210,8 @@ retry: goto retry; } - nf_ct_protos[l4proto->l3proto][l4proto->l4proto] = l4proto; + rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], + l4proto); out_unlock: spin_unlock_bh(&nf_conntrack_lock); @@ -247,14 +229,14 @@ int nf_conntrack_l4proto_unregister(stru } spin_lock_bh(&nf_conntrack_lock); - if (nf_ct_protos[l4proto->l3proto][l4proto->l4proto] + if (rcu_dereference(nf_ct_protos[l4proto->l3proto][l4proto->l4proto]) != l4proto) { spin_unlock_bh(&nf_conntrack_lock); ret = -EBUSY; goto out; } - nf_ct_protos[l4proto->l3proto][l4proto->l4proto] - = &nf_conntrack_l4proto_generic; + rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], + &nf_conntrack_l4proto_generic); spin_unlock_bh(&nf_conntrack_lock); /* Somebody could be still looking at the proto in bh. 
*/ Index: linux-2.6.19-rc3-git4.quilt/net/netfilter/nf_conntrack_netlink.c =================================================================== --- linux-2.6.19-rc3-git4.quilt.orig/net/netfilter/nf_conntrack_netlink.c 2006-11-02 19:14:56.000000000 +0100 +++ linux-2.6.19-rc3-git4.quilt/net/netfilter/nf_conntrack_netlink.c 2006-11-02 19:15:47.000000000 +0100 @@ -48,6 +48,8 @@ MODULE_LICENSE("GPL"); static char __initdata version[] = "0.93"; +#define tuple(ct, dir) (&(ct)->tuplehash[dir].tuple) + static inline int ctnetlink_dump_tuples_proto(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple, @@ -96,16 +98,30 @@ ctnetlink_dump_tuples(struct sk_buff *sk struct nf_conntrack_l3proto *l3proto; struct nf_conntrack_l4proto *l4proto; - l3proto = nf_ct_l3proto_find_get(tuple->src.l3num); + l3proto = nf_ct_l3proto_find(tuple->src.l3num); ret = ctnetlink_dump_tuples_ip(skb, tuple, l3proto); - nf_ct_l3proto_put(l3proto); if (unlikely(ret < 0)) return ret; - l4proto = nf_ct_l4proto_find_get(tuple->src.l3num, tuple->dst.protonum); + l4proto = nf_ct_l4proto_find(tuple->src.l3num, tuple->dst.protonum); ret = ctnetlink_dump_tuples_proto(skb, tuple, l4proto); - nf_ct_l4proto_put(l4proto); + + return ret; +} + +static inline int +ctnetlink_dump_tuples_ct(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_dir dir) +{ + int ret; + + ret = ctnetlink_dump_tuples_ip(skb, tuple(ct, dir), ct->l3proto); + + if (unlikely(ret < 0)) + return ret; + + ret = ctnetlink_dump_tuples_proto(skb, tuple(ct, dir), ct->l4proto); return ret; } @@ -142,20 +158,16 @@ nfattr_failure: static inline int ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct nf_conn *ct) { - struct nf_conntrack_l4proto *l4proto = nf_ct_l4proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num, ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); struct nfattr *nest_proto; int ret; - if (!l4proto->to_nfattr) { - nf_ct_l4proto_put(l4proto); + if (!ct->l4proto->to_nfattr) { return 0; } nest_proto = NFA_NEST(skb, CTA_PROTOINFO); - ret = l4proto->to_nfattr(skb, nest_proto, ct); - - nf_ct_l4proto_put(l4proto); + ret = ct->l4proto->to_nfattr(skb, nest_proto, ct); NFA_NEST_END(skb, nest_proto); @@ -253,12 +265,10 @@ nfattr_failure: return -1; } -#define tuple(ct, dir) (&(ct)->tuplehash[dir].tuple) - static int ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait, - const struct nf_conn *ct) + struct nf_conn *ct) { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; @@ -278,12 +288,12 @@ ctnetlink_fill_info(struct sk_buff *skb, nfmsg->res_id = 0; nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG); - if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) + if (ctnetlink_dump_tuples_ct(skb, ct, IP_CT_DIR_ORIGINAL) < 0) goto nfattr_failure; NFA_NEST_END(skb, nest_parms); nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY); - if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0) + if (ctnetlink_dump_tuples_ct(skb, ct, IP_CT_DIR_REPLY) < 0) goto nfattr_failure; NFA_NEST_END(skb, nest_parms); @@ -358,12 +368,12 @@ static int ctnetlink_conntrack_event(str nfmsg->res_id = 0; nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG); - if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) + if (ctnetlink_dump_tuples_ct(skb, ct, IP_CT_DIR_ORIGINAL) < 0) goto nfattr_failure; NFA_NEST_END(skb, nest_parms); nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY); - if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0) + if (ctnetlink_dump_tuples_ct(skb, ct, IP_CT_DIR_REPLY) < 0) goto nfattr_failure; NFA_NEST_END(skb, 
nest_parms); @@ -473,13 +483,11 @@ ctnetlink_parse_tuple_ip(struct nfattr * nfattr_parse_nested(tb, CTA_IP_MAX, attr); - l3proto = nf_ct_l3proto_find_get(tuple->src.l3num); + l3proto = nf_ct_l3proto_find(tuple->src.l3num); if (likely(l3proto->nfattr_to_tuple)) ret = l3proto->nfattr_to_tuple(tb, tuple); - nf_ct_l3proto_put(l3proto); - return ret; } @@ -504,13 +512,11 @@ ctnetlink_parse_tuple_proto(struct nfatt return -EINVAL; tuple->dst.protonum = *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]); - l4proto = nf_ct_l4proto_find_get(tuple->src.l3num, tuple->dst.protonum); + l4proto = nf_ct_l4proto_find(tuple->src.l3num, tuple->dst.protonum); if (likely(l4proto->nfattr_to_tuple)) ret = l4proto->nfattr_to_tuple(tb, tuple); - nf_ct_l4proto_put(l4proto); - return ret; } @@ -662,6 +668,9 @@ ctnetlink_del_conntrack(struct sock *ctn if (nfattr_bad_size(cda, CTA_MAX, cta_min)) return -EINVAL; + /* Prevent grace periods during execution of this function */ + rcu_read_lock_bh(); + if (cda[CTA_TUPLE_ORIG-1]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3); else if (cda[CTA_TUPLE_REPLY-1]) @@ -669,15 +678,18 @@ ctnetlink_del_conntrack(struct sock *ctn else { /* Flush the whole table */ nf_conntrack_flush(); - return 0; + err = 0; + goto out_unlock_rcu; } if (err < 0) - return err; + goto out_unlock_rcu; h = nf_conntrack_find_get(&tuple, NULL); - if (!h) - return -ENOENT; + if (!h) { + err = -ENOENT; + goto out_unlock_rcu; + } ct = nf_ct_tuplehash_to_ctrack(h); @@ -685,7 +697,8 @@ ctnetlink_del_conntrack(struct sock *ctn u_int32_t id = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_ID-1])); if (ct->id != id) { nf_ct_put(ct); - return -ENOENT; + err = -ENOENT; + goto out_unlock_rcu; } } if (del_timer(&ct->timeout)) @@ -693,7 +706,11 @@ ctnetlink_del_conntrack(struct sock *ctn nf_ct_put(ct); - return 0; + err = 0; + +out_unlock_rcu: + rcu_read_unlock_bh(); + return err; } static int @@ -708,41 +725,55 @@ ctnetlink_get_conntrack(struct sock *ctn u_int8_t u3 = nfmsg->nfgen_family; int err = 0; + /* Prevent grace periods during execution of this function */ + rcu_read_lock_bh(); + if (nlh->nlmsg_flags & NLM_F_DUMP) { u32 rlen; #ifndef CONFIG_NF_CT_ACCT - if (NFNL_MSG_TYPE(nlh->nlmsg_type) == IPCTNL_MSG_CT_GET_CTRZERO) - return -ENOTSUPP; + if (NFNL_MSG_TYPE(nlh->nlmsg_type) == IPCTNL_MSG_CT_GET_CTRZERO) { + err = -ENOTSUPP; + goto out_unlock_rcu; + } #endif if ((*errp = netlink_dump_start(ctnl, skb, nlh, ctnetlink_dump_table, - ctnetlink_done)) != 0) - return -EINVAL; + ctnetlink_done)) != 0) { + err = -EINVAL; + goto out_unlock_rcu; + } rlen = NLMSG_ALIGN(nlh->nlmsg_len); if (rlen > skb->len) rlen = skb->len; skb_pull(skb, rlen); - return 0; + err = 0; + goto out_unlock_rcu; } - if (nfattr_bad_size(cda, CTA_MAX, cta_min)) - return -EINVAL; + if (nfattr_bad_size(cda, CTA_MAX, cta_min)) { + err = -EINVAL; + goto out_unlock_rcu; + } if (cda[CTA_TUPLE_ORIG-1]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3); else if (cda[CTA_TUPLE_REPLY-1]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3); - else - return -EINVAL; + else { + err = -EINVAL; + goto out_unlock_rcu; + } if (err < 0) - return err; + goto out_unlock_rcu; h = nf_conntrack_find_get(&tuple, NULL); - if (!h) - return -ENOENT; + if (!h) { + err = -ENOENT; + goto out_unlock_rcu; + } ct = nf_ct_tuplehash_to_ctrack(h); @@ -750,7 +781,8 @@ ctnetlink_get_conntrack(struct sock *ctn skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb2) { nf_ct_put(ct); - return -ENOMEM; + err = -ENOMEM; + goto out_unlock_rcu; } NETLINK_CB(skb2).dst_pid = 
NETLINK_CB(skb).pid; @@ -762,13 +794,15 @@ ctnetlink_get_conntrack(struct sock *ctn err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); if (err < 0) - goto out; + goto out_unlock_rcu; - return 0; + err = 0; + goto out_unlock_rcu; free: kfree_skb(skb2); -out: +out_unlock_rcu: + rcu_read_unlock_bh(); return err; } @@ -897,11 +931,10 @@ ctnetlink_change_protoinfo(struct nf_con nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr); - l4proto = nf_ct_l4proto_find_get(l3num, npt); + l4proto = nf_ct_l4proto_find(l3num, npt); if (l4proto->from_nfattr) err = l4proto->from_nfattr(tb, ct); - nf_ct_l4proto_put(l4proto); return err; } @@ -1000,16 +1033,19 @@ ctnetlink_new_conntrack(struct sock *ctn if (nfattr_bad_size(cda, CTA_MAX, cta_min)) return -EINVAL; + /* Prevent grace periods during execution of this function */ + rcu_read_lock_bh(); + if (cda[CTA_TUPLE_ORIG-1]) { err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG, u3); if (err < 0) - return err; + goto out_unlock_rcu; } if (cda[CTA_TUPLE_REPLY-1]) { err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY, u3); if (err < 0) - return err; + goto out_unlock_rcu; } spin_lock_bh(&nf_conntrack_lock); @@ -1023,7 +1059,7 @@ ctnetlink_new_conntrack(struct sock *ctn err = -ENOENT; if (nlh->nlmsg_flags & NLM_F_CREATE) err = ctnetlink_create_conntrack(cda, &otuple, &rtuple); - return err; + goto out_unlock_rcu; } /* implicit 'else' */ @@ -1041,6 +1077,8 @@ ctnetlink_new_conntrack(struct sock *ctn out_unlock: spin_unlock_bh(&nf_conntrack_lock); +out_unlock_rcu: + rcu_read_unlock_bh(); return err; } @@ -1076,16 +1114,14 @@ ctnetlink_exp_dump_mask(struct sk_buff * struct nf_conntrack_l4proto *l4proto; struct nfattr *nest_parms = NFA_NEST(skb, CTA_EXPECT_MASK); - l3proto = nf_ct_l3proto_find_get(tuple->src.l3num); + l3proto = nf_ct_l3proto_find(tuple->src.l3num); ret = ctnetlink_dump_tuples_ip(skb, mask, l3proto); - nf_ct_l3proto_put(l3proto); if (unlikely(ret < 0)) goto nfattr_failure; - l4proto = nf_ct_l4proto_find_get(tuple->src.l3num, tuple->dst.protonum); + l4proto = nf_ct_l4proto_find(tuple->src.l3num, tuple->dst.protonum); ret = ctnetlink_dump_tuples_proto(skb, mask, l4proto); - nf_ct_l4proto_put(l4proto); if (unlikely(ret < 0)) goto nfattr_failure; @@ -1254,37 +1290,47 @@ ctnetlink_get_expect(struct sock *ctnl, if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) return -EINVAL; + /* Prevent grace periods during execution of this function */ + rcu_read_lock_bh(); + if (nlh->nlmsg_flags & NLM_F_DUMP) { u32 rlen; if ((*errp = netlink_dump_start(ctnl, skb, nlh, ctnetlink_exp_dump_table, - ctnetlink_done)) != 0) - return -EINVAL; + ctnetlink_done)) != 0) { + err = -EINVAL; + goto out_unlock_rcu; + } rlen = NLMSG_ALIGN(nlh->nlmsg_len); if (rlen > skb->len) rlen = skb->len; skb_pull(skb, rlen); - return 0; + err = 0; + goto out_unlock_rcu; } if (cda[CTA_EXPECT_MASTER-1]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3); - else - return -EINVAL; + else { + err = -EINVAL; + goto out_unlock_rcu; + } if (err < 0) - return err; + goto out_unlock_rcu; exp = nf_conntrack_expect_find(&tuple); - if (!exp) - return -ENOENT; + if (!exp) { + err = -ENOENT; + goto out_unlock_rcu; + } if (cda[CTA_EXPECT_ID-1]) { u_int32_t id = *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]); if (exp->id != ntohl(id)) { - nf_conntrack_expect_put(exp); - return -ENOENT; + err = -ENOENT; + goto out; } } @@ -1302,12 +1348,15 @@ ctnetlink_get_expect(struct sock *ctnl, nf_conntrack_expect_put(exp); - return netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, 
MSG_DONTWAIT); + err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + goto out_unlock_rcu; free: kfree_skb(skb2); out: nf_conntrack_expect_put(exp); +out_unlock_rcu: + rcu_read_unlock_bh(); return err; } @@ -1325,23 +1374,29 @@ ctnetlink_del_expect(struct sock *ctnl, if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) return -EINVAL; + /* Prevent grace periods during execution of this function */ + rcu_read_lock_bh(); + if (cda[CTA_EXPECT_TUPLE-1]) { /* delete a single expect by tuple */ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); if (err < 0) - return err; + goto out_unlock_rcu; /* bump usage count to 2 */ exp = nf_conntrack_expect_find(&tuple); - if (!exp) - return -ENOENT; + if (!exp) { + err = -ENOENT; + goto out_unlock_rcu; + } if (cda[CTA_EXPECT_ID-1]) { u_int32_t id = *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]); if (exp->id != ntohl(id)) { nf_conntrack_expect_put(exp); - return -ENOENT; + err = -ENOENT; + goto out_unlock_rcu; } } @@ -1358,7 +1413,8 @@ ctnetlink_del_expect(struct sock *ctnl, h = __nf_conntrack_helper_find_byname(name); if (!h) { spin_unlock_bh(&nf_conntrack_lock); - return -EINVAL; + err = -EINVAL; + goto out_unlock_rcu; } list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, list) { @@ -1383,8 +1439,13 @@ ctnetlink_del_expect(struct sock *ctnl, spin_unlock_bh(&nf_conntrack_lock); } - return 0; + err = 0; + +out_unlock_rcu: + rcu_read_unlock_bh(); + return err; } + static int ctnetlink_change_expect(struct nf_conntrack_expect *x, struct nfattr *cda[]) { @@ -1463,9 +1524,12 @@ ctnetlink_new_expect(struct sock *ctnl, || !cda[CTA_EXPECT_MASTER-1]) return -EINVAL; + /* Prevent grace periods during execution of this function */ + rcu_read_lock_bh(); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); if (err < 0) - return err; + goto out_unlock_rcu; spin_lock_bh(&nf_conntrack_lock); exp = __nf_conntrack_expect_find(&tuple); @@ -1475,7 +1539,7 @@ ctnetlink_new_expect(struct sock *ctnl, err = -ENOENT; if (nlh->nlmsg_flags & NLM_F_CREATE) err = ctnetlink_create_expect(cda, u3); - return err; + goto out_unlock_rcu; } err = -EEXIST; @@ -1483,6 +1547,8 @@ ctnetlink_new_expect(struct sock *ctnl, err = ctnetlink_change_expect(exp, cda); spin_unlock_bh(&nf_conntrack_lock); +out_unlock_rcu: + rcu_read_unlock_bh(); return err; } Index: linux-2.6.19-rc3-git4.quilt/include/net/netfilter/nf_conntrack_l4proto.h =================================================================== --- linux-2.6.19-rc3-git4.quilt.orig/include/net/netfilter/nf_conntrack_l4proto.h 2006-11-02 19:14:48.000000000 +0100 +++ linux-2.6.19-rc3-git4.quilt/include/net/netfilter/nf_conntrack_l4proto.h 2006-11-02 19:15:47.000000000 +0100 @@ -93,12 +93,7 @@ extern struct nf_conntrack_l4proto nf_co extern struct nf_conntrack_l4proto **nf_ct_protos[PF_MAX]; extern struct nf_conntrack_l4proto * -__nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto); - -extern struct nf_conntrack_l4proto * -nf_ct_l4proto_find_get(u_int16_t l3proto, u_int8_t protocol); - -extern void nf_ct_l4proto_put(struct nf_conntrack_l4proto *p); +nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto); /* Protocol registration. 
*/ extern int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *proto); Index: linux-2.6.19-rc3-git4.quilt/net/netfilter/nf_conntrack_expect.c =================================================================== --- linux-2.6.19-rc3-git4.quilt.orig/net/netfilter/nf_conntrack_expect.c 2006-11-02 19:14:56.000000000 +0100 +++ linux-2.6.19-rc3-git4.quilt/net/netfilter/nf_conntrack_expect.c 2006-11-02 19:15:47.000000000 +0100 @@ -337,8 +337,8 @@ static int exp_seq_show(struct seq_file expect->tuple.src.l3num, expect->tuple.dst.protonum); print_tuple(s, &expect->tuple, - __nf_ct_l3proto_find(expect->tuple.src.l3num), - __nf_ct_l4proto_find(expect->tuple.src.l3num, + nf_ct_l3proto_find(expect->tuple.src.l3num), + nf_ct_l4proto_find(expect->tuple.src.l3num, expect->tuple.dst.protonum)); return seq_putc(s, '\n'); } Index: linux-2.6.19-rc3-git4.quilt/net/ipv4/netfilter/nf_conntrack_proto_icmp.c =================================================================== --- linux-2.6.19-rc3-git4.quilt.orig/net/ipv4/netfilter/nf_conntrack_proto_icmp.c 2006-11-02 19:14:44.000000000 +0100 +++ linux-2.6.19-rc3-git4.quilt/net/ipv4/netfilter/nf_conntrack_proto_icmp.c 2006-11-02 19:15:47.000000000 +0100 @@ -170,7 +170,7 @@ icmp_error_message(struct sk_buff *skb, return -NF_ACCEPT; } - innerproto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol); + innerproto = nf_ct_l4proto_find(PF_INET, inside->ip.protocol); dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp); /* Are they talking about one of our connections? */ if (!nf_ct_get_tuple(skb, dataoff, dataoff + inside->ip.ihl*4, PF_INET, Index: linux-2.6.19-rc3-git4.quilt/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c =================================================================== --- linux-2.6.19-rc3-git4.quilt.orig/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c 2006-11-02 19:14:44.000000000 +0100 +++ linux-2.6.19-rc3-git4.quilt/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c 2006-11-02 19:15:47.000000000 +0100 @@ -185,7 +185,7 @@ icmpv6_error_message(struct sk_buff *skb return -NF_ACCEPT; } - inproto = __nf_ct_l4proto_find(PF_INET6, inprotonum); + inproto = nf_ct_l4proto_find(PF_INET6, inprotonum); /* Are they talking about one of our connections? 
*/ if (!nf_ct_get_tuple(skb, inip6off, inprotoff, PF_INET6, inprotonum, Index: linux-2.6.19-rc3-git4.quilt/net/netfilter/xt_connmark.c =================================================================== --- linux-2.6.19-rc3-git4.quilt.orig/net/netfilter/xt_connmark.c 2006-10-29 22:58:55.000000000 +0100 +++ linux-2.6.19-rc3-git4.quilt/net/netfilter/xt_connmark.c 2006-11-02 19:15:47.000000000 +0100 @@ -76,9 +76,6 @@ checkentry(const char *tablename, static void destroy(const struct xt_match *match, void *matchinfo) { -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) - nf_ct_l3proto_module_put(match->family); -#endif } #ifdef CONFIG_COMPAT Index: linux-2.6.19-rc3-git4.quilt/net/netfilter/xt_conntrack.c =================================================================== --- linux-2.6.19-rc3-git4.quilt.orig/net/netfilter/xt_conntrack.c 2006-10-29 22:58:55.000000000 +0100 +++ linux-2.6.19-rc3-git4.quilt/net/netfilter/xt_conntrack.c 2006-11-02 19:15:47.000000000 +0100 @@ -240,9 +240,6 @@ checkentry(const char *tablename, static void destroy(const struct xt_match *match, void *matchinfo) { -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) - nf_ct_l3proto_module_put(match->family); -#endif } static struct xt_match conntrack_match = { Index: linux-2.6.19-rc3-git4.quilt/net/netfilter/xt_helper.c =================================================================== --- linux-2.6.19-rc3-git4.quilt.orig/net/netfilter/xt_helper.c 2006-11-02 19:14:56.000000000 +0100 +++ linux-2.6.19-rc3-git4.quilt/net/netfilter/xt_helper.c 2006-11-02 19:15:47.000000000 +0100 @@ -157,9 +157,6 @@ static int check(const char *tablename, static void destroy(const struct xt_match *match, void *matchinfo) { -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) - nf_ct_l3proto_module_put(match->family); -#endif } static struct xt_match xt_helper_match[] = { Index: linux-2.6.19-rc3-git4.quilt/net/netfilter/xt_state.c =================================================================== --- linux-2.6.19-rc3-git4.quilt.orig/net/netfilter/xt_state.c 2006-10-29 22:58:55.000000000 +0100 +++ linux-2.6.19-rc3-git4.quilt/net/netfilter/xt_state.c 2006-11-02 19:15:47.000000000 +0100 @@ -63,9 +63,6 @@ static int check(const char *tablename, static void destroy(const struct xt_match *match, void *matchinfo) { -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) - nf_ct_l3proto_module_put(match->family); -#endif } static struct xt_match xt_state_match[] = { Index: linux-2.6.19-rc3-git4.quilt/include/net/netfilter/nf_conntrack_core.h =================================================================== --- linux-2.6.19-rc3-git4.quilt.orig/include/net/netfilter/nf_conntrack_core.h 2006-11-02 19:14:56.000000000 +0100 +++ linux-2.6.19-rc3-git4.quilt/include/net/netfilter/nf_conntrack_core.h 2006-11-02 19:15:47.000000000 +0100 @@ -83,4 +83,6 @@ extern struct list_head nf_conntrack_exp extern spinlock_t nf_conntrack_lock ; extern struct list_head unconfirmed; +extern struct list_head nf_deleted; + #endif /* _NF_CONNTRACK_CORE_H */
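
For reviewers more interested in the locking scheme than in the conntrack details: the patch is an instance of the usual RCU publish/lookup pattern. Lookups run under rcu_read_lock() and return either the currently published handler or the generic fallback, registration publishes the handler with rcu_assign_pointer() under nf_conntrack_lock, and unregistration points the slot back at the generic handler and then waits (synchronize_net() plus draining the nf_deleted list) until no reader can still be using the old pointer. The fragment below is a minimal userspace sketch of that pattern on top of liburcu, assuming liburcu is installed (build with gcc -o rcu_sketch rcu_sketch.c -lurcu). The proto_ops type, the proto_tab[] array and the proto_* helpers are made up for the example; only the rcu_read_lock()/rcu_dereference()/rcu_assign_pointer()/synchronize_rcu() calls correspond to what the kernel code above does.

#include <stdio.h>
#include <urcu.h>               /* liburcu: rcu_read_lock(), rcu_dereference(), ... */

#define PROTO_MAX 256

struct proto_ops {
        const char *name;
};

/* Fallback handler, analogous to nf_conntrack_l4proto_generic. */
static struct proto_ops generic_proto = { .name = "generic" };

/* RCU-protected table of handler pointers, analogous to nf_ct_protos[]. */
static struct proto_ops *proto_tab[PROTO_MAX];

/* Reader side: caller must be inside rcu_read_lock()/rcu_read_unlock(). */
static struct proto_ops *proto_lookup(unsigned int num)
{
        struct proto_ops *p;

        if (num >= PROTO_MAX)
                return &generic_proto;
        p = rcu_dereference(proto_tab[num]);
        return p ? p : &generic_proto;
}

/* Writer side: publish a handler.  In the patch this runs under
 * nf_conntrack_lock; the lock is omitted here for brevity. */
static void proto_register(unsigned int num, struct proto_ops *ops)
{
        rcu_assign_pointer(proto_tab[num], ops);
}

/* Writer side: unpublish and wait for all current readers.  After
 * synchronize_rcu() returns, the old handler (or the module owning it)
 * can safely go away -- this is what replaces the refcounting. */
static void proto_unregister(unsigned int num)
{
        rcu_assign_pointer(proto_tab[num], NULL);
        synchronize_rcu();
}

int main(void)
{
        struct proto_ops tcp_like = { .name = "tcp-like" };

        rcu_register_thread();          /* every reader thread registers itself */

        proto_register(6, &tcp_like);

        rcu_read_lock();                /* no grace period completes while held */
        printf("lookup(6) -> %s\n", proto_lookup(6)->name);
        rcu_read_unlock();

        proto_unregister(6);            /* waits out readers of &tcp_like */

        rcu_read_lock();
        printf("lookup(6) -> %s\n", proto_lookup(6)->name);   /* "generic" */
        rcu_read_unlock();

        rcu_unregister_thread();
        return 0;
}

Caching ct->l3proto/ct->l4proto at allocation time serves the same goal per conntrack entry: an entry never looks its handlers up again, so protocol unregistration only has to wait out the rcu_read_lock() sections wrapped around the hooks and the entries parked on nf_deleted, instead of taking a module reference on every lookup.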