Name: Speed up locking for connection tracking
Author: Rusty Russell
Status: Experimental
Depends: Netfilter/conntrack_helper_lock.patch.gz

D: This introduces more fine-grained locking for connection tracking,
D: using one lock per hash chain, rather than one lock for the whole
D: table.  Thanks to Martin Josefsson for bug-spotting.

diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .22339-2.4.19-conntrack-speed-locking.pre/include/linux/netfilter_ipv4/ip_conntrack_core.h .22339-2.4.19-conntrack-speed-locking/include/linux/netfilter_ipv4/ip_conntrack_core.h
--- .22339-2.4.19-conntrack-speed-locking.pre/include/linux/netfilter_ipv4/ip_conntrack_core.h	2002-08-22 16:21:02.000000000 +1000
+++ .22339-2.4.19-conntrack-speed-locking/include/linux/netfilter_ipv4/ip_conntrack_core.h	2002-08-22 16:21:04.000000000 +1000
@@ -45,8 +45,15 @@ static inline int ip_conntrack_confirm(s
 	return NF_ACCEPT;
 }
 
-extern struct list_head *ip_conntrack_hash;
+struct ip_conntrack_hash
+{
+	spinlock_t lock;
+	struct list_head list;
+};
+
+extern struct ip_conntrack_hash *ip_conntrack_hash;
 
 extern struct list_head expect_list;
-DECLARE_RWLOCK_EXTERN(ip_conntrack_lock);
+/* This nests inside the specific hash lock, if required */
+extern spinlock_t expected_lock;
 
 #endif /* _IP_CONNTRACK_CORE_H */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .22339-2.4.19-conntrack-speed-locking.pre/net/ipv4/netfilter/ip_conntrack_core.c .22339-2.4.19-conntrack-speed-locking/net/ipv4/netfilter/ip_conntrack_core.c
--- .22339-2.4.19-conntrack-speed-locking.pre/net/ipv4/netfilter/ip_conntrack_core.c	2002-08-22 16:21:03.000000000 +1000
+++ .22339-2.4.19-conntrack-speed-locking/net/ipv4/netfilter/ip_conntrack_core.c	2002-08-22 16:21:04.000000000 +1000
@@ -26,16 +26,10 @@
 /* For ERR_PTR().  Yeah, I know... --RR */
 #include <linux/err.h>
 
-/* This rwlock protects the main hash table, expected
-   registrations, conntrack timers*/
-#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
-#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
-
 #include <linux/netfilter_ipv4/ip_conntrack.h>
 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
-#include <linux/netfilter_ipv4/listhelp.h>
 
 #if 0
 #define DEBUGP printk
@@ -43,10 +37,10 @@
 #define DEBUGP(format, args...)
 #endif
 
-DECLARE_RWLOCK(ip_conntrack_lock);
-
 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
 LIST_HEAD(expect_list);
+/* This nests inside the specific hash lock, if required */
+spinlock_t expected_lock = SPIN_LOCK_UNLOCKED;
 
 /* These two are protected by BR_NETPROTO_LOCK */
 LIST_HEAD(protocol_list);
@@ -55,7 +49,7 @@ static LIST_HEAD(helpers);
 unsigned int ip_conntrack_htable_size = 0;
 static int ip_conntrack_max = 0;
 static atomic_t ip_conntrack_count = ATOMIC_INIT(0);
-struct list_head *ip_conntrack_hash;
+struct ip_conntrack_hash *ip_conntrack_hash;
 static kmem_cache_t *ip_conntrack_cachep;
 
 extern struct ip_conntrack_protocol ip_conntrack_generic_protocol;
@@ -136,24 +130,57 @@ invert_tuple(struct ip_conntrack_tuple *
 	return protocol->invert_tuple(inverse, orig);
 }
 
+/* Double lock/unlock */
+#define DOUBLE_OP(tuple_a, tuple_b, op)				\
+	do {							\
+		u32 hash_a, hash_b;				\
+								\
+		hash_a = hash_conntrack(tuple_a);		\
+		hash_b = hash_conntrack(tuple_b);		\
+		if (hash_a < hash_b) {				\
+			op(&ip_conntrack_hash[hash_a].lock);	\
+			op(&ip_conntrack_hash[hash_b].lock);	\
+		} else if (hash_a > hash_b) {			\
+			op(&ip_conntrack_hash[hash_b].lock);	\
+			op(&ip_conntrack_hash[hash_a].lock);	\
+		} else						\
+			op(&ip_conntrack_hash[hash_a].lock);	\
+	} while(0)
+
+/* Double lock for both hashes of conntrack */
+static inline void
+lock_conntrack(struct ip_conntrack *ct)
+{
+	DOUBLE_OP(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+		  &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+		  spin_lock_bh);
+}
+
+/* Double unlock */
+static void
+unlock_conntrack(struct ip_conntrack *ct)
+{
+	DOUBLE_OP(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+		  &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+		  spin_unlock_bh);
+}
+
+/* Must hold both conntrack locks and expected lock */
 static void
 clean_from_lists(struct ip_conntrack *ct)
 {
-	MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
 	/* Remove from both hash lists: must not NULL out next ptrs,
-	   otherwise we'll look unconfirmed.  Fortunately, LIST_DELETE
+	   otherwise we'll look unconfirmed.  Fortunately, list_del
 	   doesn't do this. --RR */
-	LIST_DELETE(&ip_conntrack_hash
-		    [hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)],
-		    &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
-	LIST_DELETE(&ip_conntrack_hash
-		    [hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple)],
-		    &ct->tuplehash[IP_CT_DIR_REPLY]);
+	list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+	list_del(&ct->tuplehash[IP_CT_DIR_REPLY].list);
 
 	/* If our expected is in the list, take it out. */
 	if (ct->expected.expectant) {
+		spin_lock_bh(&expected_lock);
 		IP_NF_ASSERT(list_inlist(&expect_list, &ct->expected));
 		IP_NF_ASSERT(ct->expected.expectant == ct);
-		LIST_DELETE(&expect_list, &ct->expected);
+		list_del(&ct->expected.list);
+		spin_unlock_bh(&expected_lock);
 	}
 }
@@ -169,9 +196,6 @@ destroy_conntrack(struct nf_conntrack *n
 	if (ct->master.master)
 		nf_conntrack_put(&ct->master);
 
-	/* To make sure we don't get any weird locking issues here:
-	 * destroy_conntrack() MUST NOT be called with a write lock
-	 * to ip_conntrack_lock!!! -HW */
 	br_read_lock(BR_NETPROTO_LOCK);
 	proto = find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
 	br_read_unlock(BR_NETPROTO_LOCK);
@@ -188,34 +212,25 @@ static void death_by_timeout(unsigned lo
 {
 	struct ip_conntrack *ct = (void *)ul_conntrack;
 
-	WRITE_LOCK(&ip_conntrack_lock);
+	lock_conntrack(ct);
 	clean_from_lists(ct);
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	unlock_conntrack(ct);
 	ip_conntrack_put(ct);
 }
 
-static inline int
-conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
-		    const struct ip_conntrack_tuple *tuple,
-		    const struct ip_conntrack *ignored_conntrack)
-{
-	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
-	return i->ctrack != ignored_conntrack
-		&& ip_ct_tuple_equal(tuple, &i->tuple);
-}
-
 static struct ip_conntrack_tuple_hash *
-__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
+__ip_conntrack_find(unsigned int hash,
+		    const struct ip_conntrack_tuple *tuple,
 		    const struct ip_conntrack *ignored_conntrack)
 {
 	struct ip_conntrack_tuple_hash *h;
 
-	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
-	h = LIST_FIND(&ip_conntrack_hash[hash_conntrack(tuple)],
-		      conntrack_tuple_cmp,
-		      struct ip_conntrack_tuple_hash *,
-		      tuple, ignored_conntrack);
-	return h;
+	list_for_each_entry(h, &ip_conntrack_hash[hash].list, list) {
+		if (h->ctrack != ignored_conntrack
+		    && ip_ct_tuple_equal(tuple, &h->tuple))
+			return h;
+	}
+	return NULL;
 }
 
 /* Find a connection corresponding to a tuple. */
@@ -224,16 +239,18 @@ ip_conntrack_find_get(const struct ip_co
 		      const struct ip_conntrack *ignored_conntrack)
 {
 	struct ip_conntrack_tuple_hash *h;
+	unsigned int hash = hash_conntrack(tuple);
 
-	READ_LOCK(&ip_conntrack_lock);
-	h = __ip_conntrack_find(tuple, ignored_conntrack);
+	spin_lock_bh(&ip_conntrack_hash[hash].lock);
+	h = __ip_conntrack_find(hash, tuple, ignored_conntrack);
 	if (h)
 		atomic_inc(&h->ctrack->ct_general.use);
-	READ_UNLOCK(&ip_conntrack_lock);
+	spin_unlock_bh(&ip_conntrack_hash[hash].lock);
 
 	return h;
 }
 
+/* Get conntrack from skb's nfct: does *not* require any locks */
 static inline struct ip_conntrack *
 __ip_conntrack_get(struct nf_ct_info *nfct, enum ip_conntrack_info *ctinfo)
 {
@@ -262,6 +279,7 @@ __ip_conntrack_confirm(struct nf_ct_info
 	unsigned int hash, repl_hash;
 	struct ip_conntrack *ct;
 	enum ip_conntrack_info ctinfo;
+	struct ip_conntrack_tuple_hash *h;
 
 	ct = __ip_conntrack_get(nfct, &ctinfo);
@@ -285,47 +303,48 @@ __ip_conntrack_confirm(struct nf_ct_info
 	IP_NF_ASSERT(!is_confirmed(ct));
 	DEBUGP("Confirming conntrack %p\n", ct);
 
-	WRITE_LOCK(&ip_conntrack_lock);
+	lock_conntrack(ct);
 	/* See if there's one in the list already, including reverse:
 	   NAT could have grabbed it without realizing, since we're
 	   not in the hash.  If there is, we lost race. */
-	if (!LIST_FIND(&ip_conntrack_hash[hash],
-		       conntrack_tuple_cmp,
-		       struct ip_conntrack_tuple_hash *,
-		       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
-	    && !LIST_FIND(&ip_conntrack_hash[repl_hash],
-			  conntrack_tuple_cmp,
-			  struct ip_conntrack_tuple_hash *,
-			  &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
-		list_prepend(&ip_conntrack_hash[hash],
-			     &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
-		list_prepend(&ip_conntrack_hash[repl_hash],
-			     &ct->tuplehash[IP_CT_DIR_REPLY]);
-		/* Timer relative to confirmation time, not original
-		   setting time, otherwise we'd get timer wrap in
-		   wierd delay cases. */
-		ct->timeout.expires += jiffies;
-		add_timer(&ct->timeout);
-		atomic_inc(&ct->ct_general.use);
-		WRITE_UNLOCK(&ip_conntrack_lock);
-		return NF_ACCEPT;
-	}
+	list_for_each_entry(h, &ip_conntrack_hash[hash].list, list)
+		if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+				      &h->tuple))
+			goto already_taken;
+	list_for_each_entry(h, &ip_conntrack_hash[repl_hash].list, list)
+		if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
+				      &h->tuple))
+			goto already_taken;
 
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	list_add(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list,
+		 &ip_conntrack_hash[hash].list);
+	list_add(&ct->tuplehash[IP_CT_DIR_REPLY].list,
+		 &ip_conntrack_hash[repl_hash].list);
+	/* Timer relative to confirmation time, not original setting
+	   time, otherwise we'd get timer wrap in weird delay
+	   cases. */
+	ct->timeout.expires += jiffies;
+	add_timer(&ct->timeout);
+	atomic_inc(&ct->ct_general.use);
+	unlock_conntrack(ct);
+	return NF_ACCEPT;
+
+ already_taken:
+	unlock_conntrack(ct);
 	return NF_DROP;
 }
 
-/* Returns true if a connection correspondings to the tuple (required
-   for NAT). */
+/* Returns true if a connection corresponds to the tuple (required for
+   NAT). */
 int
 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
 			 const struct ip_conntrack *ignored_conntrack)
 {
 	struct ip_conntrack_tuple_hash *h;
+	unsigned int hash = hash_conntrack(tuple);
 
-	READ_LOCK(&ip_conntrack_lock);
-	h = __ip_conntrack_find(tuple, ignored_conntrack);
-	READ_UNLOCK(&ip_conntrack_lock);
+	spin_lock_bh(&ip_conntrack_hash[hash].lock);
+	h = __ip_conntrack_find(hash, tuple, ignored_conntrack);
+	spin_unlock_bh(&ip_conntrack_hash[hash].lock);
 
 	return h != NULL;
 }
@@ -424,39 +443,30 @@ icmp_error_track(struct sk_buff *skb,
 
 /* There's a small race here where we may free a just-assured
    connection.  Too bad: we're in trouble anyway. */
-static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
-{
-	return !(i->ctrack->status & IPS_ASSURED);
-}
-
-static int early_drop(struct list_head *chain)
+static int early_drop(unsigned int hash)
 {
-	/* Traverse backwards: gives us oldest, which is roughly LRU */
+	struct list_head *i;
 	struct ip_conntrack_tuple_hash *h;
 	int dropped = 0;
 
-	READ_LOCK(&ip_conntrack_lock);
-	h = LIST_FIND(chain, unreplied, struct ip_conntrack_tuple_hash *);
-	if (h)
-		atomic_inc(&h->ctrack->ct_general.use);
-	READ_UNLOCK(&ip_conntrack_lock);
-
-	if (!h)
-		return dropped;
-
-	if (del_timer(&h->ctrack->timeout)) {
-		death_by_timeout((unsigned long)h->ctrack);
-		dropped = 1;
+	spin_lock_bh(&ip_conntrack_hash[hash].lock);
+	/* Traverse backwards: gives us oldest, which is roughly LRU */
+	list_for_each_prev(i, &ip_conntrack_hash[hash].list) {
+		h = list_entry(i, struct ip_conntrack_tuple_hash, list);
+		if (!(h->ctrack->status & IPS_ASSURED)) {
+			/* Grab it, unlock, try to kill. */
+			atomic_inc(&h->ctrack->ct_general.use);
+			spin_unlock_bh(&ip_conntrack_hash[hash].lock);
+			if (del_timer(&h->ctrack->timeout)) {
+				death_by_timeout((unsigned long)h->ctrack);
+				dropped = 1;
+			}
+			ip_conntrack_put(h->ctrack);
+			return dropped;
+		}
 	}
-	ip_conntrack_put(h->ctrack);
-	return dropped;
-}
-
-/* Compare parts depending on mask. */
-static inline int expect_cmp(const struct ip_conntrack_expect *i,
-			     const struct ip_conntrack_tuple *tuple)
-{
-	return ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask);
+	spin_unlock_bh(&ip_conntrack_hash[hash].lock);
+	return 0;
 }
 
 /* Allocate a new conntrack: we return -ENOMEM if classification
@@ -467,10 +477,10 @@ init_conntrack(const struct ip_conntrack
 	       struct sk_buff *skb)
 {
 	struct ip_conntrack *conntrack;
-	struct ip_conntrack_helper *helper;
 	struct ip_conntrack_tuple repl_tuple;
 	size_t hash, repl_hash;
 	struct ip_conntrack_expect *expected;
+	struct ip_conntrack_helper *helper;
 	int i;
 	static unsigned int drop_next = 0;
@@ -483,8 +493,7 @@ init_conntrack(const struct ip_conntrack
 		   bomb one hash chain). */
 		unsigned int next = (drop_next++)%ip_conntrack_htable_size;
 
-		if (!early_drop(&ip_conntrack_hash[next])
-		    && !early_drop(&ip_conntrack_hash[hash])) {
+		if (!early_drop(next) && !early_drop(hash)) {
 			if (net_ratelimit())
 				printk(KERN_WARNING
 				       "ip_conntrack: table full, dropping"
@@ -524,7 +533,7 @@ init_conntrack(const struct ip_conntrack
 	conntrack->timeout.data = (unsigned long)conntrack;
 	conntrack->timeout.function = death_by_timeout;
 
-	/* Mark clearly that it's not in the hash table. */
+	/* Mark clearly that it's not in the hash table (ie. unconfirmed) */
 	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list.next = NULL;
 
 	/* We hold BR_NETPROTO_LOCK */
@@ -537,30 +546,30 @@ init_conntrack(const struct ip_conntrack
 		}
 	}
 
-	/* Write lock required for deletion of expected.  Without
-	   this, a read-lock would do. */
-	WRITE_LOCK(&ip_conntrack_lock);
-	/* Need finding and deleting of expected ONLY if we win race */
-	expected = LIST_FIND(&expect_list, expect_cmp,
-			     struct ip_conntrack_expect *, tuple);
-	/* If master is not in hash table yet (ie. packet hasn't left
-	   this machine yet), how can other end know about expected?
-	   Hence these are not the droids you are looking for (if
-	   master ct never got confirmed, we'd hold a reference to it
-	   and weird things would happen to future packets). */
-	if (expected && is_confirmed(expected->expectant)) {
-		/* Welcome, Mr. Bond.  We've been expecting you... */
-		conntrack->status = IPS_EXPECTED;
-		conntrack->master.master = &expected->expectant->ct_general;
-		IP_NF_ASSERT(conntrack->master.master);
-		LIST_DELETE(&expect_list, expected);
-		expected->expectant = NULL;
-		nf_conntrack_get(&conntrack->master);
+	spin_lock_bh(&expected_lock);
+	list_for_each_entry(expected, &expect_list, list) {
+		if (ip_ct_tuple_mask_cmp(tuple, &expected->tuple,
+					 &expected->mask)) {
+			/* If master is not in hash table yet
+			   (ie. packet hasn't left this machine yet),
+			   how can other end know about expected? */
+			if (is_confirmed(expected->expectant)) {
+				conntrack->status = IPS_EXPECTED;
+				conntrack->master.master
+					= &expected->expectant->ct_general;
+				IP_NF_ASSERT(conntrack->master.master);
+				list_del(&expected->list);
+				expected->expectant = NULL;
+				nf_conntrack_get(&conntrack->master);
+			}
+			break;
+		}
 	}
-	atomic_inc(&ip_conntrack_count);
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	spin_unlock_bh(&expected_lock);
 
-	if (expected && expected->expectfn)
+	/* We're done.  Call expectfn outside lock. */
+	atomic_inc(&ip_conntrack_count);
+	if (&expected->list != &expect_list && expected->expectfn)
 		expected->expectfn(conntrack);
 	return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
 }
@@ -709,7 +718,6 @@ int invert_tuplepr(struct ip_conntrack_t
 
 static void unexpect_related(struct ip_conntrack *related_to)
 {
-	MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
 	list_del(&related_to->expected.list);
 	related_to->expected.expectant = NULL;
 }
@@ -736,7 +744,9 @@ int ip_conntrack_expect_related(struct i
 				const struct ip_conntrack_tuple *mask,
 				int (*expectfn)(struct ip_conntrack *))
 {
-	WRITE_LOCK(&ip_conntrack_lock);
+	struct ip_conntrack_expect *i;
+
+	spin_lock_bh(&expected_lock);
 	if (related_to->expected.expectant)
 		unexpect_related(related_to);
@@ -744,26 +754,26 @@ int ip_conntrack_expect_related(struct i
 	related_to->expected.mask = *mask;
 	related_to->expected.expectfn = expectfn;
 
-	if (LIST_FIND(&expect_list, expect_clash,
-		      struct ip_conntrack_expect *, &related_to->expected)) {
-		WRITE_UNLOCK(&ip_conntrack_lock);
-		return -EBUSY;
+	list_for_each_entry(i, &expect_list, list) {
+		if (expect_clash(i, &related_to->expected)) {
+			spin_unlock_bh(&expected_lock);
+			return -EBUSY;
+		}
 	}
 
 	list_prepend(&expect_list, &related_to->expected);
 	related_to->expected.expectant = related_to;
-	WRITE_UNLOCK(&ip_conntrack_lock);
-
+	spin_unlock_bh(&expected_lock);
 	return 0;
 }
 
 void ip_conntrack_unexpect_related(struct ip_conntrack *related_to)
 {
-	WRITE_LOCK(&ip_conntrack_lock);
+	spin_lock_bh(&expected_lock);
 	unexpect_related(related_to);
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	spin_unlock_bh(&expected_lock);
 }
- 
+
 /* Alter reply tuple (maybe alter helper).  This is for NAT, and is
    implicitly racy: see __ip_conntrack_confirm */
 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
@@ -771,7 +781,7 @@ void ip_conntrack_alter_reply(struct ip_
 {
 	struct ip_conntrack_helper *helper;
 
-	/* Should be unconfirmed, so not in hash table yet */
+	/* Must be unconfirmed (ie. not in hash table yet) */
 	IP_NF_ASSERT(!is_confirmed(conntrack));
 
 	DEBUGP("Altering reply tuple of %p to ", conntrack);
@@ -842,7 +852,7 @@ void ip_ct_refresh(struct ip_conntrack *
 {
 	IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
 
-	WRITE_LOCK(&ip_conntrack_lock);
+	lock_conntrack(ct);
 	/* If not in hash table, timer will not be active yet */
 	if (!is_confirmed(ct))
 		ct->timeout.expires = extra_jiffies;
@@ -853,7 +863,7 @@ void ip_ct_refresh(struct ip_conntrack *
 			add_timer(&ct->timeout);
 		}
 	}
-	WRITE_UNLOCK(&ip_conntrack_lock);
+	unlock_conntrack(ct);
 }
 
 /* Returns new sk_buff, or NULL */
@@ -926,22 +936,25 @@ do_kill(const struct ip_conntrack_tuple_
 
 /* Bring out ya dead! */
 static struct ip_conntrack_tuple_hash *
-get_next_corpse(int (*kill)(const struct ip_conntrack *i, void *data),
+get_next_corpse(int (*should_kill)(const struct ip_conntrack *i, void *data),
 		void *data)
 {
-	struct ip_conntrack_tuple_hash *h = NULL;
 	unsigned int i;
 
-	READ_LOCK(&ip_conntrack_lock);
-	for (i = 0; !h && i < ip_conntrack_htable_size; i++) {
-		h = LIST_FIND(&ip_conntrack_hash[i], do_kill,
-			      struct ip_conntrack_tuple_hash *, kill, data);
-	}
-	if (h)
-		atomic_inc(&h->ctrack->ct_general.use);
-	READ_UNLOCK(&ip_conntrack_lock);
+	for (i = 0; i < ip_conntrack_htable_size; i++) {
+		struct ip_conntrack_tuple_hash *h;
 
-	return h;
+		spin_lock_bh(&ip_conntrack_hash[i].lock);
+		list_for_each_entry(h, &ip_conntrack_hash[i].list, list) {
+			if (should_kill(h->ctrack, data)) {
+				atomic_inc(&h->ctrack->ct_general.use);
+				spin_unlock_bh(&ip_conntrack_hash[i].lock);
+				return h;
+			}
+		}
+		spin_unlock_bh(&ip_conntrack_hash[i].lock);
+	}
+	return NULL;
 }
 
 void
@@ -1099,7 +1112,7 @@ int __init ip_conntrack_init(void)
 	if (ret != 0)
 		return ret;
 
-	ip_conntrack_hash = vmalloc(sizeof(struct list_head)
+	ip_conntrack_hash = vmalloc(sizeof(struct ip_conntrack_hash)
 				    * ip_conntrack_htable_size);
 	if (!ip_conntrack_hash) {
 		nf_unregister_sockopt(&so_getorigdst);
@@ -1115,7 +1128,7 @@ int __init ip_conntrack_init(void)
 		nf_unregister_sockopt(&so_getorigdst);
 		return -ENOMEM;
 	}
- 
+
 	/* Don't NEED lock here, but good form anyway. */
 	br_write_lock_bh(BR_NETPROTO_LOCK);
 	/* Sew in builtin protocols. */
@@ -1124,8 +1137,10 @@ int __init ip_conntrack_init(void)
 	list_add(&ip_conntrack_protocol_icmp.list, &protocol_list);
 	br_write_unlock_bh(BR_NETPROTO_LOCK);
 
-	for (i = 0; i < ip_conntrack_htable_size; i++)
-		INIT_LIST_HEAD(&ip_conntrack_hash[i]);
+	for (i = 0; i < ip_conntrack_htable_size; i++) {
+		spin_lock_init(&ip_conntrack_hash[i].lock);
+		INIT_LIST_HEAD(&ip_conntrack_hash[i].list);
+	}
 
 	/* This is fucking braindead.  There is NO WAY of doing this
 	   without the CONFIG_SYSCTL unless you don't want to detect errors.
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .22339-2.4.19-conntrack-speed-locking.pre/net/ipv4/netfilter/ip_conntrack_standalone.c .22339-2.4.19-conntrack-speed-locking/net/ipv4/netfilter/ip_conntrack_standalone.c
--- .22339-2.4.19-conntrack-speed-locking.pre/net/ipv4/netfilter/ip_conntrack_standalone.c	2002-08-22 16:21:02.000000000 +1000
+++ .22339-2.4.19-conntrack-speed-locking/net/ipv4/netfilter/ip_conntrack_standalone.c	2002-08-22 16:21:04.000000000 +1000
@@ -18,14 +18,10 @@
 #include <asm/uaccess.h>
 #include <net/checksum.h>
 
-#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
-#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
-
 #include <linux/netfilter_ipv4/ip_conntrack.h>
 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
-#include <linux/netfilter_ipv4/listhelp.h>
 
 #if 0
 #define DEBUGP printk
@@ -112,8 +108,6 @@ conntrack_iterate(const struct ip_conntr
 	unsigned int newlen;
 
 	IP_NF_ASSERT(hash->ctrack);
-	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
-
 	/* Only count originals */
 	if (DIRECTION(hash))
 		return 0;
@@ -135,36 +129,41 @@ list_conntracks(char *buffer, char **sta
 	unsigned int i;
 	unsigned int len = 0;
 	off_t upto = 0;
-	struct list_head *e;
+	struct ip_conntrack_expect *e;
 
 	br_read_lock(BR_NETPROTO_LOCK);
-	READ_LOCK(&ip_conntrack_lock);
+	/* Traverse hash; print originals then reply. */
 	for (i = 0; i < ip_conntrack_htable_size; i++) {
-		if (LIST_FIND(&ip_conntrack_hash[i], conntrack_iterate,
-			      struct ip_conntrack_tuple_hash *,
-			      buffer, offset, &upto, &len, length))
-			goto finished;
+		struct ip_conntrack_tuple_hash *h;
+		spin_lock_bh(&ip_conntrack_hash[i].lock);
+		list_for_each_entry(h, &ip_conntrack_hash[i].list, list) {
+			if (conntrack_iterate(h, buffer, offset, &upto, &len,
+					      length)) {
+				spin_unlock_bh(&ip_conntrack_hash[i].lock);
+				goto finished;
+			}
+		}
+		spin_unlock_bh(&ip_conntrack_hash[i].lock);
 	}
 
 	/* Now iterate through expecteds. */
-	for (e = expect_list.next; e != &expect_list; e = e->next) {
+	spin_lock_bh(&expected_lock);
+	list_for_each_entry(e, &expect_list, list) {
 		unsigned int last_len;
-		struct ip_conntrack_expect *expect
-			= (struct ip_conntrack_expect *)e;
 
 		if (upto++ < offset)
 			continue;
 
 		last_len = len;
-		len += print_expect(buffer + len, expect);
+		len += print_expect(buffer + len, e);
 		if (len > length) {
 			len = last_len;
-			goto finished;
+			break;
		}
 	}
+	spin_unlock_bh(&expected_lock);
 
  finished:
 	br_read_unlock(BR_NETPROTO_LOCK);
-	READ_UNLOCK(&ip_conntrack_lock);
 
 	/* `start' hack - see fs/proc/generic.c line ~165 */
 	*start = (char *)((unsigned int)upto - offset);
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .22339-2.4.19-conntrack-speed-locking.pre/net/ipv4/netfilter/ip_fw_compat_masq.c .22339-2.4.19-conntrack-speed-locking/net/ipv4/netfilter/ip_fw_compat_masq.c
--- .22339-2.4.19-conntrack-speed-locking.pre/net/ipv4/netfilter/ip_fw_compat_masq.c	2002-08-22 16:21:02.000000000 +1000
+++ .22339-2.4.19-conntrack-speed-locking/net/ipv4/netfilter/ip_fw_compat_masq.c	2002-08-22 16:21:04.000000000 +1000
@@ -15,16 +15,13 @@
 #include <linux/netdevice.h>
 #include <linux/inetdevice.h>
 #include <net/checksum.h>
+#include <linux/brlock.h>
 #include <linux/netfilter_ipv4.h>
 
-#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
-#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
-
 #include <linux/netfilter_ipv4/ip_conntrack.h>
 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
-#include <linux/netfilter_ipv4/listhelp.h>
 
 #if 0
 #define DEBUGP printk
@@ -286,15 +283,22 @@ masq_procinfo(char *buffer, char **start
 		offset = 1;
 	}
 
-	READ_LOCK(&ip_conntrack_lock);
+	br_read_lock(BR_NETPROTO_LOCK);
+	/* Traverse hash; print originals then reply. */
 	for (i = 0; i < ip_conntrack_htable_size; i++) {
-		if (LIST_FIND(&ip_conntrack_hash[i], masq_iterate,
-			      struct ip_conntrack_tuple_hash *,
-			      buffer, offset, &upto, &len, length))
-			break;
+		struct ip_conntrack_tuple_hash *h;
+		spin_lock_bh(&ip_conntrack_hash[i].lock);
+		list_for_each_entry(h, &ip_conntrack_hash[i].list, list) {
+			if (masq_iterate(h, buffer, offset, &upto, &len,
+					 length)) {
+				spin_unlock_bh(&ip_conntrack_hash[i].lock);
+				goto finished;
+			}
+		}
+		spin_unlock_bh(&ip_conntrack_hash[i].lock);
 	}
-	READ_UNLOCK(&ip_conntrack_lock);
+ finished:
+	br_read_unlock(BR_NETPROTO_LOCK);
 
 	/* `start' hack - see fs/proc/generic.c line ~165 */
 	*start = (char *)((unsigned int)upto - offset);
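
For readers unfamiliar with the trick: DOUBLE_OP compares the two chain
indices before locking so that every path needing both chain locks at once
(__ip_conntrack_confirm, death_by_timeout via clean_from_lists, and
ip_ct_refresh) acquires them in ascending hash order, and takes only a
single lock when both tuples fall into the same chain.  Two CPUs can
therefore never hold the pair in opposite order, which is what makes
lock_conntrack()/unlock_conntrack() deadlock-free.  Below is the same
discipline in isolation as a minimal userspace sketch; pthread mutexes
stand in for the per-chain spinlocks, and the names (HTABLE_SIZE,
htable_lock, lock_pair, unlock_pair) are illustrative only, not part of
the patch:

#include <pthread.h>
#include <stdio.h>

#define HTABLE_SIZE 8

static pthread_mutex_t htable_lock[HTABLE_SIZE];

/* Take both chain locks in ascending index order; take only one if
   the two hashes collide.  Same discipline as DOUBLE_OP above. */
static void lock_pair(unsigned int a, unsigned int b)
{
	if (a < b) {
		pthread_mutex_lock(&htable_lock[a]);
		pthread_mutex_lock(&htable_lock[b]);
	} else if (a > b) {
		pthread_mutex_lock(&htable_lock[b]);
		pthread_mutex_lock(&htable_lock[a]);
	} else
		pthread_mutex_lock(&htable_lock[a]);
}

static void unlock_pair(unsigned int a, unsigned int b)
{
	/* Unlock order is irrelevant to deadlock avoidance. */
	pthread_mutex_unlock(&htable_lock[a]);
	if (a != b)
		pthread_mutex_unlock(&htable_lock[b]);
}

int main(void)
{
	unsigned int i;

	for (i = 0; i < HTABLE_SIZE; i++)
		pthread_mutex_init(&htable_lock[i], NULL);

	/* e.g. the ORIGINAL tuple hashes to chain 5, the REPLY tuple
	   to chain 2: lock 2 then 5, regardless of argument order. */
	lock_pair(5, 2);
	printf("both chains locked: safe to unlink both tuplehashes\n");
	unlock_pair(5, 2);
	return 0;
}

Note the nesting rule this leaves intact: expected_lock is only ever taken
either with no chain locks held (init_conntrack, expect_related) or with
both held (clean_from_lists), always innermost, matching the patch's own
comment that it "nests inside the specific hash lock, if required".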