Commit 3ab5aee7 authored by Eric Dumazet, committed by David S. Miller

net: Convert TCP & DCCP hash tables to use RCU / hlist_nulls

RCU was added to UDP lookups, using a fast infrastructure:
- socket kmem_caches use SLAB_DESTROY_BY_RCU and don't pay the
  price of call_rcu() at freeing time.
- hlist_nulls permits the use of fewer memory barriers.

This patch uses same infrastructure for TCP/DCCP established
and timewait sockets.

Thanks to SLAB_DESTROY_BY_RCU, there is no slowdown for applications
using short-lived TCP connections. A follow-up patch, converting
rwlocks to spinlocks, will speed up this case even further.

__inet_lookup_established() is pretty fast now that we don't have to
dirty a contended cache line (read_lock/read_unlock).

Only the established and timewait hash tables are converted to RCU
(the bind table and listen table still use traditional locking).
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 88ab1932
...@@ -41,8 +41,8 @@ ...@@ -41,8 +41,8 @@
* I'll experiment with dynamic table growth later. * I'll experiment with dynamic table growth later.
*/ */
struct inet_ehash_bucket { struct inet_ehash_bucket {
struct hlist_head chain; struct hlist_nulls_head chain;
struct hlist_head twchain; struct hlist_nulls_head twchain;
}; };
/* There are a few simple rules, which allow for local port reuse by /* There are a few simple rules, which allow for local port reuse by
......
...@@ -110,7 +110,7 @@ struct inet_timewait_sock { ...@@ -110,7 +110,7 @@ struct inet_timewait_sock {
#define tw_state __tw_common.skc_state #define tw_state __tw_common.skc_state
#define tw_reuse __tw_common.skc_reuse #define tw_reuse __tw_common.skc_reuse
#define tw_bound_dev_if __tw_common.skc_bound_dev_if #define tw_bound_dev_if __tw_common.skc_bound_dev_if
#define tw_node __tw_common.skc_node #define tw_node __tw_common.skc_nulls_node
#define tw_bind_node __tw_common.skc_bind_node #define tw_bind_node __tw_common.skc_bind_node
#define tw_refcnt __tw_common.skc_refcnt #define tw_refcnt __tw_common.skc_refcnt
#define tw_hash __tw_common.skc_hash #define tw_hash __tw_common.skc_hash
...@@ -137,10 +137,10 @@ struct inet_timewait_sock { ...@@ -137,10 +137,10 @@ struct inet_timewait_sock {
struct hlist_node tw_death_node; struct hlist_node tw_death_node;
}; };
static inline void inet_twsk_add_node(struct inet_timewait_sock *tw, static inline void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
struct hlist_head *list) struct hlist_nulls_head *list)
{ {
hlist_add_head(&tw->tw_node, list); hlist_nulls_add_head_rcu(&tw->tw_node, list);
} }
static inline void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, static inline void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
...@@ -175,7 +175,7 @@ static inline int inet_twsk_del_dead_node(struct inet_timewait_sock *tw) ...@@ -175,7 +175,7 @@ static inline int inet_twsk_del_dead_node(struct inet_timewait_sock *tw)
} }
#define inet_twsk_for_each(tw, node, head) \ #define inet_twsk_for_each(tw, node, head) \
hlist_for_each_entry(tw, node, head, tw_node) hlist_nulls_for_each_entry(tw, node, head, tw_node)
#define inet_twsk_for_each_inmate(tw, node, jail) \ #define inet_twsk_for_each_inmate(tw, node, jail) \
hlist_for_each_entry(tw, node, jail, tw_death_node) hlist_for_each_entry(tw, node, jail, tw_death_node)
......
...@@ -2082,7 +2082,9 @@ int proto_register(struct proto *prot, int alloc_slab) ...@@ -2082,7 +2082,9 @@ int proto_register(struct proto *prot, int alloc_slab)
prot->twsk_prot->twsk_slab = prot->twsk_prot->twsk_slab =
kmem_cache_create(timewait_sock_slab_name, kmem_cache_create(timewait_sock_slab_name,
prot->twsk_prot->twsk_obj_size, prot->twsk_prot->twsk_obj_size,
0, SLAB_HWCACHE_ALIGN, 0,
SLAB_HWCACHE_ALIGN |
prot->slab_flags,
NULL); NULL);
if (prot->twsk_prot->twsk_slab == NULL) if (prot->twsk_prot->twsk_slab == NULL)
goto out_free_timewait_sock_slab_name; goto out_free_timewait_sock_slab_name;
......
...@@ -938,6 +938,7 @@ static struct proto dccp_v4_prot = { ...@@ -938,6 +938,7 @@ static struct proto dccp_v4_prot = {
.orphan_count = &dccp_orphan_count, .orphan_count = &dccp_orphan_count,
.max_header = MAX_DCCP_HEADER, .max_header = MAX_DCCP_HEADER,
.obj_size = sizeof(struct dccp_sock), .obj_size = sizeof(struct dccp_sock),
.slab_flags = SLAB_DESTROY_BY_RCU,
.rsk_prot = &dccp_request_sock_ops, .rsk_prot = &dccp_request_sock_ops,
.twsk_prot = &dccp_timewait_sock_ops, .twsk_prot = &dccp_timewait_sock_ops,
.h.hashinfo = &dccp_hashinfo, .h.hashinfo = &dccp_hashinfo,
......
...@@ -1140,6 +1140,7 @@ static struct proto dccp_v6_prot = { ...@@ -1140,6 +1140,7 @@ static struct proto dccp_v6_prot = {
.orphan_count = &dccp_orphan_count, .orphan_count = &dccp_orphan_count,
.max_header = MAX_DCCP_HEADER, .max_header = MAX_DCCP_HEADER,
.obj_size = sizeof(struct dccp6_sock), .obj_size = sizeof(struct dccp6_sock),
.slab_flags = SLAB_DESTROY_BY_RCU,
.rsk_prot = &dccp6_request_sock_ops, .rsk_prot = &dccp6_request_sock_ops,
.twsk_prot = &dccp6_timewait_sock_ops, .twsk_prot = &dccp6_timewait_sock_ops,
.h.hashinfo = &dccp_hashinfo, .h.hashinfo = &dccp_hashinfo,
......
...@@ -1090,8 +1090,8 @@ static int __init dccp_init(void) ...@@ -1090,8 +1090,8 @@ static int __init dccp_init(void)
} }
for (i = 0; i < dccp_hashinfo.ehash_size; i++) { for (i = 0; i < dccp_hashinfo.ehash_size; i++) {
INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain); INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i);
INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].twchain); INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].twchain, i);
} }
if (inet_ehash_locks_alloc(&dccp_hashinfo)) if (inet_ehash_locks_alloc(&dccp_hashinfo))
......
...@@ -778,18 +778,19 @@ skip_listen_ht: ...@@ -778,18 +778,19 @@ skip_listen_ht:
struct inet_ehash_bucket *head = &hashinfo->ehash[i]; struct inet_ehash_bucket *head = &hashinfo->ehash[i];
rwlock_t *lock = inet_ehash_lockp(hashinfo, i); rwlock_t *lock = inet_ehash_lockp(hashinfo, i);
struct sock *sk; struct sock *sk;
struct hlist_node *node; struct hlist_nulls_node *node;
num = 0; num = 0;
if (hlist_empty(&head->chain) && hlist_empty(&head->twchain)) if (hlist_nulls_empty(&head->chain) &&
hlist_nulls_empty(&head->twchain))
continue; continue;
if (i > s_i) if (i > s_i)
s_num = 0; s_num = 0;
read_lock_bh(lock); read_lock_bh(lock);
sk_for_each(sk, node, &head->chain) { sk_nulls_for_each(sk, node, &head->chain) {
struct inet_sock *inet = inet_sk(sk); struct inet_sock *inet = inet_sk(sk);
if (num < s_num) if (num < s_num)
......
...@@ -223,35 +223,65 @@ struct sock * __inet_lookup_established(struct net *net, ...@@ -223,35 +223,65 @@ struct sock * __inet_lookup_established(struct net *net,
INET_ADDR_COOKIE(acookie, saddr, daddr) INET_ADDR_COOKIE(acookie, saddr, daddr)
const __portpair ports = INET_COMBINED_PORTS(sport, hnum); const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
struct sock *sk; struct sock *sk;
const struct hlist_node *node; const struct hlist_nulls_node *node;
/* Optimize here for direct hit, only listening connections can /* Optimize here for direct hit, only listening connections can
* have wildcards anyways. * have wildcards anyways.
*/ */
unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport); unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); unsigned int slot = hash & (hashinfo->ehash_size - 1);
rwlock_t *lock = inet_ehash_lockp(hashinfo, hash); struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
prefetch(head->chain.first); rcu_read_lock();
read_lock(lock); begin:
sk_for_each(sk, node, &head->chain) { sk_nulls_for_each_rcu(sk, node, &head->chain) {
if (INET_MATCH(sk, net, hash, acookie, if (INET_MATCH(sk, net, hash, acookie,
saddr, daddr, ports, dif)) saddr, daddr, ports, dif)) {
goto hit; /* You sunk my battleship! */ if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
goto begintw;
if (unlikely(!INET_MATCH(sk, net, hash, acookie,
saddr, daddr, ports, dif))) {
sock_put(sk);
goto begin;
}
goto out;
}
} }
/*
* if the nulls value we got at the end of this lookup is
* not the expected one, we must restart lookup.
* We probably met an item that was moved to another chain.
*/
if (get_nulls_value(node) != slot)
goto begin;
begintw:
/* Must check for a TIME_WAIT'er before going to listener hash. */ /* Must check for a TIME_WAIT'er before going to listener hash. */
sk_for_each(sk, node, &head->twchain) { sk_nulls_for_each_rcu(sk, node, &head->twchain) {
if (INET_TW_MATCH(sk, net, hash, acookie, if (INET_TW_MATCH(sk, net, hash, acookie,
saddr, daddr, ports, dif)) saddr, daddr, ports, dif)) {
goto hit; if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
sk = NULL;
goto out;
}
if (unlikely(!INET_TW_MATCH(sk, net, hash, acookie,
saddr, daddr, ports, dif))) {
sock_put(sk);
goto begintw;
} }
goto out;
}
}
/*
* if the nulls value we got at the end of this lookup is
* not the expected one, we must restart lookup.
* We probably met an item that was moved to another chain.
*/
if (get_nulls_value(node) != slot)
goto begintw;
sk = NULL; sk = NULL;
out: out:
read_unlock(lock); rcu_read_unlock();
return sk; return sk;
hit:
sock_hold(sk);
goto out;
} }
EXPORT_SYMBOL_GPL(__inet_lookup_established); EXPORT_SYMBOL_GPL(__inet_lookup_established);
...@@ -272,14 +302,14 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, ...@@ -272,14 +302,14 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
rwlock_t *lock = inet_ehash_lockp(hinfo, hash); rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
struct sock *sk2; struct sock *sk2;
const struct hlist_node *node; const struct hlist_nulls_node *node;
struct inet_timewait_sock *tw; struct inet_timewait_sock *tw;
prefetch(head->chain.first); prefetch(head->chain.first);
write_lock(lock); write_lock(lock);
/* Check TIME-WAIT sockets first. */ /* Check TIME-WAIT sockets first. */
sk_for_each(sk2, node, &head->twchain) { sk_nulls_for_each(sk2, node, &head->twchain) {
tw = inet_twsk(sk2); tw = inet_twsk(sk2);
if (INET_TW_MATCH(sk2, net, hash, acookie, if (INET_TW_MATCH(sk2, net, hash, acookie,
...@@ -293,7 +323,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, ...@@ -293,7 +323,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
tw = NULL; tw = NULL;
/* And established part... */ /* And established part... */
sk_for_each(sk2, node, &head->chain) { sk_nulls_for_each(sk2, node, &head->chain) {
if (INET_MATCH(sk2, net, hash, acookie, if (INET_MATCH(sk2, net, hash, acookie,
saddr, daddr, ports, dif)) saddr, daddr, ports, dif))
goto not_unique; goto not_unique;
...@@ -306,7 +336,7 @@ unique: ...@@ -306,7 +336,7 @@ unique:
inet->sport = htons(lport); inet->sport = htons(lport);
sk->sk_hash = hash; sk->sk_hash = hash;
WARN_ON(!sk_unhashed(sk)); WARN_ON(!sk_unhashed(sk));
__sk_add_node(sk, &head->chain); __sk_nulls_add_node_rcu(sk, &head->chain);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
write_unlock(lock); write_unlock(lock);
...@@ -338,7 +368,7 @@ static inline u32 inet_sk_port_offset(const struct sock *sk) ...@@ -338,7 +368,7 @@ static inline u32 inet_sk_port_offset(const struct sock *sk)
void __inet_hash_nolisten(struct sock *sk) void __inet_hash_nolisten(struct sock *sk)
{ {
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct hlist_head *list; struct hlist_nulls_head *list;
rwlock_t *lock; rwlock_t *lock;
struct inet_ehash_bucket *head; struct inet_ehash_bucket *head;
...@@ -350,7 +380,7 @@ void __inet_hash_nolisten(struct sock *sk) ...@@ -350,7 +380,7 @@ void __inet_hash_nolisten(struct sock *sk)
lock = inet_ehash_lockp(hashinfo, sk->sk_hash); lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
write_lock(lock); write_lock(lock);
__sk_add_node(sk, list); __sk_nulls_add_node_rcu(sk, list);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
write_unlock(lock); write_unlock(lock);
} }
...@@ -400,13 +430,15 @@ void inet_unhash(struct sock *sk) ...@@ -400,13 +430,15 @@ void inet_unhash(struct sock *sk)
local_bh_disable(); local_bh_disable();
inet_listen_wlock(hashinfo); inet_listen_wlock(hashinfo);
lock = &hashinfo->lhash_lock; lock = &hashinfo->lhash_lock;
if (__sk_del_node_init(sk))
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
} else { } else {
lock = inet_ehash_lockp(hashinfo, sk->sk_hash); lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
write_lock_bh(lock); write_lock_bh(lock);
if (__sk_nulls_del_node_init_rcu(sk))
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
} }
if (__sk_del_node_init(sk))
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
write_unlock_bh(lock); write_unlock_bh(lock);
out: out:
if (sk->sk_state == TCP_LISTEN) if (sk->sk_state == TCP_LISTEN)
......
...@@ -23,12 +23,12 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw, ...@@ -23,12 +23,12 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
rwlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); rwlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
write_lock(lock); write_lock(lock);
if (hlist_unhashed(&tw->tw_node)) { if (hlist_nulls_unhashed(&tw->tw_node)) {
write_unlock(lock); write_unlock(lock);
return; return;
} }
__hlist_del(&tw->tw_node); hlist_nulls_del_rcu(&tw->tw_node);
sk_node_init(&tw->tw_node); sk_nulls_node_init(&tw->tw_node);
write_unlock(lock); write_unlock(lock);
/* Disassociate with bind bucket. */ /* Disassociate with bind bucket. */
...@@ -92,13 +92,17 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, ...@@ -92,13 +92,17 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
write_lock(lock); write_lock(lock);
/* Step 2: Remove SK from established hash. */ /*
if (__sk_del_node_init(sk)) * Step 2: Hash TW into TIMEWAIT chain.
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); * Should be done before removing sk from established chain
* because readers are lockless and search established first.
/* Step 3: Hash TW into TIMEWAIT chain. */ */
inet_twsk_add_node(tw, &ehead->twchain);
atomic_inc(&tw->tw_refcnt); atomic_inc(&tw->tw_refcnt);
inet_twsk_add_node_rcu(tw, &ehead->twchain);
/* Step 3: Remove SK from established hash. */
if (__sk_nulls_del_node_init_rcu(sk))
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
write_unlock(lock); write_unlock(lock);
} }
...@@ -416,7 +420,7 @@ void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo, ...@@ -416,7 +420,7 @@ void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo,
{ {
struct inet_timewait_sock *tw; struct inet_timewait_sock *tw;
struct sock *sk; struct sock *sk;
struct hlist_node *node; struct hlist_nulls_node *node;
int h; int h;
local_bh_disable(); local_bh_disable();
...@@ -426,7 +430,7 @@ void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo, ...@@ -426,7 +430,7 @@ void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo,
rwlock_t *lock = inet_ehash_lockp(hashinfo, h); rwlock_t *lock = inet_ehash_lockp(hashinfo, h);
restart: restart:
write_lock(lock); write_lock(lock);
sk_for_each(sk, node, &head->twchain) { sk_nulls_for_each(sk, node, &head->twchain) {
tw = inet_twsk(sk); tw = inet_twsk(sk);
if (!net_eq(twsk_net(tw), net) || if (!net_eq(twsk_net(tw), net) ||
......
...@@ -2707,8 +2707,8 @@ void __init tcp_init(void) ...@@ -2707,8 +2707,8 @@ void __init tcp_init(void)
thash_entries ? 0 : 512 * 1024); thash_entries ? 0 : 512 * 1024);
tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size; tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
for (i = 0; i < tcp_hashinfo.ehash_size; i++) { for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain); INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain); INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
} }
if (inet_ehash_locks_alloc(&tcp_hashinfo)) if (inet_ehash_locks_alloc(&tcp_hashinfo))
panic("TCP: failed to alloc ehash_locks"); panic("TCP: failed to alloc ehash_locks");
......
...@@ -1857,16 +1857,16 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); ...@@ -1857,16 +1857,16 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS #ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */ /* Proc filesystem TCP sock list dumping. */
static inline struct inet_timewait_sock *tw_head(struct hlist_head *head) static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
{ {
return hlist_empty(head) ? NULL : return hlist_nulls_empty(head) ? NULL :
list_entry(head->first, struct inet_timewait_sock, tw_node); list_entry(head->first, struct inet_timewait_sock, tw_node);
} }
static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
{ {
return tw->tw_node.next ? return !is_a_nulls(tw->tw_node.next) ?
hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
} }
static void *listening_get_next(struct seq_file *seq, void *cur) static void *listening_get_next(struct seq_file *seq, void *cur)
...@@ -1954,8 +1954,8 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos) ...@@ -1954,8 +1954,8 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
static inline int empty_bucket(struct tcp_iter_state *st) static inline int empty_bucket(struct tcp_iter_state *st)
{ {
return hlist_empty(&tcp_hashinfo.ehash[st->bucket].chain) && return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
hlist_empty(&tcp_hashinfo.ehash[st->bucket].twchain); hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
} }
static void *established_get_first(struct seq_file *seq) static void *established_get_first(struct seq_file *seq)
...@@ -1966,7 +1966,7 @@ static void *established_get_first(struct seq_file *seq) ...@@ -1966,7 +1966,7 @@ static void *established_get_first(struct seq_file *seq)
for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
struct sock *sk; struct sock *sk;
struct hlist_node *node; struct hlist_nulls_node *node;
struct inet_timewait_sock *tw; struct inet_timewait_sock *tw;
rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
...@@ -1975,7 +1975,7 @@ static void *established_get_first(struct seq_file *seq) ...@@ -1975,7 +1975,7 @@ static void *established_get_first(struct seq_file *seq)
continue; continue;
read_lock_bh(lock); read_lock_bh(lock);
sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
if (sk->sk_family != st->family || if (sk->sk_family != st->family ||
!net_eq(sock_net(sk), net)) { !net_eq(sock_net(sk), net)) {
continue; continue;
...@@ -2004,7 +2004,7 @@ static void *established_get_next(struct seq_file *seq, void *cur) ...@@ -2004,7 +2004,7 @@ static void *established_get_next(struct seq_file *seq, void *cur)
{ {
struct sock *sk = cur; struct sock *sk = cur;
struct inet_timewait_sock *tw; struct inet_timewait_sock *tw;
struct hlist_node *node; struct hlist_nulls_node *node;
struct tcp_iter_state *st = seq->private; struct tcp_iter_state *st = seq->private;
struct net *net = seq_file_net(seq); struct net *net = seq_file_net(seq);
...@@ -2032,11 +2032,11 @@ get_tw: ...@@ -2032,11 +2032,11 @@ get_tw:
return NULL; return NULL;
read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain); sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
} else } else
sk = sk_next(sk); sk = sk_nulls_next(sk);
sk_for_each_from(sk, node) { sk_nulls_for_each_from(sk, node) {
if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
goto found; goto found;
} }
...@@ -2375,6 +2375,7 @@ struct proto tcp_prot = { ...@@ -2375,6 +2375,7 @@ struct proto tcp_prot = {
.sysctl_rmem = sysctl_tcp_rmem, .sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER, .max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock), .obj_size = sizeof(struct tcp_sock),
.slab_flags = SLAB_DESTROY_BY_RCU,
.twsk_prot = &tcp_timewait_sock_ops, .twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops, .rsk_prot = &tcp_request_sock_ops,
.h.hashinfo = &tcp_hashinfo, .h.hashinfo = &tcp_hashinfo,
......
...@@ -25,24 +25,28 @@ ...@@ -25,24 +25,28 @@
void __inet6_hash(struct sock *sk) void __inet6_hash(struct sock *sk)
{ {
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct hlist_head *list;
rwlock_t *lock; rwlock_t *lock;
WARN_ON(!sk_unhashed(sk)); WARN_ON(!sk_unhashed(sk));
if (sk->sk_state == TCP_LISTEN) { if (sk->sk_state == TCP_LISTEN) {
struct hlist_head *list;
list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
lock = &hashinfo->lhash_lock; lock = &hashinfo->lhash_lock;
inet_listen_wlock(hashinfo); inet_listen_wlock(hashinfo);
__sk_add_node(sk, list);
} else { } else {
unsigned int hash; unsigned int hash;
struct hlist_nulls_head *list;
sk->sk_hash = hash = inet6_sk_ehashfn(sk); sk->sk_hash = hash = inet6_sk_ehashfn(sk);
list = &inet_ehash_bucket(hashinfo, hash)->chain; list = &inet_ehash_bucket(hashinfo, hash)->chain;
lock = inet_ehash_lockp(hashinfo, hash); lock = inet_ehash_lockp(hashinfo, hash);
write_lock(lock); write_lock(lock);
__sk_nulls_add_node_rcu(sk, list);
} }
__sk_add_node(sk, list);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
write_unlock(lock); write_unlock(lock);
} }
...@@ -63,33 +67,53 @@ struct sock *__inet6_lookup_established(struct net *net, ...@@ -63,33 +67,53 @@ struct sock *__inet6_lookup_established(struct net *net,
const int dif) const int dif)
{ {
struct sock *sk; struct sock *sk;
const struct hlist_node *node; const struct hlist_nulls_node *node;
const __portpair ports = INET_COMBINED_PORTS(sport, hnum); const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
/* Optimize here for direct hit, only listening connections can /* Optimize here for direct hit, only listening connections can
* have wildcards anyways. * have wildcards anyways.
*/ */
unsigned int hash = inet6_ehashfn(net, daddr, hnum, saddr, sport); unsigned int hash = inet6_ehashfn(net, daddr, hnum, saddr, sport);
struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); unsigned int slot = hash & (hashinfo->ehash_size - 1);
rwlock_t *lock = inet_ehash_lockp(hashinfo, hash); struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
prefetch(head->chain.first);
read_lock(lock); rcu_read_lock();
sk_for_each(sk, node, &head->chain) { begin:
sk_nulls_for_each_rcu(sk, node, &head->chain) {
/* For IPV6 do the cheaper port and family tests first. */ /* For IPV6 do the cheaper port and family tests first. */
if (INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif)) if (INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif)) {
goto hit; /* You sunk my battleship! */ if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
goto begintw;
if (!INET6_MATCH(sk, net, hash, saddr, daddr, ports, dif)) {
sock_put(sk);
goto begin;
}
goto out;
} }
/* Must check for a TIME_WAIT'er before going to listener hash. */
sk_for_each(sk, node, &head->twchain) {
if (INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif))
goto hit;
} }
read_unlock(lock); if (get_nulls_value(node) != slot)
return NULL; goto begin;
hit: begintw:
sock_hold(sk); /* Must check for a TIME_WAIT'er before going to listener hash. */
read_unlock(lock); sk_nulls_for_each_rcu(sk, node, &head->twchain) {
if (INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif)) {
if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
sk = NULL;
goto out;
}
if (!INET6_TW_MATCH(sk, net, hash, saddr, daddr, ports, dif)) {
sock_put(sk);
goto begintw;
}
goto out;
}
}
if (get_nulls_value(node) != slot)
goto begintw;
sk = NULL;
out:
rcu_read_unlock();
return sk; return sk;
} }
EXPORT_SYMBOL(__inet6_lookup_established); EXPORT_SYMBOL(__inet6_lookup_established);
...@@ -172,14 +196,14 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, ...@@ -172,14 +196,14 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
rwlock_t *lock = inet_ehash_lockp(hinfo, hash); rwlock_t *lock = inet_ehash_lockp(hinfo, hash);
struct sock *sk2; struct sock *sk2;
const struct hlist_node *node; const struct hlist_nulls_node *node;
struct inet_timewait_sock *tw; struct inet_timewait_sock *tw;
prefetch(head->chain.first); prefetch(head->chain.first);
write_lock(lock); write_lock(lock);
/* Check TIME-WAIT sockets first. */ /* Check TIME-WAIT sockets first. */
sk_for_each(sk2, node, &head->twchain) { sk_nulls_for_each(sk2, node, &head->twchain) {
tw = inet_twsk(sk2); tw = inet_twsk(sk2);
if (INET6_TW_MATCH(sk2, net, hash, saddr, daddr, ports, dif)) { if (INET6_TW_MATCH(sk2, net, hash, saddr, daddr, ports, dif)) {
...@@ -192,7 +216,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, ...@@ -192,7 +216,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
tw = NULL; tw = NULL;
/* And established part... */ /* And established part... */
sk_for_each(sk2, node, &head->chain) { sk_nulls_for_each(sk2, node, &head->chain) {
if (INET6_MATCH(sk2, net, hash, saddr, daddr, ports, dif)) if (INET6_MATCH(sk2, net, hash, saddr, daddr, ports, dif))
goto not_unique; goto not_unique;
} }
...@@ -203,7 +227,7 @@ unique: ...@@ -203,7 +227,7 @@ unique:
inet->num = lport; inet->num = lport;
inet->sport = htons(lport); inet->sport = htons(lport);
WARN_ON(!sk_unhashed(sk)); WARN_ON(!sk_unhashed(sk));
__sk_add_node(sk, &head->chain); __sk_nulls_add_node_rcu(sk, &head->chain);
sk->sk_hash = hash; sk->sk_hash = hash;
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
write_unlock(lock); write_unlock(lock);
......
...@@ -2043,6 +2043,7 @@ struct proto tcpv6_prot = { ...@@ -2043,6 +2043,7 @@ struct proto tcpv6_prot = {
.sysctl_rmem = sysctl_tcp_rmem, .sysctl_rmem = sysctl_tcp_rmem,
.max_header = MAX_TCP_HEADER, .max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp6_sock), .obj_size = sizeof(struct tcp6_sock),
.slab_flags = SLAB_DESTROY_BY_RCU,
.twsk_prot = &tcp6_timewait_sock_ops, .twsk_prot = &tcp6_timewait_sock_ops,
.rsk_prot = &tcp6_request_sock_ops, .rsk_prot = &tcp6_request_sock_ops,
.h.hashinfo = &tcp_hashinfo, .h.hashinfo = &tcp_hashinfo,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment