Commit 645ca708, authored by Eric Dumazet, committed by David S. Miller

udp: introduce struct udp_table and multiple spinlocks

UDP sockets are hashed in a 128 slots hash table.

This hash table is protected by *one* rwlock.

This rwlock is readlocked each time an incoming UDP message is handled.

This rwlock is writelocked each time a socket must be inserted in the
hash table (bind time), or deleted from this table (close time).

This is not scalable on SMP machines :

1) Even in read mode, lock() and unlock() are atomic operations and
   must dirty a contended cache line, shared by all CPUs.

2) A writer might be starved if many readers are 'in flight'. This can
   happen on a machine with some NIC receiving many UDP messages. A user
   process can be delayed for a long time at socket creation/dismantle time.

This patch prepares the RCU migration by introducing 'struct udp_table'
and 'struct udp_hslot', and by using one spinlock per chain, to reduce
contention on the central rwlock.

Introducing one spinlock per chain reduces latencies for port
randomization on heavily loaded UDP servers. This also speeds up
bindings to specific ports.

udp_lib_unhash() was uninlined, as it was becoming too big.

Some cleanups were done to ease review of the following patch
(RCUification of UDP Unicast lookups).
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent b189db5d
...@@ -599,7 +599,7 @@ struct proto { ...@@ -599,7 +599,7 @@ struct proto {
union { union {
struct inet_hashinfo *hashinfo; struct inet_hashinfo *hashinfo;
struct hlist_head *udp_hash; struct udp_table *udp_table;
struct raw_hashinfo *raw_hash; struct raw_hashinfo *raw_hash;
} h; } h;
......
...@@ -50,8 +50,15 @@ struct udp_skb_cb { ...@@ -50,8 +50,15 @@ struct udp_skb_cb {
}; };
#define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb)) #define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb))
extern struct hlist_head udp_hash[UDP_HTABLE_SIZE]; struct udp_hslot {
extern rwlock_t udp_hash_lock; struct hlist_head head;
spinlock_t lock;
} __attribute__((aligned(2 * sizeof(long))));
struct udp_table {
struct udp_hslot hash[UDP_HTABLE_SIZE];
};
extern struct udp_table udp_table;
extern void udp_table_init(struct udp_table *);
/* Note: this must match 'valbool' in sock_setsockopt */ /* Note: this must match 'valbool' in sock_setsockopt */
...@@ -110,15 +117,7 @@ static inline void udp_lib_hash(struct sock *sk) ...@@ -110,15 +117,7 @@ static inline void udp_lib_hash(struct sock *sk)
BUG(); BUG();
} }
static inline void udp_lib_unhash(struct sock *sk) extern void udp_lib_unhash(struct sock *sk);
{
write_lock_bh(&udp_hash_lock);
if (sk_del_node_init(sk)) {
inet_sk(sk)->num = 0;
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}
write_unlock_bh(&udp_hash_lock);
}
static inline void udp_lib_close(struct sock *sk, long timeout) static inline void udp_lib_close(struct sock *sk, long timeout)
{ {
...@@ -187,7 +186,7 @@ extern struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, ...@@ -187,7 +186,7 @@ extern struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
struct udp_seq_afinfo { struct udp_seq_afinfo {
char *name; char *name;
sa_family_t family; sa_family_t family;
struct hlist_head *hashtable; struct udp_table *udp_table;
struct file_operations seq_fops; struct file_operations seq_fops;
struct seq_operations seq_ops; struct seq_operations seq_ops;
}; };
...@@ -196,7 +195,7 @@ struct udp_iter_state { ...@@ -196,7 +195,7 @@ struct udp_iter_state {
struct seq_net_private p; struct seq_net_private p;
sa_family_t family; sa_family_t family;
int bucket; int bucket;
struct hlist_head *hashtable; struct udp_table *udp_table;
}; };
#ifdef CONFIG_PROC_FS #ifdef CONFIG_PROC_FS
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
#define UDPLITE_RECV_CSCOV 11 /* receiver partial coverage (threshold ) */ #define UDPLITE_RECV_CSCOV 11 /* receiver partial coverage (threshold ) */
extern struct proto udplite_prot; extern struct proto udplite_prot;
extern struct hlist_head udplite_hash[UDP_HTABLE_SIZE]; extern struct udp_table udplite_table;
/* /*
* Checksum computation is all in software, hence simpler getfrag. * Checksum computation is all in software, hence simpler getfrag.
......
This diff is collapsed.
...@@ -5,8 +5,8 @@ ...@@ -5,8 +5,8 @@
#include <net/protocol.h> #include <net/protocol.h>
#include <net/inet_common.h> #include <net/inet_common.h>
extern int __udp4_lib_rcv(struct sk_buff *, struct hlist_head [], int ); extern int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int );
extern void __udp4_lib_err(struct sk_buff *, u32, struct hlist_head []); extern void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
extern int udp_v4_get_port(struct sock *sk, unsigned short snum); extern int udp_v4_get_port(struct sock *sk, unsigned short snum);
......
...@@ -12,16 +12,17 @@ ...@@ -12,16 +12,17 @@
*/ */
#include "udp_impl.h" #include "udp_impl.h"
struct hlist_head udplite_hash[UDP_HTABLE_SIZE]; struct udp_table udplite_table;
EXPORT_SYMBOL(udplite_table);
static int udplite_rcv(struct sk_buff *skb) static int udplite_rcv(struct sk_buff *skb)
{ {
return __udp4_lib_rcv(skb, udplite_hash, IPPROTO_UDPLITE); return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
} }
static void udplite_err(struct sk_buff *skb, u32 info) static void udplite_err(struct sk_buff *skb, u32 info)
{ {
__udp4_lib_err(skb, info, udplite_hash); __udp4_lib_err(skb, info, &udplite_table);
} }
static struct net_protocol udplite_protocol = { static struct net_protocol udplite_protocol = {
...@@ -50,7 +51,7 @@ struct proto udplite_prot = { ...@@ -50,7 +51,7 @@ struct proto udplite_prot = {
.unhash = udp_lib_unhash, .unhash = udp_lib_unhash,
.get_port = udp_v4_get_port, .get_port = udp_v4_get_port,
.obj_size = sizeof(struct udp_sock), .obj_size = sizeof(struct udp_sock),
.h.udp_hash = udplite_hash, .h.udp_table = &udplite_table,
#ifdef CONFIG_COMPAT #ifdef CONFIG_COMPAT
.compat_setsockopt = compat_udp_setsockopt, .compat_setsockopt = compat_udp_setsockopt,
.compat_getsockopt = compat_udp_getsockopt, .compat_getsockopt = compat_udp_getsockopt,
...@@ -71,7 +72,7 @@ static struct inet_protosw udplite4_protosw = { ...@@ -71,7 +72,7 @@ static struct inet_protosw udplite4_protosw = {
static struct udp_seq_afinfo udplite4_seq_afinfo = { static struct udp_seq_afinfo udplite4_seq_afinfo = {
.name = "udplite", .name = "udplite",
.family = AF_INET, .family = AF_INET,
.hashtable = udplite_hash, .udp_table = &udplite_table,
.seq_fops = { .seq_fops = {
.owner = THIS_MODULE, .owner = THIS_MODULE,
}, },
...@@ -108,6 +109,7 @@ static inline int udplite4_proc_init(void) ...@@ -108,6 +109,7 @@ static inline int udplite4_proc_init(void)
void __init udplite4_register(void) void __init udplite4_register(void)
{ {
udp_table_init(&udplite_table);
if (proto_register(&udplite_prot, 1)) if (proto_register(&udplite_prot, 1))
goto out_register_err; goto out_register_err;
...@@ -126,5 +128,4 @@ out_register_err: ...@@ -126,5 +128,4 @@ out_register_err:
printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __func__); printk(KERN_CRIT "%s: Cannot add UDP-Lite protocol.\n", __func__);
} }
EXPORT_SYMBOL(udplite_hash);
EXPORT_SYMBOL(udplite_prot); EXPORT_SYMBOL(udplite_prot);
...@@ -54,62 +54,73 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum) ...@@ -54,62 +54,73 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum)
return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal); return udp_lib_get_port(sk, snum, ipv6_rcv_saddr_equal);
} }
static struct sock *__udp6_lib_lookup(struct net *net, static inline int compute_score(struct sock *sk, struct net *net,
unsigned short hnum,
struct in6_addr *saddr, __be16 sport, struct in6_addr *saddr, __be16 sport,
struct in6_addr *daddr, __be16 dport, struct in6_addr *daddr, __be16 dport,
int dif, struct hlist_head udptable[]) int dif)
{ {
struct sock *sk, *result = NULL; int score = -1;
struct hlist_node *node;
unsigned short hnum = ntohs(dport);
int badness = -1;
read_lock(&udp_hash_lock);
sk_for_each(sk, node, &udptable[udp_hashfn(net, hnum)]) {
struct inet_sock *inet = inet_sk(sk);
if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum && if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
sk->sk_family == PF_INET6) { sk->sk_family == PF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk);
int score = 0; struct inet_sock *inet = inet_sk(sk);
score = 0;
if (inet->dport) { if (inet->dport) {
if (inet->dport != sport) if (inet->dport != sport)
continue; return -1;
score++; score++;
} }
if (!ipv6_addr_any(&np->rcv_saddr)) { if (!ipv6_addr_any(&np->rcv_saddr)) {
if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) if (!ipv6_addr_equal(&np->rcv_saddr, daddr))
continue; return -1;
score++; score++;
} }
if (!ipv6_addr_any(&np->daddr)) { if (!ipv6_addr_any(&np->daddr)) {
if (!ipv6_addr_equal(&np->daddr, saddr)) if (!ipv6_addr_equal(&np->daddr, saddr))
continue; return -1;
score++; score++;
} }
if (sk->sk_bound_dev_if) { if (sk->sk_bound_dev_if) {
if (sk->sk_bound_dev_if != dif) if (sk->sk_bound_dev_if != dif)
continue; return -1;
score++; score++;
} }
if (score == 4) { }
result = sk; return score;
break; }
} else if (score > badness) {
static struct sock *__udp6_lib_lookup(struct net *net,
struct in6_addr *saddr, __be16 sport,
struct in6_addr *daddr, __be16 dport,
int dif, struct udp_table *udptable)
{
struct sock *sk, *result = NULL;
struct hlist_node *node;
unsigned short hnum = ntohs(dport);
unsigned int hash = udp_hashfn(net, hnum);
struct udp_hslot *hslot = &udptable->hash[hash];
int score, badness = -1;
spin_lock(&hslot->lock);
sk_for_each(sk, node, &hslot->head) {
score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif);
if (score > badness) {
result = sk; result = sk;
badness = score; badness = score;
} }
} }
}
if (result) if (result)
sock_hold(result); sock_hold(result);
read_unlock(&udp_hash_lock); spin_unlock(&hslot->lock);
return result; return result;
} }
static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
__be16 sport, __be16 dport, __be16 sport, __be16 dport,
struct hlist_head udptable[]) struct udp_table *udptable)
{ {
struct sock *sk; struct sock *sk;
struct ipv6hdr *iph = ipv6_hdr(skb); struct ipv6hdr *iph = ipv6_hdr(skb);
...@@ -239,7 +250,7 @@ csum_copy_err: ...@@ -239,7 +250,7 @@ csum_copy_err:
void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
int type, int code, int offset, __be32 info, int type, int code, int offset, __be32 info,
struct hlist_head udptable[] ) struct udp_table *udptable)
{ {
struct ipv6_pinfo *np; struct ipv6_pinfo *np;
struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data; struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data;
...@@ -275,7 +286,7 @@ static __inline__ void udpv6_err(struct sk_buff *skb, ...@@ -275,7 +286,7 @@ static __inline__ void udpv6_err(struct sk_buff *skb,
struct inet6_skb_parm *opt, int type, struct inet6_skb_parm *opt, int type,
int code, int offset, __be32 info ) int code, int offset, __be32 info )
{ {
__udp6_lib_err(skb, opt, type, code, offset, info, udp_hash); __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
} }
int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
...@@ -374,14 +385,15 @@ static struct sock *udp_v6_mcast_next(struct sock *sk, ...@@ -374,14 +385,15 @@ static struct sock *udp_v6_mcast_next(struct sock *sk,
*/ */
static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
struct in6_addr *saddr, struct in6_addr *daddr, struct in6_addr *saddr, struct in6_addr *daddr,
struct hlist_head udptable[]) struct udp_table *udptable)
{ {
struct sock *sk, *sk2; struct sock *sk, *sk2;
const struct udphdr *uh = udp_hdr(skb); const struct udphdr *uh = udp_hdr(skb);
struct udp_hslot *hslot = &udptable->hash[udp_hashfn(net, ntohs(uh->dest))];
int dif; int dif;
read_lock(&udp_hash_lock); spin_lock(&hslot->lock);
sk = sk_head(&udptable[udp_hashfn(net, ntohs(uh->dest))]); sk = sk_head(&hslot->head);
dif = inet6_iif(skb); dif = inet6_iif(skb);
sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
if (!sk) { if (!sk) {
...@@ -409,7 +421,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, ...@@ -409,7 +421,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
sk_add_backlog(sk, skb); sk_add_backlog(sk, skb);
bh_unlock_sock(sk); bh_unlock_sock(sk);
out: out:
read_unlock(&udp_hash_lock); spin_unlock(&hslot->lock);
return 0; return 0;
} }
...@@ -447,7 +459,7 @@ static inline int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, ...@@ -447,7 +459,7 @@ static inline int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh,
return 0; return 0;
} }
int __udp6_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
int proto) int proto)
{ {
struct sock *sk; struct sock *sk;
...@@ -544,7 +556,7 @@ discard: ...@@ -544,7 +556,7 @@ discard:
static __inline__ int udpv6_rcv(struct sk_buff *skb) static __inline__ int udpv6_rcv(struct sk_buff *skb)
{ {
return __udp6_lib_rcv(skb, udp_hash, IPPROTO_UDP); return __udp6_lib_rcv(skb, &udp_table, IPPROTO_UDP);
} }
/* /*
...@@ -1008,7 +1020,7 @@ int udp6_seq_show(struct seq_file *seq, void *v) ...@@ -1008,7 +1020,7 @@ int udp6_seq_show(struct seq_file *seq, void *v)
static struct udp_seq_afinfo udp6_seq_afinfo = { static struct udp_seq_afinfo udp6_seq_afinfo = {
.name = "udp6", .name = "udp6",
.family = AF_INET6, .family = AF_INET6,
.hashtable = udp_hash, .udp_table = &udp_table,
.seq_fops = { .seq_fops = {
.owner = THIS_MODULE, .owner = THIS_MODULE,
}, },
...@@ -1050,7 +1062,7 @@ struct proto udpv6_prot = { ...@@ -1050,7 +1062,7 @@ struct proto udpv6_prot = {
.sysctl_wmem = &sysctl_udp_wmem_min, .sysctl_wmem = &sysctl_udp_wmem_min,
.sysctl_rmem = &sysctl_udp_rmem_min, .sysctl_rmem = &sysctl_udp_rmem_min,
.obj_size = sizeof(struct udp6_sock), .obj_size = sizeof(struct udp6_sock),
.h.udp_hash = udp_hash, .h.udp_table = &udp_table,
#ifdef CONFIG_COMPAT #ifdef CONFIG_COMPAT
.compat_setsockopt = compat_udpv6_setsockopt, .compat_setsockopt = compat_udpv6_setsockopt,
.compat_getsockopt = compat_udpv6_getsockopt, .compat_getsockopt = compat_udpv6_getsockopt,
......
...@@ -7,9 +7,9 @@ ...@@ -7,9 +7,9 @@
#include <net/inet_common.h> #include <net/inet_common.h>
#include <net/transp_v6.h> #include <net/transp_v6.h>
extern int __udp6_lib_rcv(struct sk_buff *, struct hlist_head [], int ); extern int __udp6_lib_rcv(struct sk_buff *, struct udp_table *, int );
extern void __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, extern void __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *,
int , int , int , __be32 , struct hlist_head []); int , int , int , __be32 , struct udp_table *);
extern int udp_v6_get_port(struct sock *sk, unsigned short snum); extern int udp_v6_get_port(struct sock *sk, unsigned short snum);
......
...@@ -15,14 +15,14 @@ ...@@ -15,14 +15,14 @@
static int udplitev6_rcv(struct sk_buff *skb) static int udplitev6_rcv(struct sk_buff *skb)
{ {
return __udp6_lib_rcv(skb, udplite_hash, IPPROTO_UDPLITE); return __udp6_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
} }
static void udplitev6_err(struct sk_buff *skb, static void udplitev6_err(struct sk_buff *skb,
struct inet6_skb_parm *opt, struct inet6_skb_parm *opt,
int type, int code, int offset, __be32 info) int type, int code, int offset, __be32 info)
{ {
__udp6_lib_err(skb, opt, type, code, offset, info, udplite_hash); __udp6_lib_err(skb, opt, type, code, offset, info, &udplite_table);
} }
static struct inet6_protocol udplitev6_protocol = { static struct inet6_protocol udplitev6_protocol = {
...@@ -49,7 +49,7 @@ struct proto udplitev6_prot = { ...@@ -49,7 +49,7 @@ struct proto udplitev6_prot = {
.unhash = udp_lib_unhash, .unhash = udp_lib_unhash,
.get_port = udp_v6_get_port, .get_port = udp_v6_get_port,
.obj_size = sizeof(struct udp6_sock), .obj_size = sizeof(struct udp6_sock),
.h.udp_hash = udplite_hash, .h.udp_table = &udplite_table,
#ifdef CONFIG_COMPAT #ifdef CONFIG_COMPAT
.compat_setsockopt = compat_udpv6_setsockopt, .compat_setsockopt = compat_udpv6_setsockopt,
.compat_getsockopt = compat_udpv6_getsockopt, .compat_getsockopt = compat_udpv6_getsockopt,
...@@ -95,7 +95,7 @@ void udplitev6_exit(void) ...@@ -95,7 +95,7 @@ void udplitev6_exit(void)
static struct udp_seq_afinfo udplite6_seq_afinfo = { static struct udp_seq_afinfo udplite6_seq_afinfo = {
.name = "udplite6", .name = "udplite6",
.family = AF_INET6, .family = AF_INET6,
.hashtable = udplite_hash, .udp_table = &udplite_table,
.seq_fops = { .seq_fops = {
.owner = THIS_MODULE, .owner = THIS_MODULE,
}, },
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment