Commit 85670cc1 authored by Patrick McHardy, committed by David S. Miller

[NET_SCHED]: Fix fallout from dev->qdisc RCU change

The move of qdisc destruction to an RCU callback broke locking in the
entire qdisc layer by invalidating previously valid assumptions about
the context in which changes to the qdisc tree occur.

The two assumptions were:

- since changes only happen in process context, read_lock doesn't need
  bottom half protection. Now invalid since destruction of inner qdiscs,
  classifiers, actions and estimators happens in the RCU callback unless
  they're manually deleted, resulting in deadlocks when read_lock in
  process context is interrupted by write_lock_bh in bottom half context
  (see the sketch after this list).

- since changes only happen under the RTNL, no additional locking is
  necessary for data not used during packet processing (f.e. u32_list).
  Again, since destruction now happens in the RCU callback, this assumption
  is not valid anymore, causing races while using this data, which can
  result in corruption or use-after-free.
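
To make the first problem concrete, here is a minimal sketch of the deadlock
(illustrative only: the lock stands in for qdisc_tree_lock and both functions
are hypothetical):

	#include <linux/spinlock.h>

	static DEFINE_RWLOCK(tree_lock);	/* stand-in for qdisc_tree_lock */

	/* Process context, old style: read_lock taken with BHs enabled. */
	static void tree_reader(void)
	{
		read_lock(&tree_lock);
		/*
		 * If a softirq interrupts us here on this CPU and its
		 * handler (e.g. the RCU callback destroying a qdisc) takes
		 * the write lock, the writer spins on our held read lock
		 * while we cannot run again until the softirq returns:
		 * the CPU deadlocks.  read_lock_bh() would avoid this by
		 * keeping softirqs disabled while the lock is held.
		 */
		read_unlock(&tree_lock);
	}

	/* Softirq context, where destruction ran before this patch. */
	static void tree_writer_softirq(void)
	{
		write_lock(&tree_lock);
		/* ... modify the qdisc tree ... */
		write_unlock(&tree_lock);
	}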

Instead of "fixing" this by disabling bottom halves everywhere and adding
new locks/refcounting, this patch makes these assumptions valid again by
moving destruction back to process context. Since only the dev->qdisc
pointer is protected by RCU, but ->enqueue and the qdisc tree are still
protected by dev->queue_lock, destruction of the tree can be performed
immediately and only the final free needs to happen in the rcu callback
to make sure dev_queue_xmit doesn't access already freed memory
(the resulting pattern is sketched below).
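
Condensed, the split looks as follows (simplified from the sch_generic.c
hunks below; the refcount and TCQ_F_BUILTIN checks, estimator teardown and
module/device put are omitted for brevity):

	/* RCU callback: runs after a grace period, in softirq context. */
	static void __qdisc_destroy(struct rcu_head *head)
	{
		struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);

		/* Only the memory is released after the grace period, so a
		 * concurrent dev_queue_xmit() still holding the stale
		 * dev->qdisc pointer never touches freed memory. */
		kfree((char *) qdisc - qdisc->padded);
	}

	/* Process context, RTNL held: all tree-visible teardown happens
	 * immediately, restoring the two assumptions above. */
	void qdisc_destroy(struct Qdisc *qdisc)
	{
		struct Qdisc_ops *ops = qdisc->ops;

		list_del(&qdisc->list);
		if (ops->reset)
			ops->reset(qdisc);
		if (ops->destroy)
			ops->destroy(qdisc);

		call_rcu(&qdisc->q_rcu, __qdisc_destroy);	/* defer only the free */
	}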
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 787e0617
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1480,15 +1480,17 @@ gso:
 	if (q->enqueue) {
 		/* Grab device queue */
 		spin_lock(&dev->queue_lock);
+		q = dev->qdisc;
+		if (q->enqueue) {
+			rc = q->enqueue(skb, q);
+			qdisc_run(dev);
+			spin_unlock(&dev->queue_lock);
 
-		rc = q->enqueue(skb, q);
-
-		qdisc_run(dev);
-
-		spin_unlock(&dev->queue_lock);
-		rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
-		goto out;
+			rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
+			goto out;
+		}
+		spin_unlock(&dev->queue_lock);
 	}
 
 	/* The device has no queue. Common case for software devices:
 	   loopback, all the sorts of tunnels...
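The dev_queue_xmit() hunk above is the other half of the scheme: the earlier
lockless load of dev->qdisc (done via rcu_dereference() in the unchanged part
of the function) becomes only a hint, and the pointer is re-read under
dev->queue_lock before it is actually used, where it can no longer change.
Stripped of the bypass/return handling, the pattern is:

	q = rcu_dereference(dev->qdisc);	/* lockless peek, may be stale */
	if (q->enqueue) {
		spin_lock(&dev->queue_lock);
		q = dev->qdisc;			/* reload: stable under queue_lock */
		if (q->enqueue) {		/* recheck with the stable pointer */
			rc = q->enqueue(skb, q);
			qdisc_run(dev);
		}
		spin_unlock(&dev->queue_lock);
	}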
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -401,7 +401,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
 	if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
 		return skb->len;
 
-	read_lock_bh(&qdisc_tree_lock);
+	read_lock(&qdisc_tree_lock);
 	if (!tcm->tcm_parent)
 		q = dev->qdisc_sleeping;
 	else
@@ -458,7 +458,7 @@ errout:
 	if (cl)
 		cops->put(q, cl);
 out:
-	read_unlock_bh(&qdisc_tree_lock);
+	read_unlock(&qdisc_tree_lock);
 	dev_put(dev);
 	return skb->len;
 }
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -195,14 +195,14 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 {
 	struct Qdisc *q;
 
-	read_lock_bh(&qdisc_tree_lock);
+	read_lock(&qdisc_tree_lock);
 	list_for_each_entry(q, &dev->qdisc_list, list) {
 		if (q->handle == handle) {
-			read_unlock_bh(&qdisc_tree_lock);
+			read_unlock(&qdisc_tree_lock);
 			return q;
 		}
 	}
-	read_unlock_bh(&qdisc_tree_lock);
+	read_unlock(&qdisc_tree_lock);
 	return NULL;
 }
@@ -837,7 +837,7 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
 			continue;
 		if (idx > s_idx)
 			s_q_idx = 0;
-		read_lock_bh(&qdisc_tree_lock);
+		read_lock(&qdisc_tree_lock);
 		q_idx = 0;
 		list_for_each_entry(q, &dev->qdisc_list, list) {
 			if (q_idx < s_q_idx) {
@@ -846,12 +846,12 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
 			}
 			if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
 					  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
-				read_unlock_bh(&qdisc_tree_lock);
+				read_unlock(&qdisc_tree_lock);
 				goto done;
 			}
 			q_idx++;
 		}
-		read_unlock_bh(&qdisc_tree_lock);
+		read_unlock(&qdisc_tree_lock);
 	}
 
 done:
@@ -1074,7 +1074,7 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
 	s_t = cb->args[0];
 	t = 0;
 
-	read_lock_bh(&qdisc_tree_lock);
+	read_lock(&qdisc_tree_lock);
 	list_for_each_entry(q, &dev->qdisc_list, list) {
 		if (t < s_t || !q->ops->cl_ops ||
 		    (tcm->tcm_parent &&
@@ -1096,7 +1096,7 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
 			break;
 		t++;
 	}
-	read_unlock_bh(&qdisc_tree_lock);
+	read_unlock(&qdisc_tree_lock);
 
 	cb->args[0] = t;
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -45,11 +45,10 @@
    The idea is the following:
    - enqueue, dequeue are serialized via top level device
      spinlock dev->queue_lock.
-   - tree walking is protected by read_lock_bh(qdisc_tree_lock)
+   - tree walking is protected by read_lock(qdisc_tree_lock)
      and this lock is used only in process context.
-   - updates to tree are made under rtnl semaphore or
-     from softirq context (__qdisc_destroy rcu-callback)
-     hence this lock needs local bh disabling.
+   - updates to tree are made only under rtnl semaphore,
+     hence this lock may be made without local bh disabling.
 
    qdisc_tree_lock must be grabbed BEFORE dev->queue_lock!
  */
@@ -57,14 +56,14 @@ DEFINE_RWLOCK(qdisc_tree_lock);
 
 void qdisc_lock_tree(struct net_device *dev)
 {
-	write_lock_bh(&qdisc_tree_lock);
+	write_lock(&qdisc_tree_lock);
 	spin_lock_bh(&dev->queue_lock);
 }
 
 void qdisc_unlock_tree(struct net_device *dev)
 {
 	spin_unlock_bh(&dev->queue_lock);
-	write_unlock_bh(&qdisc_tree_lock);
+	write_unlock(&qdisc_tree_lock);
 }
 
 /*
@@ -483,20 +482,6 @@ void qdisc_reset(struct Qdisc *qdisc)
 static void __qdisc_destroy(struct rcu_head *head)
 {
 	struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
-	struct Qdisc_ops *ops = qdisc->ops;
-
-#ifdef CONFIG_NET_ESTIMATOR
-	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
-#endif
-	write_lock(&qdisc_tree_lock);
-	if (ops->reset)
-		ops->reset(qdisc);
-	if (ops->destroy)
-		ops->destroy(qdisc);
-	write_unlock(&qdisc_tree_lock);
-
-	module_put(ops->owner);
-	dev_put(qdisc->dev);
 
 	kfree((char *) qdisc - qdisc->padded);
 }
@@ -504,32 +489,23 @@ static void __qdisc_destroy(struct rcu_head *head)
 
 void qdisc_destroy(struct Qdisc *qdisc)
 {
-	struct list_head cql = LIST_HEAD_INIT(cql);
-	struct Qdisc *cq, *q, *n;
+	struct Qdisc_ops *ops = qdisc->ops;
 
 	if (qdisc->flags & TCQ_F_BUILTIN ||
 		!atomic_dec_and_test(&qdisc->refcnt))
 		return;
 
-	if (!list_empty(&qdisc->list)) {
-		if (qdisc->ops->cl_ops == NULL)
-			list_del(&qdisc->list);
-		else
-			list_move(&qdisc->list, &cql);
-	}
-
-	/* unlink inner qdiscs from dev->qdisc_list immediately */
-	list_for_each_entry(cq, &cql, list)
-		list_for_each_entry_safe(q, n, &qdisc->dev->qdisc_list, list)
-			if (TC_H_MAJ(q->parent) == TC_H_MAJ(cq->handle)) {
-				if (q->ops->cl_ops == NULL)
-					list_del_init(&q->list);
-				else
-					list_move_tail(&q->list, &cql);
-			}
-	list_for_each_entry_safe(cq, n, &cql, list)
-		list_del_init(&cq->list);
+	list_del(&qdisc->list);
+#ifdef CONFIG_NET_ESTIMATOR
+	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
+#endif
+	if (ops->reset)
+		ops->reset(qdisc);
+	if (ops->destroy)
+		ops->destroy(qdisc);
 
+	module_put(ops->owner);
+	dev_put(qdisc->dev);
 	call_rcu(&qdisc->q_rcu, __qdisc_destroy);
 }
@@ -549,15 +525,15 @@ void dev_activate(struct net_device *dev)
 				printk(KERN_INFO "%s: activation failed\n", dev->name);
 				return;
 			}
-			write_lock_bh(&qdisc_tree_lock);
+			write_lock(&qdisc_tree_lock);
 			list_add_tail(&qdisc->list, &dev->qdisc_list);
-			write_unlock_bh(&qdisc_tree_lock);
+			write_unlock(&qdisc_tree_lock);
 		} else {
 			qdisc = &noqueue_qdisc;
 		}
-		write_lock_bh(&qdisc_tree_lock);
+		write_lock(&qdisc_tree_lock);
 		dev->qdisc_sleeping = qdisc;
-		write_unlock_bh(&qdisc_tree_lock);
+		write_unlock(&qdisc_tree_lock);
 	}
 
 	if (!netif_carrier_ok(dev))