Commit 0f2c3c2b authored by Ingo Molnar, committed by Thomas Gleixner

net: preempt-rt support

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
parent 586a6377
@@ -1674,14 +1674,14 @@ static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu)
 static inline void __netif_tx_lock_bh(struct netdev_queue *txq)
 {
 	spin_lock_bh(&txq->_xmit_lock);
-	txq->xmit_lock_owner = smp_processor_id();
+	txq->xmit_lock_owner = raw_smp_processor_id();
 }
 
 static inline int __netif_tx_trylock(struct netdev_queue *txq)
 {
 	int ok = spin_trylock(&txq->_xmit_lock);
 	if (likely(ok))
-		txq->xmit_lock_owner = smp_processor_id();
+		txq->xmit_lock_owner = raw_smp_processor_id();
 	return ok;
 }
@@ -1715,7 +1715,7 @@ static inline void netif_tx_lock(struct net_device *dev)
 	int cpu;
 
 	spin_lock(&dev->tx_global_lock);
-	cpu = smp_processor_id();
+	cpu = raw_smp_processor_id();
 	for (i = 0; i < dev->num_tx_queues; i++) {
 		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
@@ -1779,7 +1779,7 @@ static inline void netif_tx_disable(struct net_device *dev)
 	int cpu;
 
 	local_bh_disable();
-	cpu = smp_processor_id();
+	cpu = raw_smp_processor_id();
 	for (i = 0; i < dev->num_tx_queues; i++) {
 		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
...
@@ -1888,9 +1888,16 @@ gso:
 	   Either shot noqueue qdisc, it is even simpler 8)
 	 */
 	if (dev->flags & IFF_UP) {
-		int cpu = smp_processor_id(); /* ok because BHs are off */
+		int cpu = raw_smp_processor_id(); /* ok because BHs are off */
 
+		/*
+		 * No need to check for recursion with threaded interrupts:
+		 */
+#ifdef CONFIG_PREEMPT_RT
+		if (1) {
+#else
 		if (txq->xmit_lock_owner != cpu) {
+#endif
 
 			HARD_TX_LOCK(dev, txq, cpu);
@@ -2008,7 +2015,8 @@ EXPORT_SYMBOL(netif_rx_ni);
 static void net_tx_action(struct softirq_action *h)
 {
-	struct softnet_data *sd = &__get_cpu_var(softnet_data);
+	struct softnet_data *sd = &per_cpu(softnet_data,
+					   raw_smp_processor_id());
 
 	if (sd->completion_queue) {
 		struct sk_buff *clist;
@@ -2024,6 +2032,11 @@ static void net_tx_action(struct softirq_action *h)
 			WARN_ON(atomic_read(&skb->users));
 			__kfree_skb(skb);
+			/*
+			 * Safe to reschedule - the list is private
+			 * at this point.
+			 */
+			cond_resched_softirq_context();
 		}
 	}
@@ -2042,6 +2055,22 @@ static void net_tx_action(struct softirq_action *h)
 			head = head->next_sched;
 
 			root_lock = qdisc_lock(q);
+			/*
+			 * We are executing in softirq context here, and
+			 * if softirqs are preemptible, we must avoid
+			 * infinite reactivation of the softirq by
+			 * either the tx handler, or by netif_schedule().
+			 * (it would result in an infinitely looping
+			 *  softirq context)
+			 * So we take the spinlock unconditionally.
+			 */
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+			spin_lock(root_lock);
+			smp_mb__before_clear_bit();
+			clear_bit(__QDISC_STATE_SCHED, &q->state);
+			qdisc_run(q);
+			spin_unlock(root_lock);
+#else
 			if (spin_trylock(root_lock)) {
 				smp_mb__before_clear_bit();
 				clear_bit(__QDISC_STATE_SCHED,
@@ -2058,6 +2087,7 @@ static void net_tx_action(struct softirq_action *h)
 						  &q->state);
 				}
 			}
+#endif
 		}
 	}
 }
@@ -2270,7 +2300,7 @@ int netif_receive_skb(struct sk_buff *skb)
 			skb->dev = orig_dev->master;
 	}
 
-	__get_cpu_var(netdev_rx_stat).total++;
+	per_cpu(netdev_rx_stat, raw_smp_processor_id()).total++;
 	skb_reset_network_header(skb);
 	skb_reset_transport_header(skb);
@@ -2660,9 +2690,10 @@ EXPORT_SYMBOL(napi_gro_frags);
 static int process_backlog(struct napi_struct *napi, int quota)
 {
 	int work = 0;
-	struct softnet_data *queue = &__get_cpu_var(softnet_data);
+	struct softnet_data *queue;
 	unsigned long start_time = jiffies;
 
+	queue = &per_cpu(softnet_data, raw_smp_processor_id());
 	napi->weight = weight_p;
 	do {
 		struct sk_buff *skb;
@@ -2694,7 +2725,7 @@ void __napi_schedule(struct napi_struct *n)
 	local_irq_save(flags);
 	list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
-	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+	raise_softirq_irqoff(NET_RX_SOFTIRQ);
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL(__napi_schedule);
...
@@ -69,20 +69,20 @@ static void queue_process(struct work_struct *work)
 		txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
 
-		local_irq_save(flags);
+		local_irq_save_nort(flags);
 		__netif_tx_lock(txq, smp_processor_id());
 		if (netif_tx_queue_stopped(txq) ||
 		    netif_tx_queue_frozen(txq) ||
 		    ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) {
 			skb_queue_head(&npinfo->txq, skb);
 			__netif_tx_unlock(txq);
-			local_irq_restore(flags);
+			local_irq_restore_nort(flags);
 			schedule_delayed_work(&npinfo->tx_work, HZ/10);
 			return;
 		}
 		__netif_tx_unlock(txq);
-		local_irq_restore(flags);
+		local_irq_restore_nort(flags);
 	}
 }
@@ -153,7 +153,7 @@ static void poll_napi(struct net_device *dev)
 	int budget = 16;
 
 	list_for_each_entry(napi, &dev->napi_list, dev_list) {
-		if (napi->poll_owner != smp_processor_id() &&
+		if (napi->poll_owner != raw_smp_processor_id() &&
 		    spin_trylock(&napi->poll_lock)) {
 			budget = poll_one_napi(dev->npinfo, napi, budget);
 			spin_unlock(&napi->poll_lock);
@@ -214,30 +214,35 @@ static void refill_skbs(void)
 static void zap_completion_queue(void)
 {
-	unsigned long flags;
 	struct softnet_data *sd = &get_cpu_var(softnet_data);
+	struct sk_buff *clist = NULL;
+	unsigned long flags;
 
 	if (sd->completion_queue) {
-		struct sk_buff *clist;
-
 		local_irq_save(flags);
 		clist = sd->completion_queue;
 		sd->completion_queue = NULL;
 		local_irq_restore(flags);
-
-		while (clist != NULL) {
-			struct sk_buff *skb = clist;
-			clist = clist->next;
-			if (skb->destructor) {
-				atomic_inc(&skb->users);
-				dev_kfree_skb_any(skb); /* put this one back */
-			} else {
-				__kfree_skb(skb);
-			}
-		}
 	}
 
+	/*
+	 * Took the list private, can drop our softnet
+	 * reference:
+	 */
 	put_cpu_var(softnet_data);
+
+	while (clist != NULL) {
+		struct sk_buff *skb = clist;
+		clist = clist->next;
+		if (skb->destructor) {
+			atomic_inc(&skb->users);
+			dev_kfree_skb_any(skb); /* put this one back */
+		} else {
+			__kfree_skb(skb);
+		}
+	}
 }
 
 static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve)
@@ -245,13 +250,26 @@ static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve)
 	int count = 0;
 	struct sk_buff *skb;
 
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * On -rt skb_pool.lock is schedulable, so if we are
+	 * in an atomic context we just try to dequeue from the
+	 * pool and fail if we cannot get one.
+	 */
+	if (in_atomic() || irqs_disabled())
+		goto pick_atomic;
+#endif
 	zap_completion_queue();
 	refill_skbs();
 repeat:
 	skb = alloc_skb(len, GFP_ATOMIC);
-	if (!skb)
+	if (!skb) {
+#ifdef CONFIG_PREEMPT_RT
+pick_atomic:
+#endif
 		skb = skb_dequeue(&skb_pool);
+	}
 
 	if (!skb) {
 		if (++count < 10) {
@@ -271,7 +289,7 @@ static int netpoll_owner_active(struct net_device *dev)
 	struct napi_struct *napi;
 
 	list_for_each_entry(napi, &dev->napi_list, dev_list) {
-		if (napi->poll_owner == smp_processor_id())
+		if (napi->poll_owner == raw_smp_processor_id())
 			return 1;
 	}
 	return 0;
@@ -297,7 +315,7 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
 		txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
 
-		local_irq_save(flags);
+		local_irq_save_nort(flags);
 		/* try until next clock tick */
 		for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
 		     tries > 0; --tries) {
@@ -319,7 +337,7 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
 			udelay(USEC_PER_POLL);
 		}
-		local_irq_restore(flags);
+		local_irq_restore_nort(flags);
 	}
 
 	if (status != NETDEV_TX_OK) {
...
@@ -201,7 +201,10 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
  */
 static struct sock *icmp_sk(struct net *net)
 {
-	return net->ipv4.icmp_sk[smp_processor_id()];
+	/*
+	 * Should be safe on PREEMPT_SOFTIRQS/HARDIRQS to use raw-smp-processor-id:
+	 */
+	return net->ipv4.icmp_sk[raw_smp_processor_id()];
 }
 
 static inline struct sock *icmp_xmit_lock(struct net *net)
...
@@ -204,13 +204,13 @@ struct rt_hash_bucket {
 };
 
 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
-	defined(CONFIG_PROVE_LOCKING)
+	defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_PREEMPT_RT)
 /*
  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
  * The size of this table is a power of two and depends on the number of CPUS.
  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
  */
-#ifdef CONFIG_LOCKDEP
+#if defined(CONFIG_LOCKDEP) || defined(CONFIG_PREEMPT_RT)
 # define RT_HASH_LOCK_SZ	256
 #else
 # if NR_CPUS >= 32
...
@@ -375,7 +375,7 @@ ip6t_do_table(struct sk_buff *skb,
 	xt_info_rdlock_bh();
 	private = table->private;
-	table_base = private->entries[smp_processor_id()];
+	table_base = private->entries[raw_smp_processor_id()];
 	e = get_entry(table_base, private->hook_entry[hook]);
...
@@ -12,6 +12,7 @@
  */
 #include <linux/bitops.h>
+#include <linux/kallsyms.h>
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
@@ -24,6 +25,7 @@
 #include <linux/init.h>
 #include <linux/rcupdate.h>
 #include <linux/list.h>
+#include <linux/delay.h>
 #include <net/pkt_sched.h>
 
 /* Main transmission queue. */
@@ -78,7 +80,7 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb,
 {
 	int ret;
 
-	if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) {
+	if (unlikely(dev_queue->xmit_lock_owner == raw_smp_processor_id())) {
 		/*
 		 * Same CPU holding the lock. It may be a transient
 		 * configuration error, when hard_start_xmit() recurses. We
@@ -141,7 +143,7 @@ static inline int qdisc_restart(struct Qdisc *q)
 	dev = qdisc_dev(q);
 	txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
 
-	HARD_TX_LOCK(dev, txq, smp_processor_id());
+	HARD_TX_LOCK(dev, txq, raw_smp_processor_id());
 	if (!netif_tx_queue_stopped(txq) &&
 	    !netif_tx_queue_frozen(txq))
 		ret = dev_hard_start_xmit(skb, dev, txq);
@@ -713,9 +715,12 @@ void dev_deactivate(struct net_device *dev)
 	/* Wait for outstanding qdisc-less dev_queue_xmit calls. */
 	synchronize_rcu();
 
-	/* Wait for outstanding qdisc_run calls. */
+	/*
+	 * Wait for outstanding qdisc_run calls.
+	 * TODO: shouldnt this be wakeup-based, instead of polling it?
+	 */
 	while (some_qdisc_is_busy(dev))
-		yield();
+		msleep(1);
 }
 
 static void dev_init_scheduler_queue(struct net_device *dev,
...