Commit 346bf891 authored by Thomas Gleixner's avatar Thomas Gleixner

Merge branch 'rt/head' into rt/2.6.31

parents 5440ba2c 465a3c40
......@@ -95,6 +95,7 @@
#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */
#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */
#define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */
#define X86_FEATURE_APERFMPERF (3*32+27) /* APERFMPERF */
/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
......
......@@ -27,6 +27,7 @@ struct mm_struct;
#include <linux/cpumask.h>
#include <linux/cache.h>
#include <linux/threads.h>
#include <linux/math64.h>
#include <linux/init.h>
/*
......@@ -1010,4 +1011,33 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
extern int get_tsc_mode(unsigned long adr);
extern int set_tsc_mode(unsigned int val);
struct aperfmperf {
u64 aperf, mperf;
};
static inline void get_aperfmperf(struct aperfmperf *am)
{
WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF));
rdmsrl(MSR_IA32_APERF, am->aperf);
rdmsrl(MSR_IA32_MPERF, am->mperf);
}
#define APERFMPERF_SHIFT 10
static inline
unsigned long calc_aperfmperf_ratio(struct aperfmperf *old,
struct aperfmperf *new)
{
u64 aperf = new->aperf - old->aperf;
u64 mperf = new->mperf - old->mperf;
unsigned long ratio = aperf;
mperf >>= APERFMPERF_SHIFT;
if (mperf)
ratio = div64_u64(aperf, mperf);
return ratio;
}
#endif /* _ASM_X86_PROCESSOR_H */
......@@ -13,7 +13,7 @@ CFLAGS_common.o := $(nostackp)
obj-y := intel_cacheinfo.o addon_cpuid_features.o
obj-y += proc.o capflags.o powerflags.o common.o
obj-y += vmware.o hypervisor.o
obj-y += vmware.o hypervisor.o sched.o
obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
obj-$(CONFIG_X86_64) += bugs_64.o
......
......@@ -60,7 +60,6 @@ enum {
};
#define INTEL_MSR_RANGE (0xffff)
#define CPUID_6_ECX_APERFMPERF_CAPABILITY (0x1)
struct acpi_cpufreq_data {
struct acpi_processor_performance *acpi_data;
......@@ -71,11 +70,7 @@ struct acpi_cpufreq_data {
static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
struct acpi_msr_data {
u64 saved_aperf, saved_mperf;
};
static DEFINE_PER_CPU(struct acpi_msr_data, msr_data);
static DEFINE_PER_CPU(struct aperfmperf, old_perf);
DEFINE_TRACE(power_mark);
......@@ -244,23 +239,12 @@ static u32 get_cur_val(const struct cpumask *mask)
return cmd.val;
}
struct perf_pair {
union {
struct {
u32 lo;
u32 hi;
} split;
u64 whole;
} aperf, mperf;
};
/* Called via smp_call_function_single(), on the target CPU */
static void read_measured_perf_ctrs(void *_cur)
{
struct perf_pair *cur = _cur;
struct aperfmperf *am = _cur;
rdmsr(MSR_IA32_APERF, cur->aperf.split.lo, cur->aperf.split.hi);
rdmsr(MSR_IA32_MPERF, cur->mperf.split.lo, cur->mperf.split.hi);
get_aperfmperf(am);
}
/*
......@@ -279,63 +263,17 @@ static void read_measured_perf_ctrs(void *_cur)
static unsigned int get_measured_perf(struct cpufreq_policy *policy,
unsigned int cpu)
{
struct perf_pair readin, cur;
unsigned int perf_percent;
struct aperfmperf perf;
unsigned long ratio;
unsigned int retval;
if (smp_call_function_single(cpu, read_measured_perf_ctrs, &readin, 1))
if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
return 0;
cur.aperf.whole = readin.aperf.whole -
per_cpu(msr_data, cpu).saved_aperf;
cur.mperf.whole = readin.mperf.whole -
per_cpu(msr_data, cpu).saved_mperf;
per_cpu(msr_data, cpu).saved_aperf = readin.aperf.whole;
per_cpu(msr_data, cpu).saved_mperf = readin.mperf.whole;
#ifdef __i386__
/*
* We dont want to do 64 bit divide with 32 bit kernel
* Get an approximate value. Return failure in case we cannot get
* an approximate value.
*/
if (unlikely(cur.aperf.split.hi || cur.mperf.split.hi)) {
int shift_count;
u32 h;
h = max_t(u32, cur.aperf.split.hi, cur.mperf.split.hi);
shift_count = fls(h);
cur.aperf.whole >>= shift_count;
cur.mperf.whole >>= shift_count;
}
if (((unsigned long)(-1) / 100) < cur.aperf.split.lo) {
int shift_count = 7;
cur.aperf.split.lo >>= shift_count;
cur.mperf.split.lo >>= shift_count;
}
if (cur.aperf.split.lo && cur.mperf.split.lo)
perf_percent = (cur.aperf.split.lo * 100) / cur.mperf.split.lo;
else
perf_percent = 0;
ratio = calc_aperfmperf_ratio(&per_cpu(old_perf, cpu), &perf);
per_cpu(old_perf, cpu) = perf;
#else
if (unlikely(((unsigned long)(-1) / 100) < cur.aperf.whole)) {
int shift_count = 7;
cur.aperf.whole >>= shift_count;
cur.mperf.whole >>= shift_count;
}
if (cur.aperf.whole && cur.mperf.whole)
perf_percent = (cur.aperf.whole * 100) / cur.mperf.whole;
else
perf_percent = 0;
#endif
retval = (policy->cpuinfo.max_freq * perf_percent) / 100;
retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
return retval;
}
......@@ -731,12 +669,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
acpi_processor_notify_smm(THIS_MODULE);
/* Check for APERF/MPERF support in hardware */
if (c->x86_vendor == X86_VENDOR_INTEL && c->cpuid_level >= 6) {
unsigned int ecx;
ecx = cpuid_ecx(6);
if (ecx & CPUID_6_ECX_APERFMPERF_CAPABILITY)
acpi_cpufreq_driver.getavg = get_measured_perf;
}
if (cpu_has(c, X86_FEATURE_APERFMPERF))
acpi_cpufreq_driver.getavg = get_measured_perf;
dprintk("CPU%u - ACPI performance management activated.\n", cpu);
for (i = 0; i < perf->state_count; i++)
......
......@@ -349,6 +349,12 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
}
if (c->cpuid_level > 6) {
unsigned ecx = cpuid_ecx(6);
if (ecx & 0x01)
set_cpu_cap(c, X86_FEATURE_APERFMPERF);
}
if (cpu_has_xmm2)
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
if (cpu_has_ds) {
......
#include <linux/sched.h>
#include <linux/math64.h>
#include <linux/percpu.h>
#include <linux/irqflags.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>
static DEFINE_PER_CPU(struct aperfmperf, old_aperfmperf);
static unsigned long scale_aperfmperf(void)
{
struct aperfmperf cur, val, *old = &__get_cpu_var(old_aperfmperf);
unsigned long ratio = SCHED_LOAD_SCALE;
unsigned long flags;
local_irq_save(flags);
get_aperfmperf(&val);
local_irq_restore(flags);
cur = val;
cur.aperf -= old->aperf;
cur.mperf -= old->mperf;
*old = val;
cur.mperf >>= SCHED_LOAD_SHIFT;
if (cur.mperf)
ratio = div_u64(cur.aperf, cur.mperf);
return ratio;
}
unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
{
/*
* do aperf/mperf on the cpu level because it includes things
* like turbo mode, which are relevant to full cores.
*/
if (boot_cpu_has(X86_FEATURE_APERFMPERF))
return scale_aperfmperf();
/*
* maybe have something cpufreq here
*/
return default_scale_freq_power(sd, cpu);
}
unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu)
{
/*
* aperf/mperf already includes the smt gain
*/
if (boot_cpu_has(X86_FEATURE_APERFMPERF))
return SCHED_LOAD_SCALE;
return default_scale_smt_power(sd, cpu);
}
......@@ -843,18 +843,19 @@ enum cpu_idle_type {
#define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE
#ifdef CONFIG_SMP
#define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */
#define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */
#define SD_BALANCE_EXEC 4 /* Balance on exec */
#define SD_BALANCE_FORK 8 /* Balance on fork, clone */
#define SD_WAKE_IDLE 16 /* Wake to idle CPU on task wakeup */
#define SD_WAKE_AFFINE 32 /* Wake task to waking CPU */
#define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */
#define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */
#define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */
#define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */
#define SD_SERIALIZE 1024 /* Only a single load balancing instance */
#define SD_WAKE_IDLE_FAR 2048 /* Gain latency sacrificing cache hit */
#define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */
#define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */
#define SD_BALANCE_EXEC 0x0004 /* Balance on exec */
#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
#define SD_WAKE_IDLE 0x0010 /* Wake to idle CPU on task wakeup */
#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
#define SD_WAKE_BALANCE 0x0040 /* Perform balancing at task wakeup */
#define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */
#define SD_POWERSAVINGS_BALANCE 0x0100 /* Balance for power savings */
#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
#define SD_WAKE_IDLE_FAR 0x0800 /* Gain latency sacrificing cache hit */
#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
enum powersavings_balance_level {
POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */
......@@ -874,7 +875,7 @@ static inline int sd_balance_for_mc_power(void)
if (sched_smt_power_savings)
return SD_POWERSAVINGS_BALANCE;
return 0;
return SD_PREFER_SIBLING;
}
static inline int sd_balance_for_package_power(void)
......@@ -882,7 +883,7 @@ static inline int sd_balance_for_package_power(void)
if (sched_mc_power_savings | sched_smt_power_savings)
return SD_POWERSAVINGS_BALANCE;
return 0;
return SD_PREFER_SIBLING;
}
/*
......@@ -904,15 +905,9 @@ struct sched_group {
/*
* CPU power of this group, SCHED_LOAD_SCALE being max power for a
* single CPU. This is read only (except for setup, hotplug CPU).
* Note : Never change cpu_power without recompute its reciprocal
*/
unsigned int __cpu_power;
/*
* reciprocal value of cpu_power to avoid expensive divides
* (see include/linux/reciprocal_div.h)
* single CPU.
*/
u32 reciprocal_cpu_power;
unsigned int cpu_power;
/*
* The CPUs this group covers.
......@@ -965,6 +960,7 @@ struct sched_domain {
unsigned int newidle_idx;
unsigned int wake_idx;
unsigned int forkexec_idx;
unsigned int smt_gain;
int flags; /* See SD_* */
enum sched_domain_level level;
......@@ -1051,6 +1047,10 @@ partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
}
#endif /* !CONFIG_SMP */
unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu);
unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu);
struct io_context; /* See blkdev.h */
......@@ -1913,6 +1913,7 @@ extern unsigned int sysctl_sched_child_runs_first;
extern unsigned int sysctl_sched_features;
extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate;
extern unsigned int sysctl_sched_time_avg;
extern unsigned int sysctl_timer_migration;
int sched_nr_latency_handler(struct ctl_table *table, int write,
......
......@@ -99,6 +99,7 @@ int arch_update_cpu_topology(void);
| SD_SHARE_CPUPOWER, \
.last_balance = jiffies, \
.balance_interval = 1, \
.smt_gain = 1178, /* 15% */ \
}
#endif
#endif /* CONFIG_SCHED_SMT */
......
......@@ -89,33 +89,36 @@ struct futex_pi_state {
union futex_key key;
};
/*
* We use this hashed waitqueue instead of a normal wait_queue_t, so
/**
* struct futex_q - The hashed futex queue entry, one per waiting task
* @task: the task waiting on the futex
* @lock_ptr: the hash bucket lock
* @key: the key the futex is hashed on
* @pi_state: optional priority inheritance state
* @rt_waiter: rt_waiter storage for use with requeue_pi
* @requeue_pi_key: the requeue_pi target futex key
* @bitset: bitset for the optional bitmasked wakeup
*
* We use this hashed waitqueue, instead of a normal wait_queue_t, so
* we can wake only the relevant ones (hashed queues may be shared).
*
* A futex_q has a woken state, just like tasks have TASK_RUNNING.
* It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
* The order of wakup is always to make the first condition true, then
* wake up q->waiter, then make the second condition true.
* the second.
*
* PI futexes are typically woken before they are removed from the hash list via
* the rt_mutex code. See unqueue_me_pi().
*/
struct futex_q {
struct plist_node list;
/* Waiter reference */
struct task_struct *task;
/* Which hash list lock to use: */
struct task_struct *task;
spinlock_t *lock_ptr;
/* Key which the futex is hashed on: */
union futex_key key;
/* Optional priority inheritance state: */
struct futex_pi_state *pi_state;
/* rt_waiter storage for requeue_pi: */
struct rt_mutex_waiter *rt_waiter;
/* Bitset for the optional bitmasked wakeup */
union futex_key *requeue_pi_key;
u32 bitset;
};
......@@ -147,7 +150,8 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
*/
static inline int match_futex(union futex_key *key1, union futex_key *key2)
{
return (key1->both.word == key2->both.word
return (key1 && key2
&& key1->both.word == key2->both.word
&& key1->both.ptr == key2->both.ptr
&& key1->both.offset == key2->both.offset);
}
......@@ -195,11 +199,12 @@ static void drop_futex_key_refs(union futex_key *key)
}
/**
* get_futex_key - Get parameters which are the keys for a futex.
* @uaddr: virtual address of the futex
* @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
* @key: address where result is stored.
* @rw: mapping needs to be read/write (values: VERIFY_READ, VERIFY_WRITE)
* get_futex_key() - Get parameters which are the keys for a futex
* @uaddr: virtual address of the futex
* @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
* @key: address where result is stored.
* @rw: mapping needs to be read/write (values: VERIFY_READ,
* VERIFY_WRITE)
*
* Returns a negative error code or 0
* The key words are stored in *key on success.
......@@ -285,8 +290,8 @@ void put_futex_key(int fshared, union futex_key *key)
drop_futex_key_refs(key);
}
/*
* fault_in_user_writeable - fault in user address and verify RW access
/**
* fault_in_user_writeable() - Fault in user address and verify RW access
* @uaddr: pointer to faulting user space address
*
* Slow path to fixup the fault we just took in the atomic write
......@@ -306,8 +311,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
/**
* futex_top_waiter() - Return the highest priority waiter on a futex
* @hb: the hash bucket the futex_q's reside in
* @key: the futex key (to distinguish it from other futex futex_q's)
* @hb: the hash bucket the futex_q's reside in
* @key: the futex key (to distinguish it from other futex futex_q's)
*
* Must be called with the hb lock held.
*/
......@@ -585,7 +590,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
}
/**
* futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex
* futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
* @uaddr: the pi futex user address
* @hb: the pi futex hash bucket
* @key: the futex key associated with uaddr and hb
......@@ -1008,9 +1013,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
/**
* requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
* q: the futex_q
* key: the key of the requeue target futex
* hb: the hash_bucket of the requeue target futex
* @q: the futex_q
* @key: the key of the requeue target futex
* @hb: the hash_bucket of the requeue target futex
*
* During futex_requeue, with requeue_pi=1, it is possible to acquire the
* target futex if it is uncontended or via a lock steal. Set the futex_q key
......@@ -1024,7 +1029,6 @@ static inline
void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
struct futex_hash_bucket *hb)
{
drop_futex_key_refs(&q->key);
get_futex_key_refs(key);
q->key = *key;
......@@ -1089,6 +1093,10 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
if (!top_waiter)
return 0;
/* Ensure we requeue to the expected futex. */
if (!match_futex(top_waiter->requeue_pi_key, key2))
return -EINVAL;
/*
* Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
* the contended case or if set_waiters is 1. The pi_state is returned
......@@ -1218,6 +1226,7 @@ retry_private:
*/
if (ret == 1) {
WARN_ON(pi_state);
drop_count++;
task_count++;
ret = get_futex_value_locked(&curval2, uaddr2);
if (!ret)
......@@ -1276,6 +1285,12 @@ retry_private:
continue;
}
/* Ensure we requeue to the expected futex for requeue_pi. */
if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
ret = -EINVAL;
break;
}
/*
* Requeue nr_requeue waiters and possibly one more in the case
* of requeue_pi if we couldn't acquire the lock atomically.
......@@ -1290,6 +1305,7 @@ retry_private:
if (ret == 1) {
/* We got the lock. */
requeue_pi_wake_futex(this, &key2, hb2);
drop_count++;
continue;
} else if (ret) {
/* -EDEADLK */
......@@ -1337,6 +1353,25 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
return hb;
}
static inline void
queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
{
spin_unlock(&hb->lock);
drop_futex_key_refs(&q->key);
}
/**
* queue_me() - Enqueue the futex_q on the futex_hash_bucket
* @q: The futex_q to enqueue
* @hb: The destination hash bucket
*
* The hb->lock must be held by the caller, and is released here. A call to
* queue_me() is typically paired with exactly one call to unqueue_me(). The
* exceptions involve the PI related operations, which may use unqueue_me_pi()
* or nothing if the unqueue is done as part of the wake process and the unqueue
* state is implicit in the state of woken task (see futex_wait_requeue_pi() for
* an example).
*/
static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
{
int prio;
......@@ -1360,19 +1395,17 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
spin_unlock(&hb->lock);
}
static inline void
queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
{
spin_unlock(&hb->lock);
drop_futex_key_refs(&q->key);
}
/*
* queue_me and unqueue_me must be called as a pair, each
* exactly once. They are called with the hashed spinlock held.
/**
* unqueue_me() - Remove the futex_q from its futex_hash_bucket
* @q: The futex_q to unqueue
*
* The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
* be paired with exactly one earlier call to queue_me().
*
* Returns:
* 1 - if the futex_q was still queued (and we removed unqueued it)
* 0 - if the futex_q was already removed by the waking thread
*/
/* Return 1 if we were still queued (ie. 0 means we were woken) */
static int unqueue_me(struct futex_q *q)
{
spinlock_t *lock_ptr;
......@@ -1625,6 +1658,12 @@ out:
static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
struct hrtimer_sleeper *timeout)
{
/*
* The task state is guaranteed to be set before another task can
* wake it. set_current_state() is implemented using set_mb() and
* queue_me() calls spin_unlock() upon completion, both serializing
* access to the hash list and forcing another memory barrier.
*/
set_current_state(TASK_INTERRUPTIBLE);
queue_me(q, hb);
......@@ -1742,6 +1781,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
q.pi_state = NULL;
q.bitset = bitset;
q.rt_waiter = NULL;
q.requeue_pi_key = NULL;
if (abs_time) {
to = &timeout;
......@@ -1855,6 +1895,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
q.pi_state = NULL;
q.rt_waiter = NULL;
q.requeue_pi_key = NULL;
retry:
q.key = FUTEX_KEY_INIT;
ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
......@@ -2086,7 +2127,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
plist_del(&q->list, &q->list.plist);
/* Handle spurious wakeups gracefully */
ret = -EAGAIN;
ret = -EWOULDBLOCK;
if (timeout && !timeout->task)
ret = -ETIMEDOUT;
else if (signal_pending(current))
......@@ -2097,12 +2138,12 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
/**
* futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
* @uaddr: the futex we initialyl wait on (non-pi)
* @uaddr: the futex we initially wait on (non-pi)
* @fshared: whether the futexes are shared (1) or not (0). They must be
* the same type, no requeueing from private to shared, etc.
* @val: the expected value of uaddr
* @abs_time: absolute timeout
* @bitset: 32 bit wakeup bitset set by userspace, defaults to all.
* @bitset: 32 bit wakeup bitset set by userspace, defaults to all
* @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
* @uaddr2: the pi futex we will take prior to returning to user-space
*
......@@ -2116,11 +2157,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* We call schedule in futex_wait_queue_me() when we enqueue and return there
* via the following:
* 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
* 2) wakeup on uaddr2 after a requeue and subsequent unlock
* 3) signal (before or after requeue)
* 4) timeout (before or after requeue)
* 2) wakeup on uaddr2 after a requeue
* 3) signal
* 4) timeout
*
* If 3, we setup a restart_block with futex_wait_requeue_pi() as the function.
* If 3, cleanup and return -ERESTARTNOINTR.
*
* If 2, we may then block on trying to take the rt_mutex and return via:
* 5) successful lock
......@@ -2128,7 +2169,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* 7) timeout
* 8) other lock acquisition failure
*
* If 6, we setup a restart_block with futex_lock_pi() as the function.
* If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
*
* If 4 or 7, we cleanup and return with -ETIMEDOUT.
*
......@@ -2167,16 +2208,16 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
debug_rt_mutex_init_waiter(&rt_waiter);
rt_waiter.task = NULL;
q.pi_state = NULL;
q.bitset = bitset;
q.rt_waiter = &rt_waiter;
retry:
key2 = FUTEX_KEY_INIT;
ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
if (unlikely(ret != 0))
goto out;
q.pi_state = NULL;
q.bitset = bitset;
q.rt_waiter = &rt_waiter;
q.requeue_pi_key = &key2;
/* Prepare to wait on uaddr. */
ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
if (ret)
......@@ -2229,7 +2270,7 @@ retry:
res = fixup_owner(uaddr2, fshared, &q, !ret);
/*
* If fixup_owner() returned an error, proprogate that. If it
* acquired the lock, clear our -ETIMEDOUT or -EINTR.
* acquired the lock, clear -ETIMEDOUT or -EINTR.
*/
if (res)
ret = (res < 0) ? res : 0;
......@@ -2247,14 +2288,11 @@ retry:
rt_mutex_unlock(pi_mutex);
} else if (ret == -EINTR) {
/*
* We've already been requeued, but we have no way to
* restart by calling futex_lock_pi() directly. We
* could restart the syscall, but that will look at
* the user space value and return right away. So we
* drop back with EWOULDBLOCK to tell user space that
* "val" has been changed. That's the same what the
* restart of the syscall would do in
* futex_wait_setup().
* We've already been requeued, but cannot restart by calling
* futex_lock_pi() directly. We could restart this syscall, but
* it would detect that the user space "val" changed and return
* -EWOULDBLOCK. Save the overhead of the restart and return
* -EWOULDBLOCK directly.
*/
ret = -EWOULDBLOCK;
}
......@@ -2264,9 +2302,6 @@ out_put_keys:
out_key2:
put_futex_key(fshared, &key2);
/* Spurious wakeup ? */
if (ret == -EAGAIN)
goto retry;
out:
if (to) {
hrtimer_cancel(&to->timer);
......@@ -2291,9 +2326,9 @@ out:
*/
/**
* sys_set_robust_list - set the robust-futex list head of a task
* @head: pointer to the list-head
* @len: length of the list-head, as userspace expects
* sys_set_robust_list() - Set the robust-futex list head of a task
* @head: pointer to the list-head
* @len: length of the list-head, as userspace expects
*/
SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
size_t, len)
......@@ -2312,10 +2347,10 @@ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
}
/**
* sys_get_robust_list - get the robust-futex list head of a task
* @pid: pid of the process [zero for current task]
* @head_ptr: pointer to a list-head pointer, the kernel fills it in
* @len_ptr: pointer to a length field, the kernel fills in the header size
* sys_get_robust_list() - Get the robust-futex list head of a task
* @pid: pid of the process [zero for current task]
* @head_ptr: pointer to a list-head pointer, the kernel fills it in
* @len_ptr: pointer to a length field, the kernel fills in the header size
*/
SYSCALL_DEFINE3(get_robust_list, int, pid,
struct robust_list_head __user * __user *, head_ptr,
......
......@@ -137,30 +137,8 @@
*/
#define RUNTIME_INF ((u64)~0ULL)
#ifdef CONFIG_SMP
static void double_rq_lock(struct rq *rq1, struct rq *rq2);
/*
* Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
* Since cpu_power is a 'constant', we can use a reciprocal divide.
*/
static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
{
return reciprocal_divide(load, sg->reciprocal_cpu_power);
}
/*
* Each time a sched group cpu_power is changed,
* we must compute its reciprocal value
*/
static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
{
sg->__cpu_power += val;
sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
}
#endif
#define TASK_PREEMPTS_CURR(p, rq) \
((p)->prio < (rq)->curr->prio)
......@@ -673,6 +651,9 @@ struct rq {
struct task_struct *migration_thread;
struct list_head migration_queue;
u64 rt_avg;
u64 age_stamp;
#endif
/* calc_load related fields */
......@@ -926,6 +907,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000;
*/
unsigned int sysctl_sched_shares_thresh = 4;
/*
* period over which we average the RT time consumption, measured
* in ms.
*
* default: 1s
*/
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
/*
* period over which we measure -rt task cpu usage in us.
* default: 1s
......@@ -1370,12 +1359,37 @@ void wake_up_idle_cpu(int cpu)
}
#endif /* CONFIG_NO_HZ */
static u64 sched_avg_period(void)
{
return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
}
static void sched_avg_update(struct rq *rq)
{
s64 period = sched_avg_period();
while ((s64)(rq->clock - rq->age_stamp) > period) {
rq->age_stamp += period;
rq->rt_avg /= 2;
}
}
static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
rq->rt_avg += rt_delta;
sched_avg_update(rq);
}
#else /* !CONFIG_SMP */
static void resched_task(struct task_struct *p)
{
assert_atomic_spin_locked(&task_rq(p)->lock);
set_tsk_need_resched(p);
}
static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
}
#endif /* CONFIG_SMP */
#if BITS_PER_LONG == 32
......@@ -2365,8 +2379,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
}
/* Adjust by relative CPU power of the group */
avg_load = sg_div_cpu_power(group,
avg_load * SCHED_LOAD_SCALE);
avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
if (local_group) {
this_load = avg_load;
......@@ -3713,7 +3726,7 @@ static inline void update_sd_power_savings_stats(struct sched_group *group,
* capacity but still has some space to pick up some load
* from other group and save more power
*/
if (sgs->sum_nr_running > sgs->group_capacity - 1)
if (sgs->sum_nr_running + 1 > sgs->group_capacity)
return;
if (sgs->sum_nr_running > sds->leader_nr_running ||
......@@ -3781,6 +3794,94 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
{
return SCHED_LOAD_SCALE;
}
unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
{
return default_scale_freq_power(sd, cpu);
}
unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
{
unsigned long weight = cpumask_weight(sched_domain_span(sd));
unsigned long smt_gain = sd->smt_gain;
smt_gain /= weight;
return smt_gain;
}
unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
{
return default_scale_smt_power(sd, cpu);
}
unsigned long scale_rt_power(int cpu)
{
struct rq *rq = cpu_rq(cpu);
u64 total, available;
sched_avg_update(rq);
total = sched_avg_period() + (rq->clock - rq->age_stamp);
available = total - rq->rt_avg;
if (unlikely((s64)total < SCHED_LOAD_SCALE))
total = SCHED_LOAD_SCALE;
total >>= SCHED_LOAD_SHIFT;
return div_u64(available, total);
}
static void update_cpu_power(struct sched_domain *sd, int cpu)
{
unsigned long weight = cpumask_weight(sched_domain_span(sd));
unsigned long power = SCHED_LOAD_SCALE;
struct sched_group *sdg = sd->groups;
power *= arch_scale_freq_power(sd, cpu);
power >>= SCHED_LOAD_SHIFT;
if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
power *= arch_scale_smt_power(sd, cpu);
power >>= SCHED_LOAD_SHIFT;
}
power *= scale_rt_power(cpu);
power >>= SCHED_LOAD_SHIFT;
if (!power)
power = 1;
sdg->cpu_power = power;
}
static void update_group_power(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
unsigned long power;
if (!child) {
update_cpu_power(sd, cpu);
return;
}
power = 0;
group = child->groups;
do {
power += group->cpu_power;
group = group->next;
} while (group != child->groups);
sdg->cpu_power = power;
}
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @group: sched_group whose statistics are to be updated.
......@@ -3793,7 +3894,8 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
* @balance: Should we balance.
* @sgs: variable to hold the statistics for this group.
*/
static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
static inline void update_sg_lb_stats(struct sched_domain *sd,
struct sched_group *group, int this_cpu,
enum cpu_idle_type idle, int load_idx, int *sd_idle,
int local_group, const struct cpumask *cpus,
int *balance, struct sg_lb_stats *sgs)
......@@ -3804,8 +3906,11 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
unsigned long sum_avg_load_per_task;
unsigned long avg_load_per_task;
if (local_group)
if (local_group) {
balance_cpu = group_first_cpu(group);
if (balance_cpu == this_cpu)
update_group_power(sd, this_cpu);
}
/* Tally up the load of all CPUs in the group */
sum_avg_load_per_task = avg_load_per_task = 0;
......@@ -3854,8 +3959,7 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
}
/* Adjust by relative CPU power of the group */
sgs->avg_load = sg_div_cpu_power(group,
sgs->group_load * SCHED_LOAD_SCALE);
sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
/*
......@@ -3867,14 +3971,14 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
* normalized nr_running number somewhere that negates
* the hierarchy?
*/
avg_load_per_task = sg_div_cpu_power(group,
sum_avg_load_per_task * SCHED_LOAD_SCALE);
avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
group->cpu_power;
if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
sgs->group_imb = 1;
sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
sgs->group_capacity =
DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
}
/**
......@@ -3892,9 +3996,13 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
const struct cpumask *cpus, int *balance,
struct sd_lb_stats *sds)
{
struct sched_domain *child = sd->child;
struct sched_group *group = sd->groups;
struct sg_lb_stats sgs;
int load_idx;
int load_idx, prefer_sibling = 0;
if (child && child->flags & SD_PREFER_SIBLING)
prefer_sibling = 1;
init_sd_power_savings_stats(sd, sds, idle);
load_idx = get_sd_load_idx(sd, idle);
......@@ -3905,14 +4013,22 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
local_group = cpumask_test_cpu(this_cpu,
sched_group_cpus(group));
memset(&sgs, 0, sizeof(sgs));
update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
local_group, cpus, balance, &sgs);
if (local_group && balance && !(*balance))
return;
sds->total_load += sgs.group_load;
sds->total_pwr += group->__cpu_power;
sds->total_pwr += group->cpu_power;
/*
* In case the child domain prefers tasks go to siblings
* first, lower the group capacity to one so that we'll try
* and move all the excess tasks away.
*/
if (prefer_sibling)
sgs.group_capacity = min(sgs.group_capacity, 1UL);
if (local_group) {
sds->this_load = sgs.avg_load;
......@@ -3932,7 +4048,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
update_sd_power_savings_stats(group, sds, local_group, &sgs);
group = group->next;
} while (group != sd->groups);
}
/**
......@@ -3970,28 +4085,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
* moving them.
*/
pwr_now += sds->busiest->__cpu_power *
pwr_now += sds->busiest->cpu_power *
min(sds->busiest_load_per_task, sds->max_load);
pwr_now += sds->this->__cpu_power *
pwr_now += sds->this->cpu_power *
min(sds->this_load_per_task, sds->this_load);
pwr_now /= SCHED_LOAD_SCALE;
/* Amount of load we'd subtract */
tmp = sg_div_cpu_power(sds->busiest,
sds->busiest_load_per_task * SCHED_LOAD_SCALE);
tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
sds->busiest->cpu_power;
if (sds->max_load > tmp)
pwr_move += sds->busiest->__cpu_power *
pwr_move += sds->busiest->cpu_power *
min(sds->busiest_load_per_task, sds->max_load - tmp);
/* Amount of load we'd add */
if (sds->max_load * sds->busiest->__cpu_power <
if (sds->max_load * sds->busiest->cpu_power <
sds->busiest_load_per_task * SCHED_LOAD_SCALE)
tmp = sg_div_cpu_power(sds->this,
sds->max_load * sds->busiest->__cpu_power);
tmp = (sds->max_load * sds->busiest->cpu_power) /
sds->this->cpu_power;
else
tmp = sg_div_cpu_power(sds->this,
sds->busiest_load_per_task * SCHED_LOAD_SCALE);
pwr_move += sds->this->__cpu_power *
tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
sds->this->cpu_power;
pwr_move += sds->this->cpu_power *
min(sds->this_load_per_task, sds->this_load + tmp);
pwr_move /= SCHED_LOAD_SCALE;
......@@ -4026,8 +4141,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
sds->max_load - sds->busiest_load_per_task);
/* How much load to actually move to equalise the imbalance */
*imbalance = min(max_pull * sds->busiest->__cpu_power,
(sds->avg_load - sds->this_load) * sds->this->__cpu_power)
*imbalance = min(max_pull * sds->busiest->cpu_power,
(sds->avg_load - sds->this_load) * sds->this->cpu_power)
/ SCHED_LOAD_SCALE;
/*
......@@ -4145,6 +4260,26 @@ ret:
return NULL;
}
static struct sched_group *group_of(int cpu)
{
struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
if (!sd)
return NULL;
return sd->groups;
}
static unsigned long power_of(int cpu)
{
struct sched_group *group = group_of(cpu);
if (!group)
return SCHED_LOAD_SCALE;
return group->cpu_power;
}
/*
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
......@@ -4157,15 +4292,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
int i;
for_each_cpu(i, sched_group_cpus(group)) {
unsigned long power = power_of(i);
unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
unsigned long wl;
if (!cpumask_test_cpu(i, cpus))
continue;
rq = cpu_rq(i);
wl = weighted_cpuload(i);
wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
wl /= power;
if (rq->nr_running == 1 && wl > imbalance)
if (capacity && rq->nr_running == 1 && wl > imbalance)
continue;
if (wl > max_load) {
......@@ -8076,7 +8214,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
if (!group->__cpu_power) {
if (!group->cpu_power) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: domain->cpu_power not "
"set\n");
......@@ -8100,9 +8238,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
printk(KERN_CONT " %s", str);
if (group->__cpu_power != SCHED_LOAD_SCALE) {
printk(KERN_CONT " (__cpu_power = %d)",
group->__cpu_power);
if (group->cpu_power != SCHED_LOAD_SCALE) {
printk(KERN_CONT " (cpu_power = %d)",
group->cpu_power);
}
group = group->next;
......@@ -8387,7 +8525,7 @@ init_sched_build_groups(const struct cpumask *span,
continue;
cpumask_clear(sched_group_cpus(sg));
sg->__cpu_power = 0;
sg->cpu_power = 0;
for_each_cpu(j, span) {
if (group_fn(j, cpu_map, NULL, tmpmask) != group)
......@@ -8612,7 +8750,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
continue;
}
sg_inc_cpu_power(sg, sd->groups->__cpu_power);
sg->cpu_power += sd->groups->cpu_power;
}
sg = sg->next;
} while (sg != group_head);
......@@ -8670,15 +8808,13 @@ static void free_sched_groups(const struct cpumask *cpu_map,
* there are asymmetries in the topology. If there are asymmetries, group
* having more cpu_power will pickup more load compared to the group having
* less cpu_power.
*
* cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
* the maximum number of tasks a group can handle in the presence of other idle
* or lightly loaded groups in the same sched domain.
*/
static void init_sched_groups_power(int cpu, struct sched_domain *sd)
{
struct sched_domain *child;
struct sched_group *group;
long power;
int weight;
WARN_ON(!sd || !sd->groups);
......@@ -8687,28 +8823,32 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
child = sd->child;
sd->groups->__cpu_power = 0;
sd->groups->cpu_power = 0;
/*
* For perf policy, if the groups in child domain share resources
* (for example cores sharing some portions of the cache hierarchy
* or SMT), then set this domain groups cpu_power such that each group
* can handle only one task, when there are other idle groups in the
* same sched domain.
*/
if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
(child->flags &
(SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
if (!child) {
power = SCHED_LOAD_SCALE;
weight = cpumask_weight(sched_domain_span(sd));
/*
* SMT siblings share the power of a single core.
* Usually multiple threads get a better yield out of
* that one core than a single thread would have,
* reflect that in sd->smt_gain.
*/
if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
power *= sd->smt_gain;
power /= weight;
power >>= SCHED_LOAD_SHIFT;
}
sd->groups->cpu_power += power;
return;
}
/*
* add cpu_power of each child group to this groups cpu_power
* Add cpu_power of each child group to this groups cpu_power.
*/
group = child->groups;
do {
sg_inc_cpu_power(sd->groups, group->__cpu_power);
sd->groups->cpu_power += group->cpu_power;
group = group->next;
} while (group != child->groups);
}
......@@ -8981,7 +9121,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
sd = &per_cpu(node_domains, j).sd;
sd->groups = sg;
}
sg->__cpu_power = 0;
sg->cpu_power = 0;
cpumask_copy(sched_group_cpus(sg), nodemask);
sg->next = sg;
cpumask_or(covered, covered, nodemask);
......@@ -9008,7 +9148,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
"Can not alloc domain group for node %d\n", j);
goto error;
}
sg->__cpu_power = 0;
sg->cpu_power = 0;
cpumask_copy(sched_group_cpus(sg), tmpmask);
sg->next = prev->next;
cpumask_or(covered, covered, tmpmask);
......
......@@ -1040,39 +1040,58 @@ static void yield_task_fair(struct rq *rq)
se->vruntime = rightmost->vruntime + 1;
}
#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
/*
* At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
* are idle and this is not a kernel thread and this task's affinity
* allows it to be moved to preferred cpu, then just move!
*
* XXX - can generate significant overload on perferred_wakeup_cpu
* with plenty of idle cpus, leading to a significant loss in
* throughput.
*
* Returns: < 0 - no placement decision made
* >= 0 - place on cpu
*/
static int wake_idle_power_save(int cpu, struct task_struct *p)
{
int this_cpu = smp_processor_id();
int wakeup_cpu;
if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
return -1;
if (!idle_cpu(cpu) || !idle_cpu(this_cpu))
return -1;
if (!p->mm || (p->flags & PF_KTHREAD))
return -1;
wakeup_cpu = cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
if (!cpu_isset(wakeup_cpu, p->cpus_allowed))
return -1;
return wakeup_cpu;
}
/*
* wake_idle() will wake a task on an idle cpu if task->cpu is
* not idle and an idle cpu is available. The span of cpus to
* search starts with cpus closest then further out as needed,
* so we always favor a closer, idle cpu.
* Domains may include CPUs that are not usable for migration,
* hence we need to mask them out (cpu_active_mask)
*
* Returns the CPU we should wake onto.
*/
#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
static int wake_idle(int cpu, struct task_struct *p)
{
struct sched_domain *sd;
struct rq *task_rq = task_rq(p);
struct sched_domain *sd, *child = NULL;
int i;
unsigned int chosen_wakeup_cpu;
int this_cpu;
/*
* At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
* are idle and this is not a kernel thread and this task's affinity
* allows it to be moved to preferred cpu, then just move!
*/
this_cpu = smp_processor_id();
chosen_wakeup_cpu =
cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
idle_cpu(cpu) && idle_cpu(this_cpu) &&
p->mm && !(p->flags & PF_KTHREAD) &&
cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
return chosen_wakeup_cpu;
i = wake_idle_power_save(cpu, p);
if (i >= 0)
return i;
/*
* If it is idle, then it is the best cpu to run this task.
......@@ -1081,29 +1100,39 @@ static int wake_idle(int cpu, struct task_struct *p)
* Siblings must be also busy(in most cases) as they didn't already
* pickup the extra load from this cpu and hence we need not check
* sibling runqueue info. This will avoid the checks and cache miss
* penalities associated with that.
* penalties associated with that.
*/
if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
return cpu;
for_each_domain(cpu, sd) {
if ((sd->flags & SD_WAKE_IDLE)
|| ((sd->flags & SD_WAKE_IDLE_FAR)
&& !task_hot(p, task_rq(p)->clock, sd))) {
for_each_cpu_and(i, sched_domain_span(sd),
&p->cpus_allowed) {
if (cpu_active(i) && idle_cpu(i)) {
if (i != task_cpu(p)) {
schedstat_inc(p,
se.nr_wakeups_idle);
}
return i;
}
}
} else {
rcu_read_lock();
for_each_domain(cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
break;
if (!(sd->flags & SD_WAKE_IDLE) &&
(task_hot(p, task_rq->clock, sd) || !(sd->flags & SD_WAKE_IDLE_FAR)))
break;
}
}
for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
if (child && cpumask_test_cpu(i, sched_domain_span(child)))
continue;
if (!idle_cpu(i))
continue;
if (task_cpu(p) != i)
schedstat_inc(p, se.nr_wakeups_idle);
cpu = i;
goto unlock;
}
child = sd;
}
unlock:
rcu_read_unlock();
return cpu;
}
#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
......@@ -1235,7 +1264,17 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
tg = task_group(p);
weight = p->se.load.weight;
balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
/*
* In low-load situations, where prev_cpu is idle and this_cpu is idle
* due to the sync cause above having dropped tl to 0, we'll always have
* an imbalance, but there's really nothing you can do about that, so
* that's good too.
*
* Otherwise check if either cpus are near enough in load to allow this
* task to be woken on this_cpu.
*/
balanced = !tl ||
100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
/*
......
......@@ -602,6 +602,8 @@ static void update_curr_rt(struct rq *rq)
curr->se.exec_start = rq->clock;
cpuacct_charge(curr, delta_exec);
sched_rt_avg_update(rq, delta_exec);
if (!rt_bandwidth_enabled())
return;
......@@ -926,8 +928,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
inc_cpu_load(rq, p->se.load.weight);
}
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
......@@ -942,8 +942,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
dequeue_rt_entity(rt_se);
dequeue_pushable_task(rq, p);
dec_cpu_load(rq, p->se.load.weight);
}
/*
......
......@@ -330,6 +330,14 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_time_avg",
.data = &sysctl_sched_time_avg,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "timer_migration",
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment