Commit 346bf891 authored by Thomas Gleixner's avatar Thomas Gleixner

Merge branch 'rt/head' into rt/2.6.31

parents 5440ba2c 465a3c40
......@@ -95,6 +95,7 @@
#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */
#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */
#define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */
#define X86_FEATURE_APERFMPERF (3*32+27) /* APERFMPERF */
/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
......
......@@ -27,6 +27,7 @@ struct mm_struct;
#include <linux/cpumask.h>
#include <linux/cache.h>
#include <linux/threads.h>
#include <linux/math64.h>
#include <linux/init.h>
/*
......@@ -1010,4 +1011,33 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
extern int get_tsc_mode(unsigned long adr);
extern int set_tsc_mode(unsigned int val);
struct aperfmperf {
u64 aperf, mperf;
};
static inline void get_aperfmperf(struct aperfmperf *am)
{
WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF));
rdmsrl(MSR_IA32_APERF, am->aperf);
rdmsrl(MSR_IA32_MPERF, am->mperf);
}
#define APERFMPERF_SHIFT 10
static inline
unsigned long calc_aperfmperf_ratio(struct aperfmperf *old,
struct aperfmperf *new)
{
u64 aperf = new->aperf - old->aperf;
u64 mperf = new->mperf - old->mperf;
unsigned long ratio = aperf;
mperf >>= APERFMPERF_SHIFT;
if (mperf)
ratio = div64_u64(aperf, mperf);
return ratio;
}
#endif /* _ASM_X86_PROCESSOR_H */
......@@ -13,7 +13,7 @@ CFLAGS_common.o := $(nostackp)
obj-y := intel_cacheinfo.o addon_cpuid_features.o
obj-y += proc.o capflags.o powerflags.o common.o
obj-y += vmware.o hypervisor.o
obj-y += vmware.o hypervisor.o sched.o
obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
obj-$(CONFIG_X86_64) += bugs_64.o
......
......@@ -60,7 +60,6 @@ enum {
};
#define INTEL_MSR_RANGE (0xffff)
#define CPUID_6_ECX_APERFMPERF_CAPABILITY (0x1)
struct acpi_cpufreq_data {
struct acpi_processor_performance *acpi_data;
......@@ -71,11 +70,7 @@ struct acpi_cpufreq_data {
static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
struct acpi_msr_data {
u64 saved_aperf, saved_mperf;
};
static DEFINE_PER_CPU(struct acpi_msr_data, msr_data);
static DEFINE_PER_CPU(struct aperfmperf, old_perf);
DEFINE_TRACE(power_mark);
......@@ -244,23 +239,12 @@ static u32 get_cur_val(const struct cpumask *mask)
return cmd.val;
}
struct perf_pair {
union {
struct {
u32 lo;
u32 hi;
} split;
u64 whole;
} aperf, mperf;
};
/* Called via smp_call_function_single(), on the target CPU */
static void read_measured_perf_ctrs(void *_cur)
{
struct perf_pair *cur = _cur;
struct aperfmperf *am = _cur;
rdmsr(MSR_IA32_APERF, cur->aperf.split.lo, cur->aperf.split.hi);
rdmsr(MSR_IA32_MPERF, cur->mperf.split.lo, cur->mperf.split.hi);
get_aperfmperf(am);
}
/*
......@@ -279,63 +263,17 @@ static void read_measured_perf_ctrs(void *_cur)
static unsigned int get_measured_perf(struct cpufreq_policy *policy,
unsigned int cpu)
{
struct perf_pair readin, cur;
unsigned int perf_percent;
struct aperfmperf perf;
unsigned long ratio;
unsigned int retval;
if (smp_call_function_single(cpu, read_measured_perf_ctrs, &readin, 1))
if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
return 0;
cur.aperf.whole = readin.aperf.whole -
per_cpu(msr_data, cpu).saved_aperf;
cur.mperf.whole = readin.mperf.whole -
per_cpu(msr_data, cpu).saved_mperf;
per_cpu(msr_data, cpu).saved_aperf = readin.aperf.whole;
per_cpu(msr_data, cpu).saved_mperf = readin.mperf.whole;
#ifdef __i386__
/*
* We dont want to do 64 bit divide with 32 bit kernel
* Get an approximate value. Return failure in case we cannot get
* an approximate value.
*/
if (unlikely(cur.aperf.split.hi || cur.mperf.split.hi)) {
int shift_count;
u32 h;
h = max_t(u32, cur.aperf.split.hi, cur.mperf.split.hi);
shift_count = fls(h);
cur.aperf.whole >>= shift_count;
cur.mperf.whole >>= shift_count;
}
if (((unsigned long)(-1) / 100) < cur.aperf.split.lo) {
int shift_count = 7;
cur.aperf.split.lo >>= shift_count;
cur.mperf.split.lo >>= shift_count;
}
if (cur.aperf.split.lo && cur.mperf.split.lo)
perf_percent = (cur.aperf.split.lo * 100) / cur.mperf.split.lo;
else
perf_percent = 0;
ratio = calc_aperfmperf_ratio(&per_cpu(old_perf, cpu), &perf);
per_cpu(old_perf, cpu) = perf;
#else
if (unlikely(((unsigned long)(-1) / 100) < cur.aperf.whole)) {
int shift_count = 7;
cur.aperf.whole >>= shift_count;
cur.mperf.whole >>= shift_count;
}
if (cur.aperf.whole && cur.mperf.whole)
perf_percent = (cur.aperf.whole * 100) / cur.mperf.whole;
else
perf_percent = 0;
#endif
retval = (policy->cpuinfo.max_freq * perf_percent) / 100;
retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
return retval;
}
......@@ -731,12 +669,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
acpi_processor_notify_smm(THIS_MODULE);
/* Check for APERF/MPERF support in hardware */
if (c->x86_vendor == X86_VENDOR_INTEL && c->cpuid_level >= 6) {
unsigned int ecx;
ecx = cpuid_ecx(6);
if (ecx & CPUID_6_ECX_APERFMPERF_CAPABILITY)
acpi_cpufreq_driver.getavg = get_measured_perf;
}
if (cpu_has(c, X86_FEATURE_APERFMPERF))
acpi_cpufreq_driver.getavg = get_measured_perf;
dprintk("CPU%u - ACPI performance management activated.\n", cpu);
for (i = 0; i < perf->state_count; i++)
......
......@@ -349,6 +349,12 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
}
if (c->cpuid_level > 6) {
unsigned ecx = cpuid_ecx(6);
if (ecx & 0x01)
set_cpu_cap(c, X86_FEATURE_APERFMPERF);
}
if (cpu_has_xmm2)
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
if (cpu_has_ds) {
......
#include <linux/sched.h>
#include <linux/math64.h>
#include <linux/percpu.h>
#include <linux/irqflags.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>
static DEFINE_PER_CPU(struct aperfmperf, old_aperfmperf);
static unsigned long scale_aperfmperf(void)
{
struct aperfmperf cur, val, *old = &__get_cpu_var(old_aperfmperf);
unsigned long ratio = SCHED_LOAD_SCALE;
unsigned long flags;
local_irq_save(flags);
get_aperfmperf(&val);
local_irq_restore(flags);
cur = val;
cur.aperf -= old->aperf;
cur.mperf -= old->mperf;
*old = val;
cur.mperf >>= SCHED_LOAD_SHIFT;
if (cur.mperf)
ratio = div_u64(cur.aperf, cur.mperf);
return ratio;
}
unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
{
/*
* do aperf/mperf on the cpu level because it includes things
* like turbo mode, which are relevant to full cores.
*/
if (boot_cpu_has(X86_FEATURE_APERFMPERF))
return scale_aperfmperf();
/*
* maybe have something cpufreq here
*/
return default_scale_freq_power(sd, cpu);
}
unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu)
{
/*
* aperf/mperf already includes the smt gain
*/
if (boot_cpu_has(X86_FEATURE_APERFMPERF))
return SCHED_LOAD_SCALE;
return default_scale_smt_power(sd, cpu);
}
......@@ -843,18 +843,19 @@ enum cpu_idle_type {
#define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE
#ifdef CONFIG_SMP
#define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */
#define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */
#define SD_BALANCE_EXEC 4 /* Balance on exec */
#define SD_BALANCE_FORK 8 /* Balance on fork, clone */
#define SD_WAKE_IDLE 16 /* Wake to idle CPU on task wakeup */
#define SD_WAKE_AFFINE 32 /* Wake task to waking CPU */
#define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */
#define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */
#define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */
#define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */
#define SD_SERIALIZE 1024 /* Only a single load balancing instance */
#define SD_WAKE_IDLE_FAR 2048 /* Gain latency sacrificing cache hit */
#define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */
#define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */
#define SD_BALANCE_EXEC 0x0004 /* Balance on exec */
#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
#define SD_WAKE_IDLE 0x0010 /* Wake to idle CPU on task wakeup */
#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
#define SD_WAKE_BALANCE 0x0040 /* Perform balancing at task wakeup */
#define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */
#define SD_POWERSAVINGS_BALANCE 0x0100 /* Balance for power savings */
#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
#define SD_WAKE_IDLE_FAR 0x0800 /* Gain latency sacrificing cache hit */
#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
enum powersavings_balance_level {
POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */
......@@ -874,7 +875,7 @@ static inline int sd_balance_for_mc_power(void)
if (sched_smt_power_savings)
return SD_POWERSAVINGS_BALANCE;
return 0;
return SD_PREFER_SIBLING;
}
static inline int sd_balance_for_package_power(void)
......@@ -882,7 +883,7 @@ static inline int sd_balance_for_package_power(void)
if (sched_mc_power_savings | sched_smt_power_savings)
return SD_POWERSAVINGS_BALANCE;
return 0;
return SD_PREFER_SIBLING;
}
/*
......@@ -904,15 +905,9 @@ struct sched_group {
/*
* CPU power of this group, SCHED_LOAD_SCALE being max power for a
* single CPU. This is read only (except for setup, hotplug CPU).
* Note : Never change cpu_power without recompute its reciprocal
*/
unsigned int __cpu_power;
/*
* reciprocal value of cpu_power to avoid expensive divides
* (see include/linux/reciprocal_div.h)
* single CPU.
*/
u32 reciprocal_cpu_power;
unsigned int cpu_power;
/*
* The CPUs this group covers.
......@@ -965,6 +960,7 @@ struct sched_domain {
unsigned int newidle_idx;
unsigned int wake_idx;
unsigned int forkexec_idx;
unsigned int smt_gain;
int flags; /* See SD_* */
enum sched_domain_level level;
......@@ -1051,6 +1047,10 @@ partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
}
#endif /* !CONFIG_SMP */
unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu);
unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu);
struct io_context; /* See blkdev.h */
......@@ -1913,6 +1913,7 @@ extern unsigned int sysctl_sched_child_runs_first;
extern unsigned int sysctl_sched_features;
extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate;
extern unsigned int sysctl_sched_time_avg;
extern unsigned int sysctl_timer_migration;
int sched_nr_latency_handler(struct ctl_table *table, int write,
......
......@@ -99,6 +99,7 @@ int arch_update_cpu_topology(void);
| SD_SHARE_CPUPOWER, \
.last_balance = jiffies, \
.balance_interval = 1, \
.smt_gain = 1178, /* 15% */ \
}
#endif
#endif /* CONFIG_SCHED_SMT */
......
This diff is collapsed.
This diff is collapsed.
......@@ -1040,39 +1040,58 @@ static void yield_task_fair(struct rq *rq)
se->vruntime = rightmost->vruntime + 1;
}
#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
/*
* At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
* are idle and this is not a kernel thread and this task's affinity
* allows it to be moved to preferred cpu, then just move!
*
* XXX - can generate significant overload on perferred_wakeup_cpu
* with plenty of idle cpus, leading to a significant loss in
* throughput.
*
* Returns: < 0 - no placement decision made
* >= 0 - place on cpu
*/
static int wake_idle_power_save(int cpu, struct task_struct *p)
{
int this_cpu = smp_processor_id();
int wakeup_cpu;
if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
return -1;
if (!idle_cpu(cpu) || !idle_cpu(this_cpu))
return -1;
if (!p->mm || (p->flags & PF_KTHREAD))
return -1;
wakeup_cpu = cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
if (!cpu_isset(wakeup_cpu, p->cpus_allowed))
return -1;
return wakeup_cpu;
}
/*
* wake_idle() will wake a task on an idle cpu if task->cpu is
* not idle and an idle cpu is available. The span of cpus to
* search starts with cpus closest then further out as needed,
* so we always favor a closer, idle cpu.
* Domains may include CPUs that are not usable for migration,
* hence we need to mask them out (cpu_active_mask)
*
* Returns the CPU we should wake onto.
*/
#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
static int wake_idle(int cpu, struct task_struct *p)
{
struct sched_domain *sd;
struct rq *task_rq = task_rq(p);
struct sched_domain *sd, *child = NULL;
int i;
unsigned int chosen_wakeup_cpu;
int this_cpu;
/*
* At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
* are idle and this is not a kernel thread and this task's affinity
* allows it to be moved to preferred cpu, then just move!
*/
this_cpu = smp_processor_id();
chosen_wakeup_cpu =
cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
idle_cpu(cpu) && idle_cpu(this_cpu) &&
p->mm && !(p->flags & PF_KTHREAD) &&
cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
return chosen_wakeup_cpu;
i = wake_idle_power_save(cpu, p);
if (i >= 0)
return i;
/*
* If it is idle, then it is the best cpu to run this task.
......@@ -1081,29 +1100,39 @@ static int wake_idle(int cpu, struct task_struct *p)
* Siblings must be also busy(in most cases) as they didn't already
* pickup the extra load from this cpu and hence we need not check
* sibling runqueue info. This will avoid the checks and cache miss
* penalities associated with that.
* penalties associated with that.
*/
if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
return cpu;
for_each_domain(cpu, sd) {
if ((sd->flags & SD_WAKE_IDLE)
|| ((sd->flags & SD_WAKE_IDLE_FAR)
&& !task_hot(p, task_rq(p)->clock, sd))) {
for_each_cpu_and(i, sched_domain_span(sd),
&p->cpus_allowed) {
if (cpu_active(i) && idle_cpu(i)) {
if (i != task_cpu(p)) {
schedstat_inc(p,
se.nr_wakeups_idle);
}
return i;
}
}
} else {
rcu_read_lock();
for_each_domain(cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
break;
if (!(sd->flags & SD_WAKE_IDLE) &&
(task_hot(p, task_rq->clock, sd) || !(sd->flags & SD_WAKE_IDLE_FAR)))
break;
}
}
for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
if (child && cpumask_test_cpu(i, sched_domain_span(child)))
continue;
if (!idle_cpu(i))
continue;
if (task_cpu(p) != i)
schedstat_inc(p, se.nr_wakeups_idle);
cpu = i;
goto unlock;
}
child = sd;
}
unlock:
rcu_read_unlock();
return cpu;
}
#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
......@@ -1235,7 +1264,17 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
tg = task_group(p);
weight = p->se.load.weight;
balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
/*
* In low-load situations, where prev_cpu is idle and this_cpu is idle
* due to the sync cause above having dropped tl to 0, we'll always have
* an imbalance, but there's really nothing you can do about that, so
* that's good too.
*
* Otherwise check if either cpus are near enough in load to allow this
* task to be woken on this_cpu.
*/
balanced = !tl ||
100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
/*
......
......@@ -602,6 +602,8 @@ static void update_curr_rt(struct rq *rq)
curr->se.exec_start = rq->clock;
cpuacct_charge(curr, delta_exec);
sched_rt_avg_update(rq, delta_exec);
if (!rt_bandwidth_enabled())
return;
......@@ -926,8 +928,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
inc_cpu_load(rq, p->se.load.weight);
}
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
......@@ -942,8 +942,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
dequeue_rt_entity(rt_se);
dequeue_pushable_task(rq, p);
dec_cpu_load(rq, p->se.load.weight);
}
/*
......
......@@ -330,6 +330,14 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_time_avg",
.data = &sysctl_sched_time_avg,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "timer_migration",
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment