Commit 346bf891, authored Oct 29, 2009 by Thomas Gleixner
Merge branch 'rt/head' into rt/2.6.31
Parents: 5440ba2c 465a3c40
Showing 13 changed files with 546 additions and 295 deletions (+546, -295)
arch/x86/include/asm/cpufeature.h               +1    -0
arch/x86/include/asm/processor.h                +30   -0
arch/x86/kernel/cpu/Makefile                    +1    -1
arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c      +11   -77
arch/x86/kernel/cpu/intel.c                     +6    -0
arch/x86/kernel/cpu/sched.c                     +58   -0
include/linux/sched.h                           +23   -22
include/linux/topology.h                        +1    -0
kernel/futex.c                                  +108  -73
kernel/sched.c                                  +218  -78
kernel/sched_fair.c                             +79   -40
kernel/sched_rt.c                               +2    -4
kernel/sysctl.c                                 +8    -0
arch/x86/include/asm/cpufeature.h

@@ -95,6 +95,7 @@
 #define X86_FEATURE_NONSTOP_TSC     (3*32+24) /* TSC does not stop in C states */
 #define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */
 #define X86_FEATURE_EXTD_APICID     (3*32+26) /* has extended APICID (8 bits) */
+#define X86_FEATURE_APERFMPERF      (3*32+27) /* APERFMPERF */

 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3            (4*32+ 0) /* "pni" SSE-3 */
arch/x86/include/asm/processor.h

@@ -27,6 +27,7 @@ struct mm_struct;
 #include <linux/cpumask.h>
 #include <linux/cache.h>
 #include <linux/threads.h>
+#include <linux/math64.h>
 #include <linux/init.h>

 /*

@@ -1010,4 +1011,33 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
 extern int get_tsc_mode(unsigned long adr);
 extern int set_tsc_mode(unsigned int val);

+struct aperfmperf {
+        u64 aperf, mperf;
+};
+
+static inline void get_aperfmperf(struct aperfmperf *am)
+{
+        WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF));
+
+        rdmsrl(MSR_IA32_APERF, am->aperf);
+        rdmsrl(MSR_IA32_MPERF, am->mperf);
+}
+
+#define APERFMPERF_SHIFT 10
+
+static inline
+unsigned long calc_aperfmperf_ratio(struct aperfmperf *old,
+                                    struct aperfmperf *new)
+{
+        u64 aperf = new->aperf - old->aperf;
+        u64 mperf = new->mperf - old->mperf;
+        unsigned long ratio = aperf;
+
+        mperf >>= APERFMPERF_SHIFT;
+        if (mperf)
+                ratio = div64_u64(aperf, mperf);
+
+        return ratio;
+}
+
 #endif /* _ASM_X86_PROCESSOR_H */
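For orientation: calc_aperfmperf_ratio() returns the APERF/MPERF delta as a fixed-point value with APERFMPERF_SHIFT (10) fractional bits, so later consumers only need a shift instead of a divide. A minimal stand-alone sketch of the same arithmetic, using made-up counter deltas rather than the real MSR reads (not part of this commit):

#include <stdint.h>
#include <stdio.h>

#define APERFMPERF_SHIFT 10     /* the ratio carries 10 fractional bits */

/* Same arithmetic as calc_aperfmperf_ratio(), on plain integers. */
static unsigned long calc_ratio(uint64_t aperf_delta, uint64_t mperf_delta)
{
        unsigned long ratio = aperf_delta;

        mperf_delta >>= APERFMPERF_SHIFT;
        if (mperf_delta)
                ratio = aperf_delta / mperf_delta;      /* stands in for div64_u64() */

        return ratio;
}

int main(void)
{
        /* Hypothetical deltas: the CPU ran at ~80% of its reference clock. */
        uint64_t aperf = 800000, mperf = 1000000;
        unsigned long ratio = calc_ratio(aperf, mperf);

        /* 819 / 1024 ~= 0.80, i.e. the ratio is ~0.8 in 10-bit fixed point */
        printf("ratio = %lu/%d = %.2f\n", ratio, 1 << APERFMPERF_SHIFT,
               (double)ratio / (1 << APERFMPERF_SHIFT));
        return 0;
}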
arch/x86/kernel/cpu/Makefile

@@ -13,7 +13,7 @@ CFLAGS_common.o := $(nostackp)
 obj-y                   := intel_cacheinfo.o addon_cpuid_features.o
 obj-y                   += proc.o capflags.o powerflags.o common.o
-obj-y                   += vmware.o hypervisor.o
+obj-y                   += vmware.o hypervisor.o sched.o

 obj-$(CONFIG_X86_32)    += bugs.o cmpxchg.o
 obj-$(CONFIG_X86_64)    += bugs_64.o
arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c

@@ -60,7 +60,6 @@ enum {
 };

 #define INTEL_MSR_RANGE         (0xffff)
-#define CPUID_6_ECX_APERFMPERF_CAPABILITY       (0x1)

 struct acpi_cpufreq_data {
         struct acpi_processor_performance *acpi_data;

@@ -71,11 +70,7 @@ struct acpi_cpufreq_data {
 static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);

-struct acpi_msr_data {
-        u64 saved_aperf, saved_mperf;
-};
-
-static DEFINE_PER_CPU(struct acpi_msr_data, msr_data);
+static DEFINE_PER_CPU(struct aperfmperf, old_perf);

 DEFINE_TRACE(power_mark);

@@ -244,23 +239,12 @@ static u32 get_cur_val(const struct cpumask *mask)
         return cmd.val;
 }

-struct perf_pair {
-        union {
-                struct {
-                        u32 lo;
-                        u32 hi;
-                } split;
-                u64 whole;
-        } aperf, mperf;
-};
-
 /* Called via smp_call_function_single(), on the target CPU */
 static void read_measured_perf_ctrs(void *_cur)
 {
-        struct perf_pair *cur = _cur;
+        struct aperfmperf *am = _cur;

-        rdmsr(MSR_IA32_APERF, cur->aperf.split.lo, cur->aperf.split.hi);
-        rdmsr(MSR_IA32_MPERF, cur->mperf.split.lo, cur->mperf.split.hi);
+        get_aperfmperf(am);
 }

 /*

@@ -279,63 +263,17 @@ static void read_measured_perf_ctrs(void *_cur)
 static unsigned int get_measured_perf(struct cpufreq_policy *policy,
                                       unsigned int cpu)
 {
-        struct perf_pair readin, cur;
-        unsigned int perf_percent;
+        struct aperfmperf perf;
+        unsigned long ratio;
         unsigned int retval;

-        if (smp_call_function_single(cpu, read_measured_perf_ctrs, &readin, 1))
+        if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
                 return 0;

-        cur.aperf.whole = readin.aperf.whole -
-                                per_cpu(msr_data, cpu).saved_aperf;
-        cur.mperf.whole = readin.mperf.whole -
-                                per_cpu(msr_data, cpu).saved_mperf;
-        per_cpu(msr_data, cpu).saved_aperf = readin.aperf.whole;
-        per_cpu(msr_data, cpu).saved_mperf = readin.mperf.whole;
-
-#ifdef __i386__
-        /*
-         * We dont want to do 64 bit divide with 32 bit kernel
-         * Get an approximate value. Return failure in case we cannot get
-         * an approximate value.
-         */
-        if (unlikely(cur.aperf.split.hi || cur.mperf.split.hi)) {
-                int shift_count;
-                u32 h;
-
-                h = max_t(u32, cur.aperf.split.hi, cur.mperf.split.hi);
-                shift_count = fls(h);
-
-                cur.aperf.whole >>= shift_count;
-                cur.mperf.whole >>= shift_count;
-        }
-
-        if (((unsigned long)(-1) / 100) < cur.aperf.split.lo) {
-                int shift_count = 7;
-                cur.aperf.split.lo >>= shift_count;
-                cur.mperf.split.lo >>= shift_count;
-        }
-
-        if (cur.aperf.split.lo && cur.mperf.split.lo)
-                perf_percent = (cur.aperf.split.lo * 100) / cur.mperf.split.lo;
-        else
-                perf_percent = 0;
+        ratio = calc_aperfmperf_ratio(&per_cpu(old_perf, cpu), &perf);
+        per_cpu(old_perf, cpu) = perf;

-#else
-        if (unlikely(((unsigned long)(-1) / 100) < cur.aperf.whole)) {
-                int shift_count = 7;
-                cur.aperf.whole >>= shift_count;
-                cur.mperf.whole >>= shift_count;
-        }
-
-        if (cur.aperf.whole && cur.mperf.whole)
-                perf_percent = (cur.aperf.whole * 100) / cur.mperf.whole;
-        else
-                perf_percent = 0;
-#endif
-
-        retval = (policy->cpuinfo.max_freq * perf_percent) / 100;
+        retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;

         return retval;
 }

@@ -731,12 +669,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
         acpi_processor_notify_smm(THIS_MODULE);

         /* Check for APERF/MPERF support in hardware */
-        if (c->x86_vendor == X86_VENDOR_INTEL && c->cpuid_level >= 6) {
-                unsigned int ecx;
-
-                ecx = cpuid_ecx(6);
-                if (ecx & CPUID_6_ECX_APERFMPERF_CAPABILITY)
-                        acpi_cpufreq_driver.getavg = get_measured_perf;
-        }
+        if (cpu_has(c, X86_FEATURE_APERFMPERF))
+                acpi_cpufreq_driver.getavg = get_measured_perf;

         dprintk("CPU%u - ACPI performance management activated.\n", cpu);
         for (i = 0; i < perf->state_count; i++)
arch/x86/kernel/cpu/intel.c

@@ -349,6 +349,12 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
                 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
         }

+        if (c->cpuid_level > 6) {
+                unsigned ecx = cpuid_ecx(6);
+                if (ecx & 0x01)
+                        set_cpu_cap(c, X86_FEATURE_APERFMPERF);
+        }
+
         if (cpu_has_xmm2)
                 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
         if (cpu_has_ds) {
arch/x86/kernel/cpu/sched.c (new file, 0 → 100644)

@@ -0,0 +1,58 @@
+#include <linux/sched.h>
+#include <linux/math64.h>
+#include <linux/percpu.h>
+#include <linux/irqflags.h>
+
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+
+static DEFINE_PER_CPU(struct aperfmperf, old_aperfmperf);
+
+static unsigned long scale_aperfmperf(void)
+{
+        struct aperfmperf cur, val, *old = &__get_cpu_var(old_aperfmperf);
+        unsigned long ratio = SCHED_LOAD_SCALE;
+        unsigned long flags;
+
+        local_irq_save(flags);
+        get_aperfmperf(&val);
+        local_irq_restore(flags);
+
+        cur = val;
+        cur.aperf -= old->aperf;
+        cur.mperf -= old->mperf;
+        *old = val;
+
+        cur.mperf >>= SCHED_LOAD_SHIFT;
+        if (cur.mperf)
+                ratio = div_u64(cur.aperf, cur.mperf);
+
+        return ratio;
+}
+
+unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+        /*
+         * do aperf/mperf on the cpu level because it includes things
+         * like turbo mode, which are relevant to full cores.
+         */
+        if (boot_cpu_has(X86_FEATURE_APERFMPERF))
+                return scale_aperfmperf();
+
+        /*
+         * maybe have something cpufreq here
+         */
+
+        return default_scale_freq_power(sd, cpu);
+}
+
+unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu)
+{
+        /*
+         * aperf/mperf already includes the smt gain
+         */
+        if (boot_cpu_has(X86_FEATURE_APERFMPERF))
+                return SCHED_LOAD_SCALE;
+
+        return default_scale_smt_power(sd, cpu);
+}
include/linux/sched.h

@@ -843,18 +843,19 @@ enum cpu_idle_type {
 #define SCHED_LOAD_SCALE_FUZZ   SCHED_LOAD_SCALE

 #ifdef CONFIG_SMP
-#define SD_LOAD_BALANCE         1       /* Do load balancing on this domain. */
-#define SD_BALANCE_NEWIDLE      2       /* Balance when about to become idle */
-#define SD_BALANCE_EXEC         4       /* Balance on exec */
-#define SD_BALANCE_FORK         8       /* Balance on fork, clone */
-#define SD_WAKE_IDLE            16      /* Wake to idle CPU on task wakeup */
-#define SD_WAKE_AFFINE          32      /* Wake task to waking CPU */
-#define SD_WAKE_BALANCE         64      /* Perform balancing at task wakeup */
-#define SD_SHARE_CPUPOWER       128     /* Domain members share cpu power */
-#define SD_POWERSAVINGS_BALANCE 256     /* Balance for power savings */
-#define SD_SHARE_PKG_RESOURCES  512     /* Domain members share cpu pkg resources */
-#define SD_SERIALIZE            1024    /* Only a single load balancing instance */
-#define SD_WAKE_IDLE_FAR        2048    /* Gain latency sacrificing cache hit */
+#define SD_LOAD_BALANCE         0x0001  /* Do load balancing on this domain. */
+#define SD_BALANCE_NEWIDLE      0x0002  /* Balance when about to become idle */
+#define SD_BALANCE_EXEC         0x0004  /* Balance on exec */
+#define SD_BALANCE_FORK         0x0008  /* Balance on fork, clone */
+#define SD_WAKE_IDLE            0x0010  /* Wake to idle CPU on task wakeup */
+#define SD_WAKE_AFFINE          0x0020  /* Wake task to waking CPU */
+#define SD_WAKE_BALANCE         0x0040  /* Perform balancing at task wakeup */
+#define SD_SHARE_CPUPOWER       0x0080  /* Domain members share cpu power */
+#define SD_POWERSAVINGS_BALANCE 0x0100  /* Balance for power savings */
+#define SD_SHARE_PKG_RESOURCES  0x0200  /* Domain members share cpu pkg resources */
+#define SD_SERIALIZE            0x0400  /* Only a single load balancing instance */
+#define SD_WAKE_IDLE_FAR        0x0800  /* Gain latency sacrificing cache hit */
+#define SD_PREFER_SIBLING       0x1000  /* Prefer to place tasks in a sibling domain */

 enum powersavings_balance_level {
         POWERSAVINGS_BALANCE_NONE = 0,  /* No power saving load balance */

@@ -874,7 +875,7 @@ static inline int sd_balance_for_mc_power(void)
         if (sched_smt_power_savings)
                 return SD_POWERSAVINGS_BALANCE;

-        return 0;
+        return SD_PREFER_SIBLING;
 }

 static inline int sd_balance_for_package_power(void)

@@ -882,7 +883,7 @@ static inline int sd_balance_for_package_power(void)
         if (sched_mc_power_savings | sched_smt_power_savings)
                 return SD_POWERSAVINGS_BALANCE;

-        return 0;
+        return SD_PREFER_SIBLING;
 }

 /*

@@ -904,15 +905,9 @@ struct sched_group {
         /*
          * CPU power of this group, SCHED_LOAD_SCALE being max power for a
-         * single CPU. This is read only (except for setup, hotplug CPU).
-         * Note : Never change cpu_power without recompute its reciprocal
+         * single CPU.
          */
-        unsigned int __cpu_power;
-        /*
-         * reciprocal value of cpu_power to avoid expensive divides
-         * (see include/linux/reciprocal_div.h)
-         */
-        u32 reciprocal_cpu_power;
+        unsigned int cpu_power;

         /*
          * The CPUs this group covers.

@@ -965,6 +960,7 @@ struct sched_domain {
         unsigned int newidle_idx;
         unsigned int wake_idx;
         unsigned int forkexec_idx;
+        unsigned int smt_gain;
         int flags;                      /* See SD_* */
         enum sched_domain_level level;

@@ -1051,6 +1047,10 @@ partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
 }
 #endif  /* !CONFIG_SMP */

+unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu);
+unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu);
+
 struct io_context;                      /* See blkdev.h */

@@ -1913,6 +1913,7 @@ extern unsigned int sysctl_sched_child_runs_first;
 extern unsigned int sysctl_sched_features;
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
+extern unsigned int sysctl_sched_time_avg;
 extern unsigned int sysctl_timer_migration;

 int sched_nr_latency_handler(struct ctl_table *table, int write,
include/linux/topology.h

@@ -99,6 +99,7 @@ int arch_update_cpu_topology(void);
                                 | SD_SHARE_CPUPOWER,    \
         .last_balance           = jiffies,              \
         .balance_interval       = 1,                    \
+        .smt_gain               = 1178, /* 15% */       \
 }
 #endif
 #endif /* CONFIG_SCHED_SMT */
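The 1178 value encodes the assumed 15% SMT yield in SCHED_LOAD_SCALE fixed point; this assumes SCHED_LOAD_SCALE is 1 << 10 == 1024, as it is in this tree. A one-line check of the figure, as a sketch rather than anything from the commit:

#include <stdio.h>

int main(void)
{
        /* SCHED_LOAD_SCALE (1024) scaled up by 15% gives ~1177.6, rounded to 1178. */
        printf("1024 * 1.15 = %.1f\n", 1024 * 1.15);
        return 0;
}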
kernel/futex.c

@@ -89,33 +89,36 @@ struct futex_pi_state {
         union futex_key key;
 };

-/*
- * We use this hashed waitqueue instead of a normal wait_queue_t, so
+/**
+ * struct futex_q - The hashed futex queue entry, one per waiting task
+ * @task:               the task waiting on the futex
+ * @lock_ptr:           the hash bucket lock
+ * @key:                the key the futex is hashed on
+ * @pi_state:           optional priority inheritance state
+ * @rt_waiter:          rt_waiter storage for use with requeue_pi
+ * @requeue_pi_key:     the requeue_pi target futex key
+ * @bitset:             bitset for the optional bitmasked wakeup
+ *
+ * We use this hashed waitqueue, instead of a normal wait_queue_t, so
  * we can wake only the relevant ones (hashed queues may be shared).
  *
  * A futex_q has a woken state, just like tasks have TASK_RUNNING.
  * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
  * The order of wakup is always to make the first condition true, then
- * wake up q->waiter, then make the second condition true.
+ * the second.
+ *
+ * PI futexes are typically woken before they are removed from the hash list via
+ * the rt_mutex code. See unqueue_me_pi().
  */
 struct futex_q {
         struct plist_node list;
-        /* Waiter reference */
-        struct task_struct *task;

-        /* Which hash list lock to use: */
+        struct task_struct *task;
         spinlock_t *lock_ptr;
-
-        /* Key which the futex is hashed on: */
         union futex_key key;
-
-        /* Optional priority inheritance state: */
         struct futex_pi_state *pi_state;
-
-        /* rt_waiter storage for requeue_pi: */
         struct rt_mutex_waiter *rt_waiter;
-
-        /* Bitset for the optional bitmasked wakeup */
+        union futex_key *requeue_pi_key;
         u32 bitset;
 };

@@ -147,7 +150,8 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
  */
 static inline int match_futex(union futex_key *key1, union futex_key *key2)
 {
-        return (key1->both.word == key2->both.word
+        return (key1 && key2
+                && key1->both.word == key2->both.word
                 && key1->both.ptr == key2->both.ptr
                 && key1->both.offset == key2->both.offset);
 }

@@ -195,11 +199,12 @@ static void drop_futex_key_refs(union futex_key *key)
 }

 /**
- * get_futex_key - Get parameters which are the keys for a futex.
- * @uaddr: virtual address of the futex
- * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
- * @key: address where result is stored.
- * @rw: mapping needs to be read/write (values: VERIFY_READ, VERIFY_WRITE)
+ * get_futex_key() - Get parameters which are the keys for a futex
+ * @uaddr:      virtual address of the futex
+ * @fshared:    0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
+ * @key:        address where result is stored.
+ * @rw:         mapping needs to be read/write (values: VERIFY_READ,
+ *              VERIFY_WRITE)
  *
  * Returns a negative error code or 0
  * The key words are stored in *key on success.

@@ -285,8 +290,8 @@ void put_futex_key(int fshared, union futex_key *key)
         drop_futex_key_refs(key);
 }

-/*
- * fault_in_user_writeable - fault in user address and verify RW access
+/**
+ * fault_in_user_writeable() - Fault in user address and verify RW access
  * @uaddr:      pointer to faulting user space address
  *
  * Slow path to fixup the fault we just took in the atomic write

@@ -306,8 +311,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
 /**
  * futex_top_waiter() - Return the highest priority waiter on a futex
- * @hb:     the hash bucket the futex_q's reside in
- * @key:    the futex key (to distinguish it from other futex futex_q's)
+ * @hb:         the hash bucket the futex_q's reside in
+ * @key:        the futex key (to distinguish it from other futex futex_q's)
  *
  * Must be called with the hb lock held.
  */

@@ -585,7 +590,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 }

 /**
- * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex
+ * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
  * @uaddr:              the pi futex user address
  * @hb:                 the pi futex hash bucket
  * @key:                the futex key associated with uaddr and hb

@@ -1008,9 +1013,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
 /**
  * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
- * q:   the futex_q
- * key: the key of the requeue target futex
- * hb:  the hash_bucket of the requeue target futex
+ * @q:          the futex_q
+ * @key:        the key of the requeue target futex
+ * @hb:         the hash_bucket of the requeue target futex
  *
  * During futex_requeue, with requeue_pi=1, it is possible to acquire the
  * target futex if it is uncontended or via a lock steal.  Set the futex_q key

@@ -1024,7 +1029,6 @@ static inline
 void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
                            struct futex_hash_bucket *hb)
 {
-        drop_futex_key_refs(&q->key);
         get_futex_key_refs(key);
         q->key = *key;

@@ -1089,6 +1093,10 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
         if (!top_waiter)
                 return 0;

+        /* Ensure we requeue to the expected futex. */
+        if (!match_futex(top_waiter->requeue_pi_key, key2))
+                return -EINVAL;
+
         /*
          * Try to take the lock for top_waiter.  Set the FUTEX_WAITERS bit in
          * the contended case or if set_waiters is 1.  The pi_state is returned

@@ -1218,6 +1226,7 @@ retry_private:
          */
         if (ret == 1) {
                 WARN_ON(pi_state);
+                drop_count++;
                 task_count++;
                 ret = get_futex_value_locked(&curval2, uaddr2);
                 if (!ret)

@@ -1276,6 +1285,12 @@ retry_private:
                         continue;
                 }

+                /* Ensure we requeue to the expected futex for requeue_pi. */
+                if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
+                        ret = -EINVAL;
+                        break;
+                }
+
                 /*
                  * Requeue nr_requeue waiters and possibly one more in the case
                  * of requeue_pi if we couldn't acquire the lock atomically.

@@ -1290,6 +1305,7 @@ retry_private:
                         if (ret == 1) {
                                 /* We got the lock. */
                                 requeue_pi_wake_futex(this, &key2, hb2);
+                                drop_count++;
                                 continue;
                         } else if (ret) {
                                 /* -EDEADLK */

@@ -1337,6 +1353,25 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
         return hb;
 }

+static inline void
+queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
+{
+        spin_unlock(&hb->lock);
+        drop_futex_key_refs(&q->key);
+}
+
+/**
+ * queue_me() - Enqueue the futex_q on the futex_hash_bucket
+ * @q:  The futex_q to enqueue
+ * @hb: The destination hash bucket
+ *
+ * The hb->lock must be held by the caller, and is released here. A call to
+ * queue_me() is typically paired with exactly one call to unqueue_me().  The
+ * exceptions involve the PI related operations, which may use unqueue_me_pi()
+ * or nothing if the unqueue is done as part of the wake process and the unqueue
+ * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
+ * an example).
+ */
 static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
 {
         int prio;

@@ -1360,19 +1395,17 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
         spin_unlock(&hb->lock);
 }

-static inline void
-queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
-{
-        spin_unlock(&hb->lock);
-        drop_futex_key_refs(&q->key);
-}
-
-/*
- * queue_me and unqueue_me must be called as a pair, each
- * exactly once.  They are called with the hashed spinlock held.
+/**
+ * unqueue_me() - Remove the futex_q from its futex_hash_bucket
+ * @q:  The futex_q to unqueue
+ *
+ * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
+ * be paired with exactly one earlier call to queue_me().
+ *
+ * Returns:
+ *   1 - if the futex_q was still queued (and we removed unqueued it)
+ *   0 - if the futex_q was already removed by the waking thread
  */
-
-/* Return 1 if we were still queued (ie. 0 means we were woken) */
 static int unqueue_me(struct futex_q *q)
 {
         spinlock_t *lock_ptr;

@@ -1625,6 +1658,12 @@ out:
 static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
                                 struct hrtimer_sleeper *timeout)
 {
+        /*
+         * The task state is guaranteed to be set before another task can
+         * wake it. set_current_state() is implemented using set_mb() and
+         * queue_me() calls spin_unlock() upon completion, both serializing
+         * access to the hash list and forcing another memory barrier.
+         */
         set_current_state(TASK_INTERRUPTIBLE);
         queue_me(q, hb);

@@ -1742,6 +1781,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
         q.pi_state = NULL;
         q.bitset = bitset;
         q.rt_waiter = NULL;
+        q.requeue_pi_key = NULL;

         if (abs_time) {
                 to = &timeout;

@@ -1855,6 +1895,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
         q.pi_state = NULL;
         q.rt_waiter = NULL;
+        q.requeue_pi_key = NULL;
 retry:
         q.key = FUTEX_KEY_INIT;
         ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);

@@ -2086,7 +2127,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
                 plist_del(&q->list, &q->list.plist);

                 /* Handle spurious wakeups gracefully */
-                ret = -EAGAIN;
+                ret = -EWOULDBLOCK;
                 if (timeout && !timeout->task)
                         ret = -ETIMEDOUT;
                 else if (signal_pending(current))

@@ -2097,12 +2138,12 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
 /**
  * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
- * @uaddr:      the futex we initialyl wait on (non-pi)
+ * @uaddr:      the futex we initially wait on (non-pi)
  * @fshared:    whether the futexes are shared (1) or not (0).  They must be
  *              the same type, no requeueing from private to shared, etc.
  * @val:        the expected value of uaddr
  * @abs_time:   absolute timeout
- * @bitset:     32 bit wakeup bitset set by userspace, defaults to all.
+ * @bitset:     32 bit wakeup bitset set by userspace, defaults to all
  * @clockrt:    whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
  * @uaddr2:     the pi futex we will take prior to returning to user-space
  *

@@ -2116,11 +2157,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
  * We call schedule in futex_wait_queue_me() when we enqueue and return there
  * via the following:
  * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
- * 2) wakeup on uaddr2 after a requeue and subsequent unlock
- * 3) signal (before or after requeue)
- * 4) timeout (before or after requeue)
+ * 2) wakeup on uaddr2 after a requeue
+ * 3) signal
+ * 4) timeout
  *
- * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function.
+ * If 3, cleanup and return -ERESTARTNOINTR.
  *
  * If 2, we may then block on trying to take the rt_mutex and return via:
  * 5) successful lock

@@ -2128,7 +2169,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
  * 7) timeout
  * 8) other lock acquisition failure
  *
- * If 6, we setup a restart_block with futex_lock_pi() as the function.
+ * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
  *
  * If 4 or 7, we cleanup and return with -ETIMEDOUT.
  *

@@ -2167,16 +2208,16 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
         debug_rt_mutex_init_waiter(&rt_waiter);
         rt_waiter.task = NULL;

-        q.pi_state = NULL;
-        q.bitset = bitset;
-        q.rt_waiter = &rt_waiter;
-
 retry:
         key2 = FUTEX_KEY_INIT;
         ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
         if (unlikely(ret != 0))
                 goto out;

+        q.pi_state = NULL;
+        q.bitset = bitset;
+        q.rt_waiter = &rt_waiter;
+        q.requeue_pi_key = &key2;
+
         /* Prepare to wait on uaddr. */
         ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
         if (ret)

@@ -2229,7 +2270,7 @@ retry:
                 res = fixup_owner(uaddr2, fshared, &q, !ret);
                 /*
                  * If fixup_owner() returned an error, proprogate that.  If it
-                 * acquired the lock, clear our -ETIMEDOUT or -EINTR.
+                 * acquired the lock, clear -ETIMEDOUT or -EINTR.
                  */
                 if (res)
                         ret = (res < 0) ? res : 0;

@@ -2247,14 +2288,11 @@ retry:
                         rt_mutex_unlock(pi_mutex);
         } else if (ret == -EINTR) {
                 /*
-                 * We've already been requeued, but we have no way to
-                 * restart by calling futex_lock_pi() directly. We
-                 * could restart the syscall, but that will look at
-                 * the user space value and return right away. So we
-                 * drop back with EWOULDBLOCK to tell user space that
-                 * "val" has been changed. That's the same what the
-                 * restart of the syscall would do in
-                 * futex_wait_setup().
+                 * We've already been requeued, but cannot restart by calling
+                 * futex_lock_pi() directly. We could restart this syscall, but
+                 * it would detect that the user space "val" changed and return
+                 * -EWOULDBLOCK.  Save the overhead of the restart and return
+                 * -EWOULDBLOCK directly.
                  */
                 ret = -EWOULDBLOCK;
         }

@@ -2264,9 +2302,6 @@ out_put_keys:
 out_key2:
         put_futex_key(fshared, &key2);

-        /* Spurious wakeup ? */
-        if (ret == -EAGAIN)
-                goto retry;
-
 out:
         if (to) {
                 hrtimer_cancel(&to->timer);

@@ -2291,9 +2326,9 @@ out:
  */

 /**
- * sys_set_robust_list - set the robust-futex list head of a task
- * @head: pointer to the list-head
- * @len: length of the list-head, as userspace expects
+ * sys_set_robust_list() - Set the robust-futex list head of a task
+ * @head:       pointer to the list-head
+ * @len:        length of the list-head, as userspace expects
  */
 SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
                 size_t, len)

@@ -2312,10 +2347,10 @@ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
 }

 /**
- * sys_get_robust_list - get the robust-futex list head of a task
- * @pid: pid of the process [zero for current task]
- * @head_ptr: pointer to a list-head pointer, the kernel fills it in
- * @len_ptr: pointer to a length field, the kernel fills in the header size
+ * sys_get_robust_list() - Get the robust-futex list head of a task
+ * @pid:        pid of the process [zero for current task]
+ * @head_ptr:   pointer to a list-head pointer, the kernel fills it in
+ * @len_ptr:    pointer to a length field, the kernel fills in the header size
  */
 SYSCALL_DEFINE3(get_robust_list, int, pid,
                 struct robust_list_head __user * __user *, head_ptr,
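The comments above lean on the futex(2) contract that a waiter only blocks if the futex word still holds the expected value, and otherwise comes back with -EWOULDBLOCK/-EAGAIN. A minimal user-space illustration of that contract, not part of this commit and independent of the requeue_pi machinery:

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>
#include <stdio.h>

static long futex_wait(uint32_t *uaddr, uint32_t val)
{
        /* Blocks only if *uaddr still equals val, mirroring futex_wait_setup(). */
        return syscall(SYS_futex, uaddr, FUTEX_WAIT, val, NULL, NULL, 0);
}

static long futex_wake(uint32_t *uaddr, int nr)
{
        return syscall(SYS_futex, uaddr, FUTEX_WAKE, nr, NULL, NULL, 0);
}

int main(void)
{
        uint32_t word = 1;

        /* The value already changed from 0 to 1, so the wait fails immediately. */
        if (futex_wait(&word, 0) == -1)
                perror("futex_wait");   /* expected: EAGAIN ("Resource temporarily unavailable") */

        futex_wake(&word, 1);           /* no waiters; returns 0 */
        return 0;
}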
kernel/sched.c

@@ -137,30 +137,8 @@
  */
 #define RUNTIME_INF     ((u64)~0ULL)

 #ifdef CONFIG_SMP
 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
-
-/*
- * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
- * Since cpu_power is a 'constant', we can use a reciprocal divide.
- */
-static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
-{
-        return reciprocal_divide(load, sg->reciprocal_cpu_power);
-}
-
-/*
- * Each time a sched group cpu_power is changed,
- * we must compute its reciprocal value
- */
-static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
-{
-        sg->__cpu_power += val;
-        sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
-}
 #endif

 #define TASK_PREEMPTS_CURR(p, rq) \
         ((p)->prio < (rq)->curr->prio)

@@ -673,6 +651,9 @@ struct rq {
         struct task_struct *migration_thread;
         struct list_head migration_queue;
+
+        u64 rt_avg;
+        u64 age_stamp;
 #endif

         /* calc_load related fields */

@@ -926,6 +907,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000;
  */
 unsigned int sysctl_sched_shares_thresh = 4;

+/*
+ * period over which we average the RT time consumption, measured
+ * in ms.
+ *
+ * default: 1s
+ */
+const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
+
 /*
  * period over which we measure -rt task cpu usage in us.
  * default: 1s

@@ -1370,12 +1359,37 @@ void wake_up_idle_cpu(int cpu)
 }
 #endif /* CONFIG_NO_HZ */

+static u64 sched_avg_period(void)
+{
+        return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
+}
+
+static void sched_avg_update(struct rq *rq)
+{
+        s64 period = sched_avg_period();
+
+        while ((s64)(rq->clock - rq->age_stamp) > period) {
+                rq->age_stamp += period;
+                rq->rt_avg /= 2;
+        }
+}
+
+static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
+{
+        rq->rt_avg += rt_delta;
+        sched_avg_update(rq);
+}
+
 #else /* !CONFIG_SMP */
 static void resched_task(struct task_struct *p)
 {
         assert_atomic_spin_locked(&task_rq(p)->lock);
         set_tsk_need_resched(p);
 }
+
+static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
+{
+}
 #endif /* CONFIG_SMP */

 #if BITS_PER_LONG == 32

@@ -2365,8 +2379,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
                 }

                 /* Adjust by relative CPU power of the group */
-                avg_load = sg_div_cpu_power(group,
-                                avg_load * SCHED_LOAD_SCALE);
+                avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;

                 if (local_group) {
                         this_load = avg_load;

@@ -3713,7 +3726,7 @@ static inline void update_sd_power_savings_stats(struct sched_group *group,
          * capacity but still has some space to pick up some load
          * from other group and save more power
          */
-        if (sgs->sum_nr_running > sgs->group_capacity - 1)
+        if (sgs->sum_nr_running + 1 > sgs->group_capacity)
                 return;

         if (sgs->sum_nr_running > sds->leader_nr_running ||

@@ -3781,6 +3794,94 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */

+unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+        return SCHED_LOAD_SCALE;
+}
+
+unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+        return default_scale_freq_power(sd, cpu);
+}
+
+unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
+{
+        unsigned long weight = cpumask_weight(sched_domain_span(sd));
+        unsigned long smt_gain = sd->smt_gain;
+
+        smt_gain /= weight;
+
+        return smt_gain;
+}
+
+unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
+{
+        return default_scale_smt_power(sd, cpu);
+}
+
+unsigned long scale_rt_power(int cpu)
+{
+        struct rq *rq = cpu_rq(cpu);
+        u64 total, available;
+
+        sched_avg_update(rq);
+
+        total = sched_avg_period() + (rq->clock - rq->age_stamp);
+        available = total - rq->rt_avg;
+
+        if (unlikely((s64)total < SCHED_LOAD_SCALE))
+                total = SCHED_LOAD_SCALE;
+
+        total >>= SCHED_LOAD_SHIFT;
+
+        return div_u64(available, total);
+}
+
+static void update_cpu_power(struct sched_domain *sd, int cpu)
+{
+        unsigned long weight = cpumask_weight(sched_domain_span(sd));
+        unsigned long power = SCHED_LOAD_SCALE;
+        struct sched_group *sdg = sd->groups;
+
+        power *= arch_scale_freq_power(sd, cpu);
+        power >>= SCHED_LOAD_SHIFT;
+
+        if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
+                power *= arch_scale_smt_power(sd, cpu);
+                power >>= SCHED_LOAD_SHIFT;
+        }
+
+        power *= scale_rt_power(cpu);
+        power >>= SCHED_LOAD_SHIFT;
+
+        if (!power)
+                power = 1;
+
+        sdg->cpu_power = power;
+}
+
+static void update_group_power(struct sched_domain *sd, int cpu)
+{
+        struct sched_domain *child = sd->child;
+        struct sched_group *group, *sdg = sd->groups;
+        unsigned long power;
+
+        if (!child) {
+                update_cpu_power(sd, cpu);
+                return;
+        }
+
+        power = 0;
+
+        group = child->groups;
+        do {
+                power += group->cpu_power;
+                group = group->next;
+        } while (group != child->groups);
+
+        sdg->cpu_power = power;
+}
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @group: sched_group whose statistics are to be updated.

@@ -3793,7 +3894,8 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
  * @balance: Should we balance.
  * @sgs: variable to hold the statistics for this group.
  */
-static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
+static inline void update_sg_lb_stats(struct sched_domain *sd,
+                        struct sched_group *group, int this_cpu,
                         enum cpu_idle_type idle, int load_idx, int *sd_idle,
                         int local_group, const struct cpumask *cpus,
                         int *balance, struct sg_lb_stats *sgs)

@@ -3804,8 +3906,11 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
         unsigned long sum_avg_load_per_task;
         unsigned long avg_load_per_task;

-        if (local_group)
+        if (local_group) {
                 balance_cpu = group_first_cpu(group);
+                if (balance_cpu == this_cpu)
+                        update_group_power(sd, this_cpu);
+        }

         /* Tally up the load of all CPUs in the group */
         sum_avg_load_per_task = avg_load_per_task = 0;

@@ -3854,8 +3959,7 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
         }

         /* Adjust by relative CPU power of the group */
-        sgs->avg_load = sg_div_cpu_power(group,
-                        sgs->group_load * SCHED_LOAD_SCALE);
+        sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;

         /*

@@ -3867,14 +3971,14 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
          * normalized nr_running number somewhere that negates
          * the hierarchy?
          */
-        avg_load_per_task = sg_div_cpu_power(group,
-                        sum_avg_load_per_task * SCHED_LOAD_SCALE);
+        avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
+                group->cpu_power;

         if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
                 sgs->group_imb = 1;

-        sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
+        sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
 }

@@ -3892,9 +3996,13 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
                         const struct cpumask *cpus, int *balance,
                         struct sd_lb_stats *sds)
 {
+        struct sched_domain *child = sd->child;
         struct sched_group *group = sd->groups;
         struct sg_lb_stats sgs;
-        int load_idx;
+        int load_idx, prefer_sibling = 0;
+
+        if (child && child->flags & SD_PREFER_SIBLING)
+                prefer_sibling = 1;

         init_sd_power_savings_stats(sd, sds, idle);
         load_idx = get_sd_load_idx(sd, idle);

@@ -3905,14 +4013,22 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
                 local_group = cpumask_test_cpu(this_cpu,
                                                sched_group_cpus(group));
                 memset(&sgs, 0, sizeof(sgs));
-                update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
+                update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
                                    local_group, cpus, balance, &sgs);

                 if (local_group && balance && !(*balance))
                         return;

                 sds->total_load += sgs.group_load;
-                sds->total_pwr += group->__cpu_power;
+                sds->total_pwr += group->cpu_power;
+
+                /*
+                 * In case the child domain prefers tasks go to siblings
+                 * first, lower the group capacity to one so that we'll try
+                 * and move all the excess tasks away.
+                 */
+                if (prefer_sibling)
+                        sgs.group_capacity = min(sgs.group_capacity, 1UL);

                 if (local_group) {
                         sds->this_load = sgs.avg_load;

@@ -3932,7 +4048,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
                 update_sd_power_savings_stats(group, sds, local_group, &sgs);
                 group = group->next;
         } while (group != sd->groups);
-
 }

@@ -3970,28 +4085,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
          * moving them.
          */

-        pwr_now += sds->busiest->__cpu_power *
+        pwr_now += sds->busiest->cpu_power *
                         min(sds->busiest_load_per_task, sds->max_load);
-        pwr_now += sds->this->__cpu_power *
+        pwr_now += sds->this->cpu_power *
                         min(sds->this_load_per_task, sds->this_load);
         pwr_now /= SCHED_LOAD_SCALE;

         /* Amount of load we'd subtract */
-        tmp = sg_div_cpu_power(sds->busiest,
-                        sds->busiest_load_per_task * SCHED_LOAD_SCALE);
+        tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
+                sds->busiest->cpu_power;
         if (sds->max_load > tmp)
-                pwr_move += sds->busiest->__cpu_power *
+                pwr_move += sds->busiest->cpu_power *
                         min(sds->busiest_load_per_task, sds->max_load - tmp);

         /* Amount of load we'd add */
-        if (sds->max_load * sds->busiest->__cpu_power <
+        if (sds->max_load * sds->busiest->cpu_power <
                 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
-                tmp = sg_div_cpu_power(sds->this,
-                        sds->max_load * sds->busiest->__cpu_power);
+                tmp = (sds->max_load * sds->busiest->cpu_power) /
+                        sds->this->cpu_power;
         else
-                tmp = sg_div_cpu_power(sds->this,
-                        sds->busiest_load_per_task * SCHED_LOAD_SCALE);
-        pwr_move += sds->this->__cpu_power *
+                tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
+                        sds->this->cpu_power;
+        pwr_move += sds->this->cpu_power *
                         min(sds->this_load_per_task, sds->this_load + tmp);
         pwr_move /= SCHED_LOAD_SCALE;

@@ -4026,8 +4141,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
                         sds->max_load - sds->busiest_load_per_task);

         /* How much load to actually move to equalise the imbalance */
-        *imbalance = min(max_pull * sds->busiest->__cpu_power,
-                (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
+        *imbalance = min(max_pull * sds->busiest->cpu_power,
+                (sds->avg_load - sds->this_load) * sds->this->cpu_power)
                         / SCHED_LOAD_SCALE;

         /*

@@ -4145,6 +4260,26 @@ ret:
         return NULL;
 }

+static struct sched_group *group_of(int cpu)
+{
+        struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
+
+        if (!sd)
+                return NULL;
+
+        return sd->groups;
+}
+
+static unsigned long power_of(int cpu)
+{
+        struct sched_group *group = group_of(cpu);
+
+        if (!group)
+                return SCHED_LOAD_SCALE;
+
+        return group->cpu_power;
+}
+
 /*
  * find_busiest_queue - find the busiest runqueue among the cpus in group.
  */

@@ -4157,15 +4292,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
         int i;

         for_each_cpu(i, sched_group_cpus(group)) {
+                unsigned long power = power_of(i);
+                unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
                 unsigned long wl;

                 if (!cpumask_test_cpu(i, cpus))
                         continue;

                 rq = cpu_rq(i);
-                wl = weighted_cpuload(i);
+                wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
+                wl /= power;

-                if (rq->nr_running == 1 && wl > imbalance)
+                if (capacity && rq->nr_running == 1 && wl > imbalance)
                         continue;

                 if (wl > max_load) {

@@ -8076,7 +8214,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                         break;
                 }

-                if (!group->__cpu_power) {
+                if (!group->cpu_power) {
                         printk(KERN_CONT "\n");
                         printk(KERN_ERR "ERROR: domain->cpu_power not "
                                         "set\n");

@@ -8100,9 +8238,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
                 printk(KERN_CONT " %s", str);
-                if (group->__cpu_power != SCHED_LOAD_SCALE) {
-                        printk(KERN_CONT " (__cpu_power = %d)",
-                                group->__cpu_power);
+                if (group->cpu_power != SCHED_LOAD_SCALE) {
+                        printk(KERN_CONT " (cpu_power = %d)",
+                                group->cpu_power);
                 }

                 group = group->next;

@@ -8387,7 +8525,7 @@ init_sched_build_groups(const struct cpumask *span,
                         continue;

                 cpumask_clear(sched_group_cpus(sg));
-                sg->__cpu_power = 0;
+                sg->cpu_power = 0;

                 for_each_cpu(j, span) {
                         if (group_fn(j, cpu_map, NULL, tmpmask) != group)

@@ -8612,7 +8750,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
                                 continue;
                         }

-                        sg_inc_cpu_power(sg, sd->groups->__cpu_power);
+                        sg->cpu_power += sd->groups->cpu_power;
                 }
                 sg = sg->next;
         } while (sg != group_head);

@@ -8670,15 +8808,13 @@ static void free_sched_groups(const struct cpumask *cpu_map,
  * there are asymmetries in the topology. If there are asymmetries, group
  * having more cpu_power will pickup more load compared to the group having
  * less cpu_power.
- *
- * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
- * the maximum number of tasks a group can handle in the presence of other idle
- * or lightly loaded groups in the same sched domain.
  */
 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 {
         struct sched_domain *child;
         struct sched_group *group;
+        long power;
+        int weight;

         WARN_ON(!sd || !sd->groups);

@@ -8687,28 +8823,32 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
         child = sd->child;

-        sd->groups->__cpu_power = 0;
+        sd->groups->cpu_power = 0;

-        /*
-         * For perf policy, if the groups in child domain share resources
-         * (for example cores sharing some portions of the cache hierarchy
-         * or SMT), then set this domain groups cpu_power such that each group
-         * can handle only one task, when there are other idle groups in the
-         * same sched domain.
-         */
-        if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
-                       (child->flags &
-                        (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
-                sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
+        if (!child) {
+                power = SCHED_LOAD_SCALE;
+                weight = cpumask_weight(sched_domain_span(sd));
+                /*
+                 * SMT siblings share the power of a single core.
+                 * Usually multiple threads get a better yield out of
+                 * that one core than a single thread would have,
+                 * reflect that in sd->smt_gain.
+                 */
+                if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
+                        power *= sd->smt_gain;
+                        power /= weight;
+                        power >>= SCHED_LOAD_SHIFT;
+                }
+                sd->groups->cpu_power += power;
                 return;
         }

         /*
-         * add cpu_power of each child group to this groups cpu_power
+         * Add cpu_power of each child group to this groups cpu_power.
          */
         group = child->groups;
         do {
-                sg_inc_cpu_power(sd->groups, group->__cpu_power);
+                sd->groups->cpu_power += group->cpu_power;
                 group = group->next;
         } while (group != child->groups);
 }

@@ -8981,7 +9121,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
                         sd = &per_cpu(node_domains, j).sd;
                         sd->groups = sg;
                 }
-                sg->__cpu_power = 0;
+                sg->cpu_power = 0;
                 cpumask_copy(sched_group_cpus(sg), nodemask);
                 sg->next = sg;
                 cpumask_or(covered, covered, nodemask);

@@ -9008,7 +9148,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
                                 "Can not alloc domain group for node %d\n", j);
                                 goto error;
                         }
-                        sg->__cpu_power = 0;
+                        sg->cpu_power = 0;
                         cpumask_copy(sched_group_cpus(sg), tmpmask);
                         sg->next = prev->next;
                         cpumask_or(covered, covered, tmpmask);
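The new update_cpu_power() path multiplies several SCHED_LOAD_SCALE fixed-point factors together (frequency via arch_scale_freq_power(), SMT via arch_scale_smt_power(), and RT-time availability via scale_rt_power()), shifting by SCHED_LOAD_SHIFT after each step. A small stand-alone sketch of that arithmetic, with made-up factor values rather than the kernel's real inputs:

#include <stdio.h>

#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)      /* 1024 == "one full CPU" */

/* Mimics the update_cpu_power() chain: each factor is a 10-bit fixed-point fraction. */
static unsigned long combine_power(unsigned long freq, unsigned long smt, unsigned long rt)
{
        unsigned long power = SCHED_LOAD_SCALE;

        power *= freq;  power >>= SCHED_LOAD_SHIFT;     /* arch_scale_freq_power() */
        power *= smt;   power >>= SCHED_LOAD_SHIFT;     /* arch_scale_smt_power()  */
        power *= rt;    power >>= SCHED_LOAD_SHIFT;     /* scale_rt_power()        */

        return power ? power : 1;       /* never report a zero-power CPU */
}

int main(void)
{
        /* Hypothetical: ~80% effective frequency, ~57% SMT share, ~90% time left after RT. */
        unsigned long power = combine_power(819, 589, 921);

        printf("cpu_power = %lu (%.0f%% of SCHED_LOAD_SCALE)\n",
               power, 100.0 * power / SCHED_LOAD_SCALE);
        return 0;
}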
kernel/sched_fair.c

@@ -1040,39 +1040,58 @@ static void yield_task_fair(struct rq *rq)
         se->vruntime = rightmost->vruntime + 1;
 }

+#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
+/*
+ * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
+ * are idle and this is not a kernel thread and this task's affinity
+ * allows it to be moved to preferred cpu, then just move!
+ *
+ * XXX - can generate significant overload on perferred_wakeup_cpu
+ *       with plenty of idle cpus, leading to a significant loss in
+ *       throughput.
+ *
+ * Returns: < 0 - no placement decision made
+ *          >= 0 - place on cpu
+ */
+static int wake_idle_power_save(int cpu, struct task_struct *p)
+{
+        int this_cpu = smp_processor_id();
+        int wakeup_cpu;
+
+        if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
+                return -1;
+
+        if (!idle_cpu(cpu) || !idle_cpu(this_cpu))
+                return -1;
+
+        if (!p->mm || (p->flags & PF_KTHREAD))
+                return -1;
+
+        wakeup_cpu = cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
+
+        if (!cpu_isset(wakeup_cpu, p->cpus_allowed))
+                return -1;
+
+        return wakeup_cpu;
+}
+
 /*
  * wake_idle() will wake a task on an idle cpu if task->cpu is
  * not idle and an idle cpu is available. The span of cpus to
  * search starts with cpus closest then further out as needed,
  * so we always favor a closer, idle cpu.
  * Domains may include CPUs that are not usable for migration,
  * hence we need to mask them out (cpu_active_mask)
  *
  * Returns the CPU we should wake onto.
  */
-#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
 static int wake_idle(int cpu, struct task_struct *p)
 {
-        struct sched_domain *sd;
+        struct rq *task_rq = task_rq(p);
+        struct sched_domain *sd, *child = NULL;
         int i;
-        unsigned int chosen_wakeup_cpu;
-        int this_cpu;

-        /*
-         * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
-         * are idle and this is not a kernel thread and this task's affinity
-         * allows it to be moved to preferred cpu, then just move!
-         */
-        this_cpu = smp_processor_id();
-        chosen_wakeup_cpu =
-                cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
-
-        if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
-                idle_cpu(cpu) && idle_cpu(this_cpu) &&
-                p->mm && !(p->flags & PF_KTHREAD) &&
-                cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
-                return chosen_wakeup_cpu;
+        i = wake_idle_power_save(cpu, p);
+        if (i >= 0)
+                return i;

         /*
          * If it is idle, then it is the best cpu to run this task.

@@ -1081,29 +1100,39 @@ static int wake_idle(int cpu, struct task_struct *p)
          * Siblings must be also busy(in most cases) as they didn't already
          * pickup the extra load from this cpu and hence we need not check
          * sibling runqueue info. This will avoid the checks and cache miss
-         * penalities associated with that.
+         * penalties associated with that.
          */
         if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
                 return cpu;

+        rcu_read_lock();
         for_each_domain(cpu, sd) {
-                if ((sd->flags & SD_WAKE_IDLE)
-                    || ((sd->flags & SD_WAKE_IDLE_FAR)
-                        && !task_hot(p, task_rq(p)->clock, sd))) {
-                        for_each_cpu_and(i, sched_domain_span(sd),
-                                         &p->cpus_allowed) {
-                                if (cpu_active(i) && idle_cpu(i)) {
-                                        if (i != task_cpu(p)) {
-                                                schedstat_inc(p,
-                                                       se.nr_wakeups_idle);
-                                        }
-                                        return i;
-                                }
-                        }
-                } else {
+                if (!(sd->flags & SD_LOAD_BALANCE))
                         break;
+
+                if (!(sd->flags & SD_WAKE_IDLE) &&
+                    (task_hot(p, task_rq->clock, sd) || !(sd->flags & SD_WAKE_IDLE_FAR)))
+                        break;
+
+                for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
+                        if (child && cpumask_test_cpu(i, sched_domain_span(child)))
+                                continue;
+
+                        if (!idle_cpu(i))
+                                continue;
+
+                        if (task_cpu(p) != i)
+                                schedstat_inc(p, se.nr_wakeups_idle);
+
+                        cpu = i;
+                        goto unlock;
                 }
+
+                child = sd;
         }
+unlock:
+        rcu_read_unlock();
+
         return cpu;
 }
 #else /* !ARCH_HAS_SCHED_WAKE_IDLE*/

@@ -1235,7 +1264,17 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
         tg = task_group(p);
         weight = p->se.load.weight;

-        balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
+        /*
+         * In low-load situations, where prev_cpu is idle and this_cpu is idle
+         * due to the sync cause above having dropped tl to 0, we'll always have
+         * an imbalance, but there's really nothing you can do about that, so
+         * that's good too.
+         *
+         * Otherwise check if either cpus are near enough in load to allow this
+         * task to be woken on this_cpu.
+         */
+        balanced = !tl ||
+                100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
                 imbalance*(load + effective_load(tg, prev_cpu, 0, weight));

         /*
kernel/sched_rt.c

@@ -602,6 +602,8 @@ static void update_curr_rt(struct rq *rq)
         curr->se.exec_start = rq->clock;
         cpuacct_charge(curr, delta_exec);

+        sched_rt_avg_update(rq, delta_exec);
+
         if (!rt_bandwidth_enabled())
                 return;

@@ -926,8 +928,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
         if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
                 enqueue_pushable_task(rq, p);
-
-        inc_cpu_load(rq, p->se.load.weight);
 }

 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)

@@ -942,8 +942,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
         dequeue_rt_entity(rt_se);

         dequeue_pushable_task(rq, p);
-
-        dec_cpu_load(rq, p->se.load.weight);
 }

 /*
kernel/sysctl.c

@@ -330,6 +330,14 @@ static struct ctl_table kern_table[] = {
                 .mode           = 0644,
                 .proc_handler   = &proc_dointvec,
         },
+        {
+                .ctl_name       = CTL_UNNUMBERED,
+                .procname       = "sched_time_avg",
+                .data           = &sysctl_sched_time_avg,
+                .maxlen         = sizeof(unsigned int),
+                .mode           = 0644,
+                .proc_handler   = &proc_dointvec,
+        },
         {
                 .ctl_name       = CTL_UNNUMBERED,
                 .procname       = "timer_migration",