Commit e1367daf, authored by Li Shaohua and committed by Linus Torvalds

[PATCH] cpu state clean after hot remove

Clean CPU states in order to reuse smp boot code for CPU hotplug.

Signed-off-by: Li Shaohua <shaohua.li@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 0bb3184d
...@@ -651,3 +651,15 @@ void __devinit cpu_init(void) ...@@ -651,3 +651,15 @@ void __devinit cpu_init(void)
clear_used_math(); clear_used_math();
mxcsr_feature_mask_init(); mxcsr_feature_mask_init();
} }
#ifdef CONFIG_HOTPLUG_CPU
/*
 * Undo per-CPU init bookkeeping for the CPU this code is running on:
 * remove it from cpu_initialized and reset its cpu_tlbstate entry
 * (state back to 0, active_mm back to init_mm).
 */
void __devinit cpu_uninit(void)
{
	int me = raw_smp_processor_id();

	cpu_clear(me, cpu_initialized);

	/* lazy TLB state */
	per_cpu(cpu_tlbstate, me).state = 0;
	per_cpu(cpu_tlbstate, me).active_mm = &init_mm;
}
#endif
...@@ -156,6 +156,11 @@ void irq_ctx_init(int cpu) ...@@ -156,6 +156,11 @@ void irq_ctx_init(int cpu)
cpu,hardirq_ctx[cpu],softirq_ctx[cpu]); cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
} }
/*
 * Forget the hardirq context recorded for @cpu by resetting its
 * hardirq_ctx[] slot to NULL.
 */
void irq_ctx_exit(int cpu)
{
	hardirq_ctx[cpu] = NULL;
}
extern asmlinkage void __do_softirq(void); extern asmlinkage void __do_softirq(void);
asmlinkage void do_softirq(void) asmlinkage void do_softirq(void)
......
...@@ -152,21 +152,19 @@ static void poll_idle (void) ...@@ -152,21 +152,19 @@ static void poll_idle (void)
/* We don't actually take CPU down, just spin without interrupts. */ /* We don't actually take CPU down, just spin without interrupts. */
static inline void play_dead(void) static inline void play_dead(void)
{ {
/* This must be done before dead CPU ack */
cpu_exit_clear();
wbinvd();
mb();
/* Ack it */ /* Ack it */
__get_cpu_var(cpu_state) = CPU_DEAD; __get_cpu_var(cpu_state) = CPU_DEAD;
/* We shouldn't have to disable interrupts while dead, but /*
* some interrupts just don't seem to go away, and this makes * With physical CPU hotplug, we should halt the cpu
* it "work" for testing purposes. */ */
/* Death loop */
while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE)
cpu_relax();
local_irq_disable(); local_irq_disable();
__flush_tlb_all(); while (1)
cpu_set(smp_processor_id(), cpu_online_map); __asm__ __volatile__("hlt":::"memory");
enable_APIC_timer();
local_irq_enable();
} }
#else #else
static inline void play_dead(void) static inline void play_dead(void)
......
...@@ -90,6 +90,12 @@ cpumask_t cpu_callout_map; ...@@ -90,6 +90,12 @@ cpumask_t cpu_callout_map;
EXPORT_SYMBOL(cpu_callout_map); EXPORT_SYMBOL(cpu_callout_map);
static cpumask_t smp_commenced_mask; static cpumask_t smp_commenced_mask;
/* The TSC's upper 32 bits can't be written on earlier CPUs (before Prescott),
 * so there is no way to resync one AP against the BP. TBD: for Prescott and
 * above, we should use IA64's algorithm.
 */
static int __devinitdata tsc_sync_disabled;
/* Per CPU bogomips and other parameters */ /* Per CPU bogomips and other parameters */
struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
EXPORT_SYMBOL(cpu_data); EXPORT_SYMBOL(cpu_data);
...@@ -427,7 +433,7 @@ static void __devinit smp_callin(void) ...@@ -427,7 +433,7 @@ static void __devinit smp_callin(void)
/* /*
* Synchronize the TSC with the BP * Synchronize the TSC with the BP
*/ */
if (cpu_has_tsc && cpu_khz) if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled)
synchronize_tsc_ap(); synchronize_tsc_ap();
} }
...@@ -507,6 +513,7 @@ static void __devinit start_secondary(void *unused) ...@@ -507,6 +513,7 @@ static void __devinit start_secondary(void *unused)
lock_ipi_call_lock(); lock_ipi_call_lock();
cpu_set(smp_processor_id(), cpu_online_map); cpu_set(smp_processor_id(), cpu_online_map);
unlock_ipi_call_lock(); unlock_ipi_call_lock();
per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
/* We can take interrupts now: we're officially "up". */ /* We can take interrupts now: we're officially "up". */
local_irq_enable(); local_irq_enable();
...@@ -816,8 +823,43 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) ...@@ -816,8 +823,43 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
#endif /* WAKE_SECONDARY_VIA_INIT */ #endif /* WAKE_SECONDARY_VIA_INIT */
extern cpumask_t cpu_initialized; extern cpumask_t cpu_initialized;
/*
 * Pick a CPU id that is not currently marked in cpu_present_map.
 *
 * The candidate is whatever first_cpu() reports for the complement of
 * cpu_present_map.
 *
 * Returns that id, or -ENODEV when it is not below NR_CPUS.
 */
static inline int alloc_cpu_id(void)
{
	cpumask_t absent;
	int id;

	/* absent = every bit that is clear in cpu_present_map */
	cpus_complement(absent, cpu_present_map);
	id = first_cpu(absent);

	return (id < NR_CPUS) ? id : -ENODEV;
}
#ifdef CONFIG_HOTPLUG_CPU
static struct task_struct * __devinitdata cpu_idle_tasks[NR_CPUS];
static inline struct task_struct * alloc_idle_task(int cpu)
{
struct task_struct *idle;
static int __devinit do_boot_cpu(int apicid) if ((idle = cpu_idle_tasks[cpu]) != NULL) {
/* initialize thread_struct. we really want to avoid destroy
* idle tread
*/
idle->thread.esp = (unsigned long)(((struct pt_regs *)
(THREAD_SIZE + (unsigned long) idle->thread_info)) - 1);
init_idle(idle, cpu);
return idle;
}
idle = fork_idle(cpu);
if (!IS_ERR(idle))
cpu_idle_tasks[cpu] = idle;
return idle;
}
#else
#define alloc_idle_task(cpu) fork_idle(cpu)
#endif
static int __devinit do_boot_cpu(int apicid, int cpu)
/* /*
* NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
* (ie clustered apic addressing mode), this is a LOGICAL apic ID. * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
...@@ -826,16 +868,17 @@ static int __devinit do_boot_cpu(int apicid) ...@@ -826,16 +868,17 @@ static int __devinit do_boot_cpu(int apicid)
{ {
struct task_struct *idle; struct task_struct *idle;
unsigned long boot_error; unsigned long boot_error;
int timeout, cpu; int timeout;
unsigned long start_eip; unsigned long start_eip;
unsigned short nmi_high = 0, nmi_low = 0; unsigned short nmi_high = 0, nmi_low = 0;
cpu = ++cpucount; ++cpucount;
/* /*
* We can't use kernel_thread since we must avoid to * We can't use kernel_thread since we must avoid to
* reschedule the child. * reschedule the child.
*/ */
idle = fork_idle(cpu); idle = alloc_idle_task(cpu);
if (IS_ERR(idle)) if (IS_ERR(idle))
panic("failed fork for CPU %d", cpu); panic("failed fork for CPU %d", cpu);
idle->thread.eip = (unsigned long) start_secondary; idle->thread.eip = (unsigned long) start_secondary;
...@@ -902,13 +945,16 @@ static int __devinit do_boot_cpu(int apicid) ...@@ -902,13 +945,16 @@ static int __devinit do_boot_cpu(int apicid)
inquire_remote_apic(apicid); inquire_remote_apic(apicid);
} }
} }
x86_cpu_to_apicid[cpu] = apicid;
if (boot_error) { if (boot_error) {
/* Try to put things back the way they were before ... */ /* Try to put things back the way they were before ... */
unmap_cpu_to_logical_apicid(cpu); unmap_cpu_to_logical_apicid(cpu);
cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */ cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
cpucount--; cpucount--;
} else {
x86_cpu_to_apicid[cpu] = apicid;
cpu_set(cpu, cpu_present_map);
} }
/* mark "stuck" area as not stuck */ /* mark "stuck" area as not stuck */
...@@ -917,6 +963,75 @@ static int __devinit do_boot_cpu(int apicid) ...@@ -917,6 +963,75 @@ static int __devinit do_boot_cpu(int apicid)
return boot_error; return boot_error;
} }
#ifdef CONFIG_HOTPLUG_CPU
/*
 * Clean up the state of the CPU this code is running on so that the
 * regular SMP boot code can be reused to bring it up again.
 */
void cpu_exit_clear(void)
{
	int me = raw_smp_processor_id();

	idle_task_exit();

	cpucount--;
	cpu_uninit();
	irq_ctx_exit(me);

	/* Remove this CPU from each of the masks below. */
	cpu_clear(me, cpu_callout_map);
	cpu_clear(me, cpu_callin_map);
	cpu_clear(me, cpu_present_map);
	cpu_clear(me, smp_commenced_mask);

	unmap_cpu_to_logical_apicid(me);
}
/* Arguments handed to do_warm_boot_cpu() through its void * parameter. */
struct warm_boot_cpu_info {
	struct completion *complete;	/* completed after do_boot_cpu() returns */
	int apicid;			/* first argument passed to do_boot_cpu() */
	int cpu;			/* second argument passed to do_boot_cpu() */
};
/*
 * Work callback: boot the CPU described by the struct warm_boot_cpu_info
 * passed in @p, then signal the completion stored in it.  The return value
 * of do_boot_cpu() is not examined.
 */
static void __devinit do_warm_boot_cpu(void *p)
{
	struct warm_boot_cpu_info *req = p;

	do_boot_cpu(req->apicid, req->cpu);
	complete(req->complete);
}
/*
 * Re-run the boot sequence for @cpu.
 *
 * With the CPU hotplug lock held, looks up the APIC id recorded for @cpu,
 * queues do_warm_boot_cpu() as a work item and waits for it to complete.
 * While that runs, tsc_sync_disabled is set and the kernel PGD entries are
 * copied to the start of swapper_pg_dir; afterwards the flag is cleared and
 * zap_low_mappings() is called.
 *
 * Returns 0 once the work item has completed (the result of do_boot_cpu()
 * is not propagated), or -ENODEV if x86_cpu_to_apicid[cpu] is BAD_APICID.
 */
int __devinit smp_prepare_cpu(int cpu)
{
	DECLARE_COMPLETION(boot_done);
	struct warm_boot_cpu_info boot_info;
	struct work_struct boot_work;
	int apicid;
	int rc = -ENODEV;

	lock_cpu_hotplug();

	apicid = x86_cpu_to_apicid[cpu];
	if (apicid == BAD_APICID)
		goto out;

	boot_info.complete = &boot_done;
	boot_info.apicid = apicid;
	boot_info.cpu = cpu;
	INIT_WORK(&boot_work, do_warm_boot_cpu, &boot_info);

	tsc_sync_disabled = 1;

	/* init low mem mapping */
	memcpy(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
	       sizeof(swapper_pg_dir[0]) * KERNEL_PGD_PTRS);
	flush_tlb_all();

	/* Run the boot from a work item and block until it signals us. */
	schedule_work(&boot_work);
	wait_for_completion(&boot_done);

	tsc_sync_disabled = 0;
	zap_low_mappings();
	rc = 0;
out:
	unlock_cpu_hotplug();
	return rc;
}
#endif
static void smp_tune_scheduling (void) static void smp_tune_scheduling (void)
{ {
unsigned long cachesize; /* kB */ unsigned long cachesize; /* kB */
...@@ -1069,7 +1184,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus) ...@@ -1069,7 +1184,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
if (max_cpus <= cpucount+1) if (max_cpus <= cpucount+1)
continue; continue;
if (do_boot_cpu(apicid)) if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu))
printk("CPU #%d not responding - cannot use it.\n", printk("CPU #%d not responding - cannot use it.\n",
apicid); apicid);
else else
...@@ -1149,25 +1264,24 @@ void __devinit smp_prepare_boot_cpu(void) ...@@ -1149,25 +1264,24 @@ void __devinit smp_prepare_boot_cpu(void)
{ {
cpu_set(smp_processor_id(), cpu_online_map); cpu_set(smp_processor_id(), cpu_online_map);
cpu_set(smp_processor_id(), cpu_callout_map); cpu_set(smp_processor_id(), cpu_callout_map);
cpu_set(smp_processor_id(), cpu_present_map);
per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
} }
#ifdef CONFIG_HOTPLUG_CPU #ifdef CONFIG_HOTPLUG_CPU
static void
/* must be called with the cpucontrol mutex held */ remove_siblinginfo(int cpu)
static int __devinit cpu_enable(unsigned int cpu)
{ {
/* get the target out of its holding state */ int sibling;
per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
wmb();
/* wait for the processor to ack it. timeout? */
while (!cpu_online(cpu))
cpu_relax();
fixup_irqs(cpu_online_map); for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
/* counter the disable in fixup_irqs() */ cpu_clear(cpu, cpu_sibling_map[sibling]);
local_irq_enable(); for_each_cpu_mask(sibling, cpu_core_map[cpu])
return 0; cpu_clear(cpu, cpu_core_map[sibling]);
cpus_clear(cpu_sibling_map[cpu]);
cpus_clear(cpu_core_map[cpu]);
phys_proc_id[cpu] = BAD_APICID;
cpu_core_id[cpu] = BAD_APICID;
} }
int __cpu_disable(void) int __cpu_disable(void)
...@@ -1193,6 +1307,8 @@ int __cpu_disable(void) ...@@ -1193,6 +1307,8 @@ int __cpu_disable(void)
mdelay(1); mdelay(1);
local_irq_disable(); local_irq_disable();
remove_siblinginfo(cpu);
cpu_clear(cpu, map); cpu_clear(cpu, map);
fixup_irqs(map); fixup_irqs(map);
/* It's now safe to remove this processor from the online map */ /* It's now safe to remove this processor from the online map */
...@@ -1207,8 +1323,10 @@ void __cpu_die(unsigned int cpu) ...@@ -1207,8 +1323,10 @@ void __cpu_die(unsigned int cpu)
for (i = 0; i < 10; i++) { for (i = 0; i < 10; i++) {
/* They ack this in play_dead by setting CPU_DEAD */ /* They ack this in play_dead by setting CPU_DEAD */
if (per_cpu(cpu_state, cpu) == CPU_DEAD) if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
printk ("CPU %d is now offline\n", cpu);
return; return;
}
current->state = TASK_UNINTERRUPTIBLE; current->state = TASK_UNINTERRUPTIBLE;
schedule_timeout(HZ/10); schedule_timeout(HZ/10);
} }
...@@ -1236,15 +1354,8 @@ int __devinit __cpu_up(unsigned int cpu) ...@@ -1236,15 +1354,8 @@ int __devinit __cpu_up(unsigned int cpu)
return -EIO; return -EIO;
} }
#ifdef CONFIG_HOTPLUG_CPU
/* Already up, and in cpu_quiescent now? */
if (cpu_isset(cpu, smp_commenced_mask)) {
cpu_enable(cpu);
return 0;
}
#endif
local_irq_enable(); local_irq_enable();
per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
/* Unleash the CPU! */ /* Unleash the CPU! */
cpu_set(cpu, smp_commenced_mask); cpu_set(cpu, smp_commenced_mask);
while (!cpu_isset(cpu, cpu_online_map)) while (!cpu_isset(cpu, cpu_online_map))
...@@ -1258,10 +1369,12 @@ void __init smp_cpus_done(unsigned int max_cpus) ...@@ -1258,10 +1369,12 @@ void __init smp_cpus_done(unsigned int max_cpus)
setup_ioapic_dest(); setup_ioapic_dest();
#endif #endif
zap_low_mappings(); zap_low_mappings();
#ifndef CONFIG_HOTPLUG_CPU
/* /*
* Disable executability of the SMP trampoline: * Disable executability of the SMP trampoline:
*/ */
set_kernel_exec((unsigned long)trampoline_base, trampoline_exec); set_kernel_exec((unsigned long)trampoline_base, trampoline_exec);
#endif
} }
void __init smp_intr_init(void) void __init smp_intr_init(void)
......
...@@ -16,6 +16,10 @@ struct sysdev_class cpu_sysdev_class = { ...@@ -16,6 +16,10 @@ struct sysdev_class cpu_sysdev_class = {
EXPORT_SYMBOL(cpu_sysdev_class); EXPORT_SYMBOL(cpu_sysdev_class);
#ifdef CONFIG_HOTPLUG_CPU #ifdef CONFIG_HOTPLUG_CPU
#ifndef __HAVE_ARCH_SMP_PREPARE_CPU
#define smp_prepare_cpu(cpu) (0)
#endif
static ssize_t show_online(struct sys_device *dev, char *buf) static ssize_t show_online(struct sys_device *dev, char *buf)
{ {
struct cpu *cpu = container_of(dev, struct cpu, sysdev); struct cpu *cpu = container_of(dev, struct cpu, sysdev);
...@@ -36,6 +40,8 @@ static ssize_t store_online(struct sys_device *dev, const char *buf, ...@@ -36,6 +40,8 @@ static ssize_t store_online(struct sys_device *dev, const char *buf,
kobject_hotplug(&dev->kobj, KOBJ_OFFLINE); kobject_hotplug(&dev->kobj, KOBJ_OFFLINE);
break; break;
case '1': case '1':
ret = smp_prepare_cpu(cpu->sysdev.id);
if (ret == 0)
ret = cpu_up(cpu->sysdev.id); ret = cpu_up(cpu->sysdev.id);
break; break;
default: default:
......
...@@ -29,9 +29,11 @@ extern void release_vm86_irqs(struct task_struct *); ...@@ -29,9 +29,11 @@ extern void release_vm86_irqs(struct task_struct *);
#ifdef CONFIG_4KSTACKS #ifdef CONFIG_4KSTACKS
extern void irq_ctx_init(int cpu); extern void irq_ctx_init(int cpu);
extern void irq_ctx_exit(int cpu);
# define __ARCH_HAS_DO_SOFTIRQ # define __ARCH_HAS_DO_SOFTIRQ
#else #else
# define irq_ctx_init(cpu) do { } while (0) # define irq_ctx_init(cpu) do { } while (0)
# define irq_ctx_exit(cpu) do { } while (0)
#endif #endif
#ifdef CONFIG_IRQBALANCE #ifdef CONFIG_IRQBALANCE
......
...@@ -48,6 +48,14 @@ extern void unlock_ipi_call_lock(void); ...@@ -48,6 +48,14 @@ extern void unlock_ipi_call_lock(void);
#define MAX_APICID 256 #define MAX_APICID 256
extern u8 x86_cpu_to_apicid[]; extern u8 x86_cpu_to_apicid[];
#ifdef CONFIG_HOTPLUG_CPU
extern void cpu_exit_clear(void);
extern void cpu_uninit(void);
#define __HAVE_ARCH_SMP_PREPARE_CPU
extern int smp_prepare_cpu(int cpu);
#endif
/* /*
* This function is needed by all SMP systems. It must _always_ be valid * This function is needed by all SMP systems. It must _always_ be valid
* from the initial startup. We map APIC_BASE very early in page_setup(), * from the initial startup. We map APIC_BASE very early in page_setup(),
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment