Commit d99015b1, authored by Alexander van Heukelum, committed by Ingo Molnar

x86: move entry_64.S register saving out of the macros

Here is a combined patch that moves "save_args" out-of-line for
the interrupt macro and moves "error_entry" mostly out-of-line
for the zeroentry and errorentry macros.

The save_args function becomes really straightforward and easy
to understand, with the possible exception of the stack switch
code, which now needs to copy the return address of the
calling function to the new stack. Normal interrupts arrive with ((~vector)-0x80)
on the stack, which gets adjusted in common_interrupt:

<common_interrupt>:
(5)  addq   $0xffffffffffffff80,(%rsp)		/* -> ~(vector) */
(4)  sub    $0x50,%rsp				/* space for registers */
(5)  callq  ffffffff80211290 <save_args>
(5)  callq  ffffffff80214290 <do_IRQ>
<ret_from_intr>:
     ...

An apic interrupt stub now looks like this:

<thermal_interrupt>:
(5)  pushq  $0xffffffffffffff05			/* ~(vector) */
(4)  sub    $0x50,%rsp				/* space for registers */
(5)  callq  ffffffff80211290 <save_args>
(5)  callq  ffffffff80212b8f <smp_thermal_interrupt>
(5)  jmpq   ffffffff80211f93 <ret_from_intr>

Similarly the exception handler register saving function becomes
simpler, without the need of any parameter shuffling. The stub
for an exception without errorcode looks like this:

<overflow>:
(6)  callq  *0x1cad12(%rip)        # ffffffff803dd448 <pv_irq_ops+0x38>
(2)  pushq  $0xffffffffffffffff			/* no syscall */
(4)  sub    $0x78,%rsp				/* space for registers */
(5)  callq  ffffffff8030e3b0 <error_entry>
(3)  mov    %rsp,%rdi				/* pt_regs pointer */
(2)  xor    %esi,%esi				/* no error code */
(5)  callq  ffffffff80213446 <do_overflow>
(5)  jmpq   ffffffff8030e460 <error_exit>

And one for an exception with errorcode like this:

<segment_not_present>:
(6)  callq  *0x1cab92(%rip)        # ffffffff803dd448 <pv_irq_ops+0x38>
(4)  sub    $0x78,%rsp				/* space for registers */
(5)  callq  ffffffff8030e3b0 <error_entry>
(3)  mov    %rsp,%rdi				/* pt_regs pointer */
(5)  mov    0x78(%rsp),%rsi			/* load error code */
(9)  movq   $0xffffffffffffffff,0x78(%rsp)	/* no syscall */
(5)  callq  ffffffff80213209 <do_segment_not_present>
(5)  jmpq   ffffffff8030e460 <error_exit>

Unfortunately, this last type is more than 32 bytes. But the total space
savings due to this patch is about 2500 bytes on an smp-configuration,
and I think the code is clearer than it was before. The tested kernels
were non-paravirt ones (i.e., without the indirect call at the top of
the exception handlers).

Anyhow, I tested this patch on top of a recent -tip. The machine
was a 2x4-core Xeon at 2333MHz. Measured were the delays between
(almost-)adjacent rdtsc instructions. The graphs show how much
time is spent outside of the program as a function of the measured
delay. The area under the graph represents the total time spent
outside the program. Eight instances of the rdtsctest were
started, each pinned to a single cpu. The histograms are added.
For each kernel two measurements were done: one in mostly idle
condition, the other while running "bonnie++ -f", bound to cpu 0.
Each measurement took 40 minutes runtime. See the attached graphs
for the results. The graphs overlap almost everywhere, but there
are small differences.
Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
parent c032a2de
...@@ -242,6 +242,78 @@ ENTRY(native_usergs_sysret64) ...@@ -242,6 +242,78 @@ ENTRY(native_usergs_sysret64)
CFI_REL_OFFSET rsp,RSP CFI_REL_OFFSET rsp,RSP
/*CFI_REL_OFFSET ss,SS*/ /*CFI_REL_OFFSET ss,SS*/
.endm .endm
/*
* initial frame state for interrupts and exceptions
*
* Opens a CFI procedure (CFI_STARTPROC simple, CFI_SIGNAL_FRAME) and
* describes the frame relative to "ref", the frame-offset symbol (RIP,
* ORIG_RAX, RDI, R15, ...) of the slot that %rsp points at where the
* macro is expanded:
*   - the CFA is defined as %rsp + (SS+8 - ref)
*   - the saved rsp and rip are recorded at (RSP - ref) and (RIP - ref)
* The ss, rflags and cs annotations are present but commented out.
* The macro does not emit CFI_ENDPROC; the user of the macro has to
* close the procedure itself.
*/
.macro _frame ref
CFI_STARTPROC simple
CFI_SIGNAL_FRAME
CFI_DEF_CFA rsp,SS+8-\ref
/*CFI_REL_OFFSET ss,SS-\ref*/
CFI_REL_OFFSET rsp,RSP-\ref
/*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
/*CFI_REL_OFFSET cs,CS-\ref*/
CFI_REL_OFFSET rip,RIP-\ref
.endm
/*
* initial frame state for interrupts (and exceptions without error code):
* _frame with ref = RIP, i.e. the saved rip is at offset 0 from %rsp
*/
#define INTR_FRAME _frame RIP
/*
* initial frame state for exceptions with error code (and interrupts
* with vector already pushed):
* _frame with ref = ORIG_RAX, i.e. the saved rip is at RIP-ORIG_RAX
* from %rsp
*/
#define XCPT_FRAME _frame ORIG_RAX
/*
* save partial stack frame
*
* Out-of-line register save, reached with "call save_args" from the
* interrupt entry stubs after they have reserved 10*8 bytes of stack.
*
* Stack layout expected on entry:
*    0(%rsp)       return address into the calling stub
*    8(%rsp)       slot for %rbp
*   16(%rsp) ...   slots for r11, r10, r9, r8, rax, rcx, rdx, rsi, rdi
*                  (in ascending address order)
*
* On return:
*   %rdi   entry %rsp + 16 - ARGOFFSET (arg1 for the handler)
*   %rbp   address of the slot that holds the saved %rbp
*   %rax   clobbered if the stack was switched (its old value is saved)
*   %rsp   switched to %gs:pda_irqstackptr if the increment of
*          %gs:pda_irqcount produced zero; the return address is
*          carried over to the new stack in that case
*/
ENTRY(save_args)
XCPT_FRAME
cld
/* store the registers into the slots reserved by the caller */
movq %rdi, 8*8+16(%rsp)
CFI_REL_OFFSET rdi, 8*8+16
movq %rsi, 7*8+16(%rsp)
CFI_REL_OFFSET rsi, 7*8+16
movq %rdx, 6*8+16(%rsp)
CFI_REL_OFFSET rdx, 6*8+16
movq %rcx, 5*8+16(%rsp)
CFI_REL_OFFSET rcx, 5*8+16
movq %rax, 4*8+16(%rsp)
CFI_REL_OFFSET rax, 4*8+16
movq %r8, 3*8+16(%rsp)
CFI_REL_OFFSET r8, 3*8+16
movq %r9, 2*8+16(%rsp)
CFI_REL_OFFSET r9, 2*8+16
movq %r10, 1*8+16(%rsp)
CFI_REL_OFFSET r10, 1*8+16
movq %r11, 0*8+16(%rsp)
CFI_REL_OFFSET r11, 0*8+16
leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */
/*
* Emulate "push %rbp; mov %rsp, %rbp" without moving %rsp: the old
* %rbp is stored in the slot just above the return address and %rbp
* is then pointed at that slot.
*/
movq %rbp, 8(%rsp) /* push %rbp */
leaq 8(%rsp), %rbp /* mov %rsp, %rbp */
/*
* SWAPGS is skipped when the low two bits of the saved CS are both
* clear (presumably: the interrupt arrived in kernel mode - confirm).
*/
testl $3, CS(%rdi)
je 1f
SWAPGS
/*
* irqcount is used to check if a CPU is already on an interrupt stack
* or not. While this is essentially redundant with preempt_count it is
* a little cheaper to use a separate counter in the PDA (short of
* moving irq_enter into assembly, which would be too much work)
*/
1: incl %gs:pda_irqcount
/* non-zero result: keep the current stack and skip the switch below */
jne 2f
/*
* Switch %rsp to %gs:pda_irqstackptr. The return address is popped
* from the old stack first and pushed onto the new one, so that the
* final "ret" still returns to the calling stub.
*/
pop %rax /* move return address... */
mov %gs:pda_irqstackptr,%rsp
push %rax /* ... to the new stack */
/*
* We entered an interrupt context - irqs are off:
*/
2: TRACE_IRQS_OFF
ret
CFI_ENDPROC
END(save_args)
/* /*
* A newly forked process directly context switches into this. * A newly forked process directly context switches into this.
*/ */
...@@ -607,26 +679,6 @@ ENTRY(stub_rt_sigreturn) ...@@ -607,26 +679,6 @@ ENTRY(stub_rt_sigreturn)
CFI_ENDPROC CFI_ENDPROC
END(stub_rt_sigreturn) END(stub_rt_sigreturn)
/*
* initial frame state for interrupts and exceptions
*
* Opens a CFI procedure (CFI_STARTPROC simple, CFI_SIGNAL_FRAME) and
* describes the frame relative to "ref", the frame-offset symbol of the
* slot that %rsp points at where the macro is expanded:
*   - the CFA is defined as %rsp + (SS+8 - ref)
*   - the saved rsp and rip are recorded at (RSP - ref) and (RIP - ref)
* The ss, rflags and cs annotations are present but commented out.
* The macro does not emit CFI_ENDPROC.
*/
.macro _frame ref
CFI_STARTPROC simple
CFI_SIGNAL_FRAME
CFI_DEF_CFA rsp,SS+8-\ref
/*CFI_REL_OFFSET ss,SS-\ref*/
CFI_REL_OFFSET rsp,RSP-\ref
/*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
/*CFI_REL_OFFSET cs,CS-\ref*/
CFI_REL_OFFSET rip,RIP-\ref
.endm
/* initial frame state for interrupts (and exceptions without error code):
_frame with ref = RIP */
#define INTR_FRAME _frame RIP
/* initial frame state for exceptions with error code (and interrupts with
vector already pushed): _frame with ref = ORIG_RAX */
#define XCPT_FRAME _frame ORIG_RAX
/* /*
* Build the entry stubs and pointer table with some assembler magic. * Build the entry stubs and pointer table with some assembler magic.
* We pack 7 stubs into a single 32-byte chunk, which will fit in a * We pack 7 stubs into a single 32-byte chunk, which will fit in a
...@@ -667,46 +719,19 @@ END(irq_entries_start) ...@@ -667,46 +719,19 @@ END(irq_entries_start)
END(interrupt) END(interrupt)
.previous .previous
/* /*
* Interrupt entry/exit. * Interrupt entry/exit.
* *
* Interrupt entry points save only callee clobbered registers in fast path. * Interrupt entry points save only callee clobbered registers in fast path.
* *
* Entry runs with interrupts off. * Entry runs with interrupts off.
*/ */
/* 0(%rsp): ~(interrupt number) */ /* 0(%rsp): ~(interrupt number) */
.macro interrupt func .macro interrupt func
cld subq $10*8, %rsp
SAVE_ARGS CFI_ADJUST_CFA_OFFSET 10*8
leaq -ARGOFFSET(%rsp),%rdi /* arg1 for handler */ call save_args
pushq %rbp
/*
* Save rbp twice: One is for marking the stack frame, as usual, and the
* other, to fill pt_regs properly. This is because bx comes right
* before the last saved register in that structure, and not bp. If the
* base pointer were in the place bx is today, this would not be needed.
*/
movq %rbp, -8(%rsp)
CFI_ADJUST_CFA_OFFSET 8
CFI_REL_OFFSET rbp, 0
movq %rsp,%rbp
CFI_DEF_CFA_REGISTER rbp
testl $3,CS(%rdi)
je 1f
SWAPGS
/* irqcount is used to check if a CPU is already on an interrupt
stack or not. While this is essentially redundant with preempt_count
it is a little cheaper to use a separate counter in the PDA
(short of moving irq_enter into assembly, which would be too
much work) */
1: incl %gs:pda_irqcount
cmoveq %gs:pda_irqstackptr,%rsp
push %rbp # backlink for old unwinder
/*
* We entered an interrupt context - irqs are off:
*/
TRACE_IRQS_OFF
call \func call \func
.endm .endm
...@@ -852,6 +877,8 @@ END(common_interrupt) ...@@ -852,6 +877,8 @@ END(common_interrupt)
/* /*
* APIC interrupts. * APIC interrupts.
*/ */
.p2align 5
.macro apicinterrupt num,func .macro apicinterrupt num,func
INTR_FRAME INTR_FRAME
pushq $~(\num) pushq $~(\num)
...@@ -922,24 +949,29 @@ END(spurious_interrupt) ...@@ -922,24 +949,29 @@ END(spurious_interrupt)
.macro zeroentry sym .macro zeroentry sym
INTR_FRAME INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq $0 /* push error code/oldrax */ pushq $-1 /* ORIG_RAX: no syscall to restart */
CFI_ADJUST_CFA_OFFSET 8 CFI_ADJUST_CFA_OFFSET 8
pushq %rax /* push real oldrax to the rdi slot */ subq $15*8,%rsp
CFI_ADJUST_CFA_OFFSET 8 CFI_ADJUST_CFA_OFFSET 15*8
CFI_REL_OFFSET rax,0 call error_entry
leaq \sym(%rip),%rax movq %rsp,%rdi /* pt_regs pointer */
jmp error_entry xorl %esi,%esi /* no error code */
call \sym
jmp error_exit /* %ebx: no swapgs flag */
CFI_ENDPROC CFI_ENDPROC
.endm .endm
.macro errorentry sym .macro errorentry sym
XCPT_FRAME XCPT_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME
pushq %rax subq $15*8,%rsp
CFI_ADJUST_CFA_OFFSET 8 CFI_ADJUST_CFA_OFFSET 15*8
CFI_REL_OFFSET rax,0 call error_entry
leaq \sym(%rip),%rax movq %rsp,%rdi /* pt_regs pointer */
jmp error_entry movq ORIG_RAX(%rsp),%rsi /* get error code */
movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
call \sym
jmp error_exit /* %ebx: no swapgs flag */
CFI_ENDPROC CFI_ENDPROC
.endm .endm
...@@ -1043,93 +1075,93 @@ paranoid_schedule\trace: ...@@ -1043,93 +1075,93 @@ paranoid_schedule\trace:
.endm .endm
/* /*
* Exception entry point. This expects an error code/orig_rax on the stack * Exception entry point. This expects an error code/orig_rax on the stack.
* and the exception handler in %rax. * returns in "no swapgs flag" in %ebx.
*/ */
KPROBE_ENTRY(error_entry) KPROBE_ENTRY(error_entry)
_frame RDI _frame RDI
CFI_REL_OFFSET rax,0 CFI_ADJUST_CFA_OFFSET 15*8
/* rdi slot contains rax, oldrax contains error code */ /* oldrax contains error code */
cld cld
subq $14*8,%rsp movq %rdi,14*8+8(%rsp)
CFI_ADJUST_CFA_OFFSET (14*8) CFI_REL_OFFSET rdi,RDI+8
movq %rsi,13*8(%rsp) movq %rsi,13*8+8(%rsp)
CFI_REL_OFFSET rsi,RSI CFI_REL_OFFSET rsi,RSI+8
movq 14*8(%rsp),%rsi /* load rax from rdi slot */ movq %rdx,12*8+8(%rsp)
CFI_REGISTER rax,rsi CFI_REL_OFFSET rdx,RDX+8
movq %rdx,12*8(%rsp) movq %rcx,11*8+8(%rsp)
CFI_REL_OFFSET rdx,RDX CFI_REL_OFFSET rcx,RCX+8
movq %rcx,11*8(%rsp) movq %rax,10*8+8(%rsp)
CFI_REL_OFFSET rcx,RCX CFI_REL_OFFSET rax,RAX+8
movq %rsi,10*8(%rsp) /* store rax */ movq %r8, 9*8+8(%rsp)
CFI_REL_OFFSET rax,RAX CFI_REL_OFFSET r8,R8+8
movq %r8, 9*8(%rsp) movq %r9, 8*8+8(%rsp)
CFI_REL_OFFSET r8,R8 CFI_REL_OFFSET r9,R9+8
movq %r9, 8*8(%rsp) movq %r10,7*8+8(%rsp)
CFI_REL_OFFSET r9,R9 CFI_REL_OFFSET r10,R10+8
movq %r10,7*8(%rsp) movq %r11,6*8+8(%rsp)
CFI_REL_OFFSET r10,R10 CFI_REL_OFFSET r11,R11+8
movq %r11,6*8(%rsp) movq %rbx,5*8+8(%rsp)
CFI_REL_OFFSET r11,R11 CFI_REL_OFFSET rbx,RBX+8
movq %rbx,5*8(%rsp) movq %rbp,4*8+8(%rsp)
CFI_REL_OFFSET rbx,RBX CFI_REL_OFFSET rbp,RBP+8
movq %rbp,4*8(%rsp) movq %r12,3*8+8(%rsp)
CFI_REL_OFFSET rbp,RBP CFI_REL_OFFSET r12,R12+8
movq %r12,3*8(%rsp) movq %r13,2*8+8(%rsp)
CFI_REL_OFFSET r12,R12 CFI_REL_OFFSET r13,R13+8
movq %r13,2*8(%rsp) movq %r14,1*8+8(%rsp)
CFI_REL_OFFSET r13,R13 CFI_REL_OFFSET r14,R14+8
movq %r14,1*8(%rsp) movq %r15,0*8+8(%rsp)
CFI_REL_OFFSET r14,R14 CFI_REL_OFFSET r15,R15+8
movq %r15,(%rsp)
CFI_REL_OFFSET r15,R15
xorl %ebx,%ebx xorl %ebx,%ebx
testl $3,CS(%rsp) testl $3,CS+8(%rsp)
je error_kernelspace je error_kernelspace
error_swapgs: error_swapgs:
SWAPGS SWAPGS
error_sti: error_sti:
TRACE_IRQS_OFF TRACE_IRQS_OFF
movq %rdi,RDI(%rsp) ret
CFI_REL_OFFSET rdi,RDI CFI_ENDPROC
movq %rsp,%rdi
movq ORIG_RAX(%rsp),%rsi /* get error code */ /*
movq $-1,ORIG_RAX(%rsp) * There are two places in the kernel that can potentially fault with
call *%rax * usergs. Handle them here. The exception handlers after iret run with
/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ * kernel gs again, so don't set the user space flag. B stepping K8s
error_exit: * sometimes report an truncated RIP for IRET exceptions returning to
* compat mode. Check for these here too.
*/
error_kernelspace:
incl %ebx
leaq irq_return(%rip),%rcx
cmpq %rcx,RIP+8(%rsp)
je error_swapgs
movl %ecx,%ecx /* zero extend */
cmpq %rcx,RIP+8(%rsp)
je error_swapgs
cmpq $gs_change,RIP+8(%rsp)
je error_swapgs
jmp error_sti
KPROBE_END(error_entry)
/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
KPROBE_ENTRY(error_exit)
_frame R15
movl %ebx,%eax movl %ebx,%eax
RESTORE_REST RESTORE_REST
DISABLE_INTERRUPTS(CLBR_NONE) DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF TRACE_IRQS_OFF
GET_THREAD_INFO(%rcx) GET_THREAD_INFO(%rcx)
testl %eax,%eax testl %eax,%eax
jne retint_kernel jne retint_kernel
LOCKDEP_SYS_EXIT_IRQ LOCKDEP_SYS_EXIT_IRQ
movl TI_flags(%rcx),%edx movl TI_flags(%rcx),%edx
movl $_TIF_WORK_MASK,%edi movl $_TIF_WORK_MASK,%edi
andl %edi,%edx andl %edi,%edx
jnz retint_careful jnz retint_careful
jmp retint_swapgs jmp retint_swapgs
CFI_ENDPROC CFI_ENDPROC
KPROBE_END(error_exit)
error_kernelspace:
incl %ebx
/* There are two places in the kernel that can potentially fault with
usergs. Handle them here. The exception handlers after
iret run with kernel gs again, so don't set the user space flag.
B stepping K8s sometimes report an truncated RIP for IRET
exceptions returning to compat mode. Check for these here too. */
leaq irq_return(%rip),%rcx
cmpq %rcx,RIP(%rsp)
je error_swapgs
movl %ecx,%ecx /* zero extend */
cmpq %rcx,RIP(%rsp)
je error_swapgs
cmpq $gs_change,RIP(%rsp)
je error_swapgs
jmp error_sti
KPROBE_END(error_entry)
/* Reload gs selector with exception handling */ /* Reload gs selector with exception handling */
/* edi: new selector */ /* edi: new selector */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment