Commit acc20761 authored by Chuck Ebbert, committed by Andi Kleen

[PATCH] i386: add sleazy FPU optimization

i386 port of the sLeAZY-fpu feature.  Chuck reports that this gives him
approximately a 0.4% improvement on his simple benchmark.

x86_64 description follows:

Right now the kernel on x86-64 has 100% lazy FPU behavior: after *every*
context switch a trap is taken on the first FPU use to restore the FPU
context lazily.  This is of course great for applications that have very
sporadic or no FPU use (since then you avoid doing the expensive save/restore
all the time).  However, very frequent FPU users take an extra trap on
every context switch.

The patch below adds a simple heuristic to this code: after 5 consecutive
context switches with FPU use, the lazy behavior is disabled and the FPU
context gets restored on every context switch.  If the app indeed uses the
FPU, the trap is avoided.  (The chances of the 6th time slice using the FPU
after the previous 5 have done so are obviously quite high.)

After 256 switches, this is reset and lazy behavior is restored (until there
are 5 consecutive ones again).  The reason for this is to give apps that do
longer bursts of FPU use the lazy behavior back after some time.
Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Andi Kleen <ak@suse.de>
parent be44d2aa
...@@ -648,6 +648,11 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas ...@@ -648,6 +648,11 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
__unlazy_fpu(prev_p); __unlazy_fpu(prev_p);
/* we're going to use this soon, after a few expensive things */
if (next_p->fpu_counter > 5)
prefetch(&next->i387.fxsave);
/* /*
* Reload esp0. * Reload esp0.
*/ */
...@@ -697,6 +702,13 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas ...@@ -697,6 +702,13 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
disable_tsc(prev_p, next_p); disable_tsc(prev_p, next_p);
/* If the task has used fpu the last 5 timeslices, just do a full
* restore of the math state immediately to avoid the trap; the
* chances of needing FPU soon are obviously high now
*/
if (next_p->fpu_counter > 5)
math_state_restore();
return prev_p; return prev_p;
} }
......
...@@ -1118,7 +1118,7 @@ fastcall unsigned long patch_espfix_desc(unsigned long uesp, ...@@ -1118,7 +1118,7 @@ fastcall unsigned long patch_espfix_desc(unsigned long uesp,
* Must be called with kernel preemption disabled (in this case, * Must be called with kernel preemption disabled (in this case,
* local interrupts are disabled at the call-site in entry.S). * local interrupts are disabled at the call-site in entry.S).
*/ */
asmlinkage void math_state_restore(struct pt_regs regs) asmlinkage void math_state_restore(void)
{ {
struct thread_info *thread = current_thread_info(); struct thread_info *thread = current_thread_info();
struct task_struct *tsk = thread->task; struct task_struct *tsk = thread->task;
...@@ -1128,6 +1128,7 @@ asmlinkage void math_state_restore(struct pt_regs regs) ...@@ -1128,6 +1128,7 @@ asmlinkage void math_state_restore(struct pt_regs regs)
init_fpu(tsk); init_fpu(tsk);
restore_fpu(tsk); restore_fpu(tsk);
thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
tsk->fpu_counter++;
} }
#ifndef CONFIG_MATH_EMULATION #ifndef CONFIG_MATH_EMULATION
......
...@@ -77,6 +77,8 @@ static inline void __save_init_fpu( struct task_struct *tsk ) ...@@ -77,6 +77,8 @@ static inline void __save_init_fpu( struct task_struct *tsk )
#define __unlazy_fpu( tsk ) do { \ #define __unlazy_fpu( tsk ) do { \
if (task_thread_info(tsk)->status & TS_USEDFPU) \ if (task_thread_info(tsk)->status & TS_USEDFPU) \
save_init_fpu( tsk ); \ save_init_fpu( tsk ); \
else \
tsk->fpu_counter = 0; \
} while (0) } while (0)
#define __clear_fpu( tsk ) \ #define __clear_fpu( tsk ) \
...@@ -118,6 +120,7 @@ static inline void save_init_fpu( struct task_struct *tsk ) ...@@ -118,6 +120,7 @@ static inline void save_init_fpu( struct task_struct *tsk )
extern unsigned short get_fpu_cwd( struct task_struct *tsk ); extern unsigned short get_fpu_cwd( struct task_struct *tsk );
extern unsigned short get_fpu_swd( struct task_struct *tsk ); extern unsigned short get_fpu_swd( struct task_struct *tsk );
extern unsigned short get_fpu_mxcsr( struct task_struct *tsk ); extern unsigned short get_fpu_mxcsr( struct task_struct *tsk );
extern asmlinkage void math_state_restore(void);
/* /*
* Signal frame handlers... * Signal frame handlers...
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment