Commit 0793a61d authored by Thomas Gleixner, committed by Ingo Molnar

performance counters: core code

Implement the core kernel bits of Performance Counters subsystem.

The Linux Performance Counter subsystem provides an abstraction of
performance counter hardware capabilities. It provides per-task and
per-CPU counters, and event capabilities on top of those.

Performance counters are accessed via special file descriptors.
There's one file descriptor per virtual counter used.

The special file descriptor is opened via the perf_counter_open()
system call:

 int
 perf_counter_open(u32 hw_event_type,
                   u32 hw_event_period,
                   u32 record_type,
                   pid_t pid,
                   int cpu);

The syscall returns the new fd. The fd can be used via the normal
VFS system calls: read() can be used to read the counter, fcntl()
can be used to set the blocking mode, etc.

Multiple counters can be kept open at a time, and the counters
can be poll()ed.
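
As a hedged illustration (not part of the original commit text), a
minimal user-space sketch of the interface described above. The syscall
number is a placeholder, since it lives in each architecture's syscall
table; the constants come from the new <linux/perf_counter.h>, and the
pid == 0 / cpu == -1 semantics ("current task, any CPU") are an
assumption here:

 #include <stdio.h>
 #include <unistd.h>
 #include <sys/syscall.h>

 #define __NR_perf_counter_open 333	/* placeholder, arch-specific */

 int main(void)
 {
 	unsigned long long count;

 	/* PERF_COUNT_CYCLES (0), no IRQ period, PERF_RECORD_SIMPLE (0),
 	 * pid 0 (current task), cpu -1 (not bound to one CPU): */
 	int fd = syscall(__NR_perf_counter_open, 0, 0, 0, 0, -1);

 	if (fd < 0)
 		return 1;
 	/* read() is assumed to return the current 64-bit counter value: */
 	if (read(fd, &count, sizeof(count)) == sizeof(count))
 		printf("cycles: %llu\n", count);
 	close(fd);
 	return 0;
 }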

See more details in Documentation/perf-counters.txt.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
parent b5aa97e8
drivers/char/sysrq.c:

@@ -25,6 +25,7 @@
 #include <linux/kbd_kern.h>
 #include <linux/proc_fs.h>
 #include <linux/quotaops.h>
+#include <linux/perf_counter.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/suspend.h>
@@ -244,6 +245,7 @@ static void sysrq_handle_showregs(int key, struct tty_struct *tty)
         struct pt_regs *regs = get_irq_regs();
         if (regs)
                 show_regs(regs);
+        perf_counter_print_debug();
 }

 static struct sysrq_key_op sysrq_showregs_op = {
         .handler        = sysrq_handle_showregs,
include/linux/perf_counter.h (new file):

/*
 *  Performance counters:
 *
 *   Copyright(C) 2008, Thomas Gleixner <tglx@linutronix.de>
 *   Copyright(C) 2008, Red Hat, Inc., Ingo Molnar
 *
 *  Data type definitions, declarations, prototypes.
 *
 *  Started by: Thomas Gleixner and Ingo Molnar
 *
 *  For licensing details see kernel-base/COPYING
 */
#ifndef _LINUX_PERF_COUNTER_H
#define _LINUX_PERF_COUNTER_H

#include <asm/atomic.h>

#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct task_struct;

/*
 * Generalized hardware event types, used by the hw_event_type parameter
 * of the sys_perf_counter_open() syscall:
 */
enum hw_event_types {
        PERF_COUNT_CYCLES,
        PERF_COUNT_INSTRUCTIONS,
        PERF_COUNT_CACHE_REFERENCES,
        PERF_COUNT_CACHE_MISSES,
        PERF_COUNT_BRANCH_INSTRUCTIONS,
        PERF_COUNT_BRANCH_MISSES,
        /*
         * If this bit is set in the type, then trigger NMI sampling:
         */
        PERF_COUNT_NMI = (1 << 30),
};

/*
 * IRQ-notification data record type:
 */
enum perf_record_type {
        PERF_RECORD_SIMPLE,
        PERF_RECORD_IRQ,
        PERF_RECORD_GROUP,
};
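
/*
 * Illustration (an editor's sketch, not part of this header): both enums
 * above parameterize the new syscall; the NMI bit is OR-ed into the
 * event type, e.g.:
 *
 *      u32 hw_event_type = PERF_COUNT_INSTRUCTIONS | PERF_COUNT_NMI;
 *      u32 record_type   = PERF_RECORD_IRQ;
 */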
/**
 * struct hw_perf_counter - performance counter hardware details
 */
struct hw_perf_counter {
        u64                     config;
        unsigned long           config_base;
        unsigned long           counter_base;
        int                     nmi;
        unsigned int            idx;
        u64                     prev_count;
        s32                     next_count;
        u64                     irq_period;
};

/*
 * Hardcoded buffer length limit for now, for IRQ-fed events:
 */
#define PERF_DATA_BUFLEN        2048

/**
 * struct perf_data - performance counter IRQ data sampling ...
 */
struct perf_data {
        int                     len;
        int                     rd_idx;
        int                     overrun;
        u8                      data[PERF_DATA_BUFLEN];
};

/**
 * struct perf_counter - performance counter kernel representation:
 */
struct perf_counter {
        struct list_head                list;
        int                             active;
#if BITS_PER_LONG == 64
        atomic64_t                      count;
#else
        atomic_t                        count32[2];
#endif
        u64                             __irq_period;

        struct hw_perf_counter          hw;

        struct perf_counter_context     *ctx;
        struct task_struct              *task;

        /*
         * Protect attach/detach:
         */
        struct mutex                    mutex;

        int                             oncpu;
        int                             cpu;

        s32                             hw_event_type;
        enum perf_record_type           record_type;

        /* read() / irq related data */
        wait_queue_head_t               waitq;
        /* optional: for NMIs */
        int                             wakeup_pending;
        struct perf_data                *irqdata;
        struct perf_data                *usrdata;
        struct perf_data                data[2];
};
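
/*
 * Illustration (an editor's assumption about the core code, which is
 * collapsed in this view): irqdata and usrdata point into data[2] and
 * are swapped so that IRQ context can fill one buffer while read()
 * drains the other, roughly:
 *
 *      struct perf_data *tmp = counter->irqdata;
 *
 *      counter->irqdata = counter->usrdata;
 *      counter->usrdata = tmp;
 */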
/**
 * struct perf_counter_context - counter context structure
 *
 * Used as a container for task counters and CPU counters as well:
 */
struct perf_counter_context {
#ifdef CONFIG_PERF_COUNTERS
        /*
         * Protect the list of counters:
         */
        spinlock_t              lock;

        struct list_head        counters;
        int                     nr_counters;
        int                     nr_active;
        struct task_struct      *task;
#endif
};

/**
 * struct perf_cpu_context - per-CPU counter context structure
 */
struct perf_cpu_context {
        struct perf_counter_context     ctx;
        struct perf_counter_context     *task_ctx;
        int                             active_oncpu;
        int                             max_pertask;
};
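
/*
 * Illustration (an editor's assumption, the core file is collapsed in
 * this view): one of these presumably exists per CPU, along the lines of:
 *
 *      static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
 */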
/*
 * Set by architecture code:
 */
extern int perf_max_counters;

#ifdef CONFIG_PERF_COUNTERS
extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
extern void perf_counter_task_tick(struct task_struct *task, int cpu);
extern void perf_counter_init_task(struct task_struct *task);
extern void perf_counter_notify(struct pt_regs *regs);
extern void perf_counter_print_debug(void);
#else
static inline void
perf_counter_task_sched_in(struct task_struct *task, int cpu)           { }
static inline void
perf_counter_task_sched_out(struct task_struct *task, int cpu)          { }
static inline void
perf_counter_task_tick(struct task_struct *task, int cpu)               { }
static inline void perf_counter_init_task(struct task_struct *task)     { }
static inline void perf_counter_notify(struct pt_regs *regs)            { }
static inline void perf_counter_print_debug(void)                       { }
#endif

#endif /* _LINUX_PERF_COUNTER_H */
include/linux/sched.h:

@@ -71,6 +71,7 @@ struct sched_param {
 #include <linux/fs_struct.h>
 #include <linux/compiler.h>
 #include <linux/completion.h>
+#include <linux/perf_counter.h>
 #include <linux/pid.h>
 #include <linux/percpu.h>
 #include <linux/topology.h>
@@ -1326,6 +1327,7 @@ struct task_struct {
         struct list_head pi_state_list;
         struct futex_pi_state *pi_state_cache;
 #endif
+        struct perf_counter_context perf_counter_ctx;
 #ifdef CONFIG_NUMA
         struct mempolicy *mempolicy;
         short il_next;
@@ -2285,6 +2287,13 @@ static inline void inc_syscw(struct task_struct *tsk)
 #define TASK_SIZE_OF(tsk)       TASK_SIZE
 #endif

+/*
+ * Call the function if the target task is executing on a CPU right now:
+ */
+extern void task_oncpu_function_call(struct task_struct *p,
+                                     void (*func) (void *info), void *info);
+
 #ifdef CONFIG_MM_OWNER
 extern void mm_update_next_owner(struct mm_struct *mm);
 extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
include/linux/syscalls.h:

@@ -624,4 +624,10 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);

+asmlinkage int
+sys_perf_counter_open(u32 hw_event_type,
+                      u32 hw_event_period,
+                      u32 record_type,
+                      pid_t pid,
+                      int cpu);
 #endif
init/Kconfig:

@@ -732,6 +732,35 @@ config AIO
           by some high performance threaded applications. Disabling
           this option saves about 7k.

+config HAVE_PERF_COUNTERS
+        bool
+
+menu "Performance Counters"
+
+config PERF_COUNTERS
+        bool "Kernel Performance Counters"
+        depends on HAVE_PERF_COUNTERS
+        default y
+        help
+          Enable kernel support for performance counter hardware.
+
+          Performance counters are special hardware registers available
+          on most modern CPUs. These registers count the number of certain
+          types of hardware events, such as instructions executed, cache
+          misses suffered, or branches mispredicted, without slowing down
+          the kernel or applications. They can also trigger interrupts
+          when a threshold number of events has passed, and can thus be
+          used to profile the code that runs on that CPU.
+
+          The Linux Performance Counter subsystem provides an abstraction
+          of these hardware capabilities, available via a system call. It
+          provides per-task and per-CPU counters, and event capabilities
+          on top of those.
+
+          Say Y if unsure.
+
+endmenu
+
 config VM_EVENT_COUNTERS
         default y
         bool "Enable VM event counters for /proc/vmstat" if EMBEDDED
kernel/Makefile:

@@ -89,6 +89,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
+obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
kernel/fork.c:

@@ -975,6 +975,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                 goto fork_out;

         rt_mutex_init_task(p);
+        perf_counter_init_task(p);

 #ifdef CONFIG_PROVE_LOCKING
         DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
kernel/perf_counter.c (new file): this diff is collapsed in the original view. Per the kernel/Makefile rule above, it holds the core implementation.
kernel/sched.c:

@@ -2212,6 +2212,27 @@ static int sched_balance_self(int cpu, int flag)
 #endif /* CONFIG_SMP */

+/**
+ * task_oncpu_function_call - call a function on the cpu on which a task runs
+ * @p:          the task to evaluate
+ * @func:       the function to be called
+ * @info:       the function call argument
+ *
+ * Calls the function @func if the task @p is currently running; the call
+ * might land on the current CPU, in which case the function is invoked
+ * directly there.
+ */
+void task_oncpu_function_call(struct task_struct *p,
+                              void (*func) (void *info), void *info)
+{
+        int cpu;
+
+        preempt_disable();
+        cpu = task_cpu(p);
+        if (task_curr(p))
+                smp_call_function_single(cpu, func, info, 1);
+        preempt_enable();
+}
 /***
  * try_to_wake_up - wake up a thread
  * @p: the to-be-woken-up thread
@@ -2534,6 +2555,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
                     struct task_struct *next)
 {
         fire_sched_out_preempt_notifiers(prev, next);
+        perf_counter_task_sched_out(prev, cpu_of(rq));
         prepare_lock_switch(rq, next);
         prepare_arch_switch(next);
 }
@@ -2574,6 +2596,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
          */
         prev_state = prev->state;
         finish_arch_switch(prev);
+        perf_counter_task_sched_in(current, cpu_of(rq));
         finish_lock_switch(rq, prev);
 #ifdef CONFIG_SMP
         if (current->sched_class->post_schedule)
@@ -4296,6 +4319,7 @@ void scheduler_tick(void)
         rq->idle_at_tick = idle_cpu(cpu);
         trigger_load_balance(rq, cpu);
 #endif
+        perf_counter_task_tick(curr, cpu);
 }
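
A hedged sketch of how the counter core might use the helper added above
(the callback name and body here are hypothetical, not taken from this
commit): reading or tearing down a counter has to happen on the CPU where
the owning task is running, so the live hardware counter state stays
coherent:

 /* Hypothetical callback; runs on the task's CPU (via IPI, or
  * directly if that CPU is the local one): */
 static void __hypothetical_read_counter(void *info)
 {
 	struct perf_counter *counter = info;

 	/* ... read the live hardware counter for @counter here ... */
 }

 task_oncpu_function_call(counter->task,
 			  __hypothetical_read_counter, counter);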
kernel/sys_ni.c:

@@ -174,3 +174,6 @@ cond_syscall(compat_sys_timerfd_settime);
 cond_syscall(compat_sys_timerfd_gettime);
 cond_syscall(sys_eventfd);
 cond_syscall(sys_eventfd2);
+
+/* performance counters: */
+cond_syscall(sys_perf_counter_open);