Commit 6d084197 authored by Nick Piggin, committed by Pekka Enberg

SLQB slab allocator

Introducing the SLQB slab allocator.

SLQB takes code and ideas from all other slab allocators in the tree.

The primary method for keeping lists of free objects within the allocator
is a singly-linked list, storing a pointer within the object memory itself
(or in a small additional space in the case of RCU-destroyed slabs). This is
like SLOB and SLUB, as opposed to SLAB, which uses arrays of objects and
separate metadata. This reduces memory consumption and makes smaller objects
more practical, as there is less per-object overhead.
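
As an illustration of the idea (a minimal user-space sketch, not SLQB's
internal API; the struct and function names here are hypothetical), the
"next" pointer of a free object lives in the first word of the object
itself, so free objects need no separate metadata:

#include <stddef.h>

/* Sketch only: LIFO freelist that threads its links through the objects. */
struct freelist {
	void *head;		/* first free object, or NULL */
	unsigned long nr;	/* number of free objects on the list */
};

static void freelist_push(struct freelist *l, void *object)
{
	*(void **)object = l->head;	/* the link lives inside the object */
	l->head = object;
	l->nr++;
}

static void *freelist_pop(struct freelist *l)
{
	void *object = l->head;

	if (object) {
		l->head = *(void **)object;	/* follow the in-object link */
		l->nr--;
	}
	return object;
}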

Using lists rather than arrays can reduce the cacheline footprint. When
moving objects around, SLQB can move a list of objects from one CPU to
another by simply manipulating a head pointer, whereas SLAB needs to memcpy
arrays. Some SLAB per-CPU arrays can be up to 1K in size, which is a lot of
cachelines that can be touched during alloc/free. Newly freed objects tend
to be cache hot, and newly allocated ones tend to be touched soon anyway, so
there is often little cost to keeping the metadata in the objects.
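
A rough sketch of why lists make bulk transfers cheap (the helper below is
hypothetical, but the head/tail/nr layout mirrors the struct kmlist
introduced later in this patch): moving N objects is a couple of pointer
assignments, independent of N, whereas an array-based scheme has to copy N
pointers.

#include <stddef.h>

struct kmlist {
	unsigned long nr;
	void **head;	/* first object; its first word holds the next link */
	void **tail;	/* last object on the list */
};

/* Sketch: splice every object from 'src' onto the tail of 'dst' in O(1). */
static void kmlist_splice(struct kmlist *dst, struct kmlist *src)
{
	if (!src->nr)
		return;

	if (!dst->nr)
		dst->head = src->head;
	else
		*dst->tail = (void *)src->head;	/* link old tail to new head */

	dst->tail = src->tail;
	dst->nr += src->nr;
	src->head = src->tail = NULL;
	src->nr = 0;
}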

SLQB has a per-CPU LIFO freelist of objects like SLAB (but using lists
rather than arrays). Freed objects are returned to this freelist if they
belong to the node that the freeing CPU belongs to, so objects allocated on
one CPU can be added to the freelist of another CPU on the same node. When
LIFO freelists need to be refilled or trimmed, SLQB takes objects from, or
returns them to, a list of slabs.
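
The trim side could look roughly like the sketch below, continuing the
freelist sketch above (the thresholds and helper names are illustrative
assumptions; the real per-cache values are the hiwater and freebatch fields
of struct kmem_cache in this patch):

#define HIWATER_SKETCH		256	/* assumed freelist high watermark */
#define FREEBATCH_SKETCH	64	/* assumed flush batch size */

/* Return up to 'count' objects from the freelist back to their slabs. */
static void flush_free_list(struct freelist *l, unsigned long count)
{
	while (count-- && l->head)
		freelist_pop(l);	/* real code hands each object back to its slab page */
}

static void object_free(struct freelist *l, void *object)
{
	freelist_push(l, object);
	if (l->nr > HIWATER_SKETCH)	/* freelist grew past the watermark */
		flush_free_list(l, FREEBATCH_SKETCH);
}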

SLQB has per-CPU lists of slabs (which use struct page as their metadata,
including the list head for this list). Each slab contains a singly-linked
list of the objects that are free in that slab (free, and not on a LIFO
freelist). Slabs are freed as soon as all their objects are freed, and new
slabs are only allocated when no partial slabs remain. Slabs are taken off
this list when they have no free objects left, so the slab lists only ever
contain "partial" slabs: those which are neither completely full nor
completely empty. These per-CPU slab lists can be manipulated with no
locking, unlike other allocators, which tend to use per-node locks. As the
number of threads per socket increases, this should help the scalability of
slab operations.
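
A simplified sketch of that partial-list bookkeeping (the struct below is a
hypothetical stand-in for the struct page fields SLQB reuses, and the list
handling is reduced to a singly-linked list for brevity):

#include <stddef.h>

struct slab_page {			/* stand-in for struct page metadata */
	void *freelist;			/* free objects within this slab */
	int inuse;			/* objects currently allocated from it */
	int objects;			/* total objects the slab holds */
	struct slab_page *next;		/* linkage on the per-CPU partial list */
};

static struct slab_page *partial_list;	/* per-CPU in SLQB, hence no locking */

static void slab_object_returned(struct slab_page *page)
{
	page->inuse--;
	if (page->inuse == 0) {
		/* Completely empty: unlink from the partial list and give
		 * the page back to the page allocator (omitted here). */
	} else if (page->inuse == page->objects - 1) {
		/* Was completely full, so it was on no list; it now has a
		 * free object, so it rejoins the partial list. */
		page->next = partial_list;
		partial_list = page;
	}
}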

Freeing objects to remote slab lists first batches up the objects on the
freeing CPU, then moves them over at once to a list on the allocating CPU.
The allocating CPU will then notice those objects and pull them onto the end
of its freelist. This remote freeing scheme is designed to minimise the
number of cross-CPU cachelines touched, short of going to a "crossbar"
arrangement like SLAB has. SLAB has "crossbars" of arrays of objects, that
is, NR_CPUS*MAX_NUMNODES arrays per cache, which can become very bloated on
huge systems (the kmem cache arrays could consume hundreds of GB on a
4096-CPU, 1024-node system).
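
Roughly, and with hypothetical names (the corresponding structures in the
patch are rlist, remote_free and remote_free_check), the freeing side of
this scheme could be sketched as below, building on the kmlist sketch above;
a userspace pthread spinlock stands in for the kernel spinlock:

#include <pthread.h>
#include <stddef.h>

struct remote_free_sketch {
	pthread_spinlock_t lock;	/* stand-in for the kernel spinlock */
	struct kmlist list;		/* objects freed back to us by other CPUs */
	int check;			/* set when the owner should claim the list */
};

#define REMOTE_BATCH_SKETCH	32	/* assumed batch size before handover */

/* Runs on the freeing CPU: queue locally, hand over a whole batch at once. */
static void remote_free_object(struct kmlist *rlist,
			       struct remote_free_sketch *owner, void *object)
{
	*(void **)object = NULL;	/* object becomes the new list tail */
	if (!rlist->nr)
		rlist->head = object;
	else
		*rlist->tail = object;
	rlist->tail = object;
	rlist->nr++;

	if (rlist->nr >= REMOTE_BATCH_SKETCH) {
		pthread_spin_lock(&owner->lock);
		kmlist_splice(&owner->list, rlist);	/* one O(1) transfer */
		owner->check = 1;			/* owner pulls these later */
		pthread_spin_unlock(&owner->lock);
	}
}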

SLQB also has similar freelist and slab-list structures per node, protected
by a lock and usable by any CPU in order to do node-specific allocations.
These allocations tend not to be too frequent (short-lived allocations
should be node-local, and long-lived allocations should not be too
frequent).
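
Continuing the sketches above, a node-targeted allocation simply takes the
per-node lock around the same freelist operations (again, the names are
illustrative, not the patch's functions):

#include <pthread.h>
#include <stddef.h>

struct node_list_sketch {
	pthread_spinlock_t lock;	/* stand-in for the per-node list_lock */
	struct freelist freelist;	/* same layout as the per-CPU list */
};

static void *alloc_on_node(struct node_list_sketch *n)
{
	void *object;

	pthread_spin_lock(&n->lock);	/* any CPU may allocate from this node */
	object = freelist_pop(&n->freelist);
	pthread_spin_unlock(&n->lock);

	return object;			/* NULL: refill from a slab or the page allocator */
}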

There is a good overview and illustration of the design here:

http://lwn.net/Articles/311502/

By using LIFO freelists like SLAB, SLQB tries to be very page-size agnostic.
It tries very hard to use order-0 pages. This is good for both page allocator
fragmentation, and slab fragmentation.

SLQB initialisation code attempts to be as simple and un-clever as possible.
There are no multiple phases where different things come up, and no weird
self-bootstrapping. It just statically allocates the structures required to
create the slabs that allocate the other slab structures.

SLQB reuses much of the debugging infrastructure and the fine-grained sysfs
statistics from SLUB. There is also Documentation/vm/slqbinfo.c, derived
from slabinfo.c, which can query the sysfs data.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
parent 0cc6d77e
@@ -49,6 +49,7 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
* virt_addr_valid(kaddr) returns true.
*/
#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
#define virt_to_page_fast(kaddr) pfn_to_page(((unsigned long)(kaddr) - PAGE_OFFSET) >> PAGE_SHIFT)
#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
extern bool __virt_addr_valid(unsigned long kaddr);
#define virt_addr_valid(kaddr) __virt_addr_valid((unsigned long) (kaddr))
......
@@ -305,7 +305,11 @@ static inline void get_page(struct page *page)
static inline struct page *virt_to_head_page(const void *x)
{
#ifdef virt_to_page_fast
	struct page *page = virt_to_page_fast(x);
#else
	struct page *page = virt_to_page(x);
#endif
	return compound_head(page);
}
......
#ifndef __LINUX_RCU_TYPES_H
#define __LINUX_RCU_TYPES_H

#ifdef __KERNEL__

/**
 * struct rcu_head - callback structure for use with RCU
 * @next: next update requests in a list
 * @func: actual update function to call after the grace period.
 */
struct rcu_head {
	struct rcu_head *next;
	void (*func)(struct rcu_head *head);
};

#endif

#endif
@@ -33,6 +33,7 @@
#ifndef __LINUX_RCUPDATE_H
#define __LINUX_RCUPDATE_H
#include <linux/rcu_types.h>
#include <linux/cache.h>
#include <linux/spinlock.h>
#include <linux/threads.h>
@@ -41,16 +42,6 @@
#include <linux/lockdep.h>
#include <linux/completion.h>
/**
 * struct rcu_head - callback structure for use with RCU
 * @next: next update requests in a list
 * @func: actual update function to call after the grace period.
 */
struct rcu_head {
	struct rcu_head *next;
	void (*func)(struct rcu_head *head);
};
/* Exported common interfaces */
extern void synchronize_rcu(void);
extern void synchronize_rcu_bh(void);
......
@@ -74,6 +74,10 @@
/* The following flags affect the page allocator grouping pages by mobility */
#define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */
#define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */
/* Following flags should only be used by allocator specific flags */
#define SLAB_ALLOC_PRIVATE 0x000000ffUL
/*
* ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
*
@@ -160,6 +164,8 @@ size_t ksize(const void *);
*/
#ifdef CONFIG_SLUB
#include <linux/slub_def.h>
#elif defined(CONFIG_SLQB)
#include <linux/slqb_def.h>
#elif defined(CONFIG_SLOB)
#include <linux/slob_def.h>
#else
@@ -262,7 +268,7 @@ static inline void *kmem_cache_alloc_node(struct kmem_cache *cachep,
* allocator where we care about the real place the memory allocation
* request comes from.
*/
#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB)
#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) || defined(CONFIG_SLQB_DEBUG)
extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long);
#define kmalloc_track_caller(size, flags) \
	__kmalloc_track_caller(size, flags, _RET_IP_)
@@ -280,7 +286,7 @@ extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long);
* standard allocator where we care about the real place the memory
* allocation request comes from.
*/
#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB)
#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) || defined(CONFIG_SLQB_DEBUG)
extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long);
#define kmalloc_node_track_caller(size, flags, node) \
	__kmalloc_node_track_caller(size, flags, node, \
......
#ifndef _LINUX_SLQB_DEF_H
#define _LINUX_SLQB_DEF_H

/*
 * SLQB : A slab allocator with object queues.
 *
 * (C) 2008 Nick Piggin <npiggin@suse.de>
 */
#include <linux/types.h>
#include <linux/gfp.h>
#include <linux/workqueue.h>
#include <linux/kobject.h>
#include <linux/rcu_types.h>
#include <linux/mm_types.h>
#include <linux/kernel.h>
#include <linux/kobject.h>

#define SLAB_NUMA	0x00000001UL	/* shortcut */
enum stat_item {
	ALLOC,				/* Allocation count */
	ALLOC_SLAB_FILL,		/* Fill freelist from page list */
	ALLOC_SLAB_NEW,			/* New slab acquired from page allocator */
	FREE,				/* Free count */
	FREE_REMOTE,			/* NUMA: freeing to remote list */
	FLUSH_FREE_LIST,		/* Freelist flushed */
	FLUSH_FREE_LIST_OBJECTS,	/* Objects flushed from freelist */
	FLUSH_FREE_LIST_REMOTE,		/* Objects flushed from freelist to remote */
	FLUSH_SLAB_PARTIAL,		/* Freeing moves slab to partial list */
	FLUSH_SLAB_FREE,		/* Slab freed to the page allocator */
	FLUSH_RFREE_LIST,		/* Rfree list flushed */
	FLUSH_RFREE_LIST_OBJECTS,	/* Rfree objects flushed */
	CLAIM_REMOTE_LIST,		/* Remote freed list claimed */
	CLAIM_REMOTE_LIST_OBJECTS,	/* Remote freed objects claimed */
	NR_SLQB_STAT_ITEMS
};
/*
 * Singly-linked list with head, tail, and nr
 */
struct kmlist {
	unsigned long	nr;
	void		**head;
	void		**tail;
};

/*
 * Every kmem_cache_list has a kmem_cache_remote_free structure, by which
 * objects can be returned to the kmem_cache_list from remote CPUs.
 */
struct kmem_cache_remote_free {
	spinlock_t	lock;
	struct kmlist	list;
} ____cacheline_aligned;
/*
 * A kmem_cache_list manages all the slabs and objects allocated from a given
 * source. Per-cpu kmem_cache_lists allow node-local allocations. Per-node
 * kmem_cache_lists allow off-node allocations (but require locking).
 */
struct kmem_cache_list {
	/* Fastpath LIFO freelist of objects */
	struct kmlist			freelist;

#ifdef CONFIG_SMP
	/* remote_free has reached a watermark */
	int				remote_free_check;
#endif

	/* kmem_cache corresponding to this list */
	struct kmem_cache		*cache;

	/* Number of partial slabs (pages) */
	unsigned long			nr_partial;

	/* Slabs which have some free objects */
	struct list_head		partial;

	/* Total number of slabs allocated */
	unsigned long			nr_slabs;

#ifdef CONFIG_SMP
	/*
	 * In the case of per-cpu lists, remote_free is for objects freed by
	 * non-owner CPU back to its home list. For per-node lists, remote_free
	 * is always used to free objects.
	 */
	struct kmem_cache_remote_free	remote_free;
#endif

#ifdef CONFIG_SLQB_STATS
	unsigned long			stats[NR_SLQB_STAT_ITEMS];
#endif
} ____cacheline_aligned;
/*
 * Primary per-cpu, per-kmem_cache structure.
 */
struct kmem_cache_cpu {
	struct kmem_cache_list	list;		/* List for node-local slabs */
	unsigned int		colour_next;	/* Next colour offset to use */

#ifdef CONFIG_SMP
	/*
	 * rlist is a list of objects that don't fit on list.freelist (ie.
	 * wrong node). The objects all correspond to a given kmem_cache_list,
	 * remote_cache_list. To free objects to another list, we must first
	 * flush the existing objects, then switch remote_cache_list.
	 *
	 * An NR_CPUS or MAX_NUMNODES array would be nice here, but then we
	 * get to O(NR_CPUS^2) memory consumption situation.
	 */
	struct kmlist		rlist;
	struct kmem_cache_list	*remote_cache_list;
#endif
} ____cacheline_aligned;
/*
 * Per-node, per-kmem_cache structure. Used for node-specific allocations.
 */
struct kmem_cache_node {
	struct kmem_cache_list	list;
	spinlock_t		list_lock;	/* protects access to list */
} ____cacheline_aligned;

/*
 * Management object for a slab cache.
 */
struct kmem_cache {
	unsigned long	flags;
	int		hiwater;	/* LIFO list high watermark */
	int		freebatch;	/* LIFO freelist batch flush size */
	int		objsize;	/* Size of object without meta data */
	int		offset;		/* Free pointer offset. */
	int		objects;	/* Number of objects in slab */

	int		size;		/* Size of object including meta data */
	int		order;		/* Allocation order */
	gfp_t		allocflags;	/* gfp flags to use on allocation */
	unsigned int	colour_range;	/* range of colour counter */
	unsigned int	colour_off;	/* offset per colour */
	void		(*ctor)(void *);

	const char	*name;		/* Name (only for display!) */
	struct list_head list;		/* List of slab caches */

	int		align;		/* Alignment */
	int		inuse;		/* Offset to metadata */

#ifdef CONFIG_SLQB_SYSFS
	struct kobject	kobj;		/* For sysfs */
#endif
#ifdef CONFIG_NUMA
	struct kmem_cache_node	*node[MAX_NUMNODES];
#endif
#ifdef CONFIG_SMP
	struct kmem_cache_cpu	*cpu_slab[NR_CPUS];
#else
	struct kmem_cache_cpu	cpu_slab;
#endif
};
/*
 * Kmalloc subsystem.
 */
#if defined(ARCH_KMALLOC_MINALIGN) && ARCH_KMALLOC_MINALIGN > 8
#define KMALLOC_MIN_SIZE ARCH_KMALLOC_MINALIGN
#else
#define KMALLOC_MIN_SIZE 8
#endif

#define KMALLOC_SHIFT_LOW	ilog2(KMALLOC_MIN_SIZE)
#define KMALLOC_SHIFT_SLQB_HIGH	(PAGE_SHIFT + 9)

extern struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_SLQB_HIGH + 1];
extern struct kmem_cache kmalloc_caches_dma[KMALLOC_SHIFT_SLQB_HIGH + 1];
/*
 * Constant size allocations use this path to find index into kmalloc caches
 * arrays. get_slab() function is used for non-constant sizes.
 */
static __always_inline int kmalloc_index(size_t size)
{
	if (unlikely(!size))
		return 0;
	if (unlikely(size > 1UL << KMALLOC_SHIFT_SLQB_HIGH))
		return 0;

	if (unlikely(size <= KMALLOC_MIN_SIZE))
		return KMALLOC_SHIFT_LOW;

#if L1_CACHE_BYTES < 64
	if (size > 64 && size <= 96)
		return 1;
#endif
#if L1_CACHE_BYTES < 128
	if (size > 128 && size <= 192)
		return 2;
#endif
	if (size <= 8) return 3;
	if (size <= 16) return 4;
	if (size <= 32) return 5;
	if (size <= 64) return 6;
	if (size <= 128) return 7;
	if (size <= 256) return 8;
	if (size <= 512) return 9;
	if (size <= 1024) return 10;
	if (size <= 2 * 1024) return 11;
	if (size <= 4 * 1024) return 12;
	if (size <= 8 * 1024) return 13;
	if (size <= 16 * 1024) return 14;
	if (size <= 32 * 1024) return 15;
	if (size <= 64 * 1024) return 16;
	if (size <= 128 * 1024) return 17;
	if (size <= 256 * 1024) return 18;
	if (size <= 512 * 1024) return 19;
	if (size <= 1024 * 1024) return 20;
	if (size <= 2 * 1024 * 1024) return 21;
	return -1;
}
#ifdef CONFIG_ZONE_DMA
#define SLQB_DMA __GFP_DMA
#else
/* Disable "DMA slabs" */
#define SLQB_DMA (__force gfp_t)0
#endif
/*
 * Find the kmalloc slab cache for a given combination of allocation flags and
 * size. Should really only be used for constant 'size' arguments, due to
 * bloat.
 */
static __always_inline struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
{
	int index;

	BUILD_BUG_ON(!__builtin_constant_p(size));

	index = kmalloc_index(size);
	if (unlikely(index == 0))
		return NULL;

	if (likely(!(flags & SLQB_DMA)))
		return &kmalloc_caches[index];
	else
		return &kmalloc_caches_dma[index];
}
void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
void *__kmalloc(size_t size, gfp_t flags);
#ifndef ARCH_KMALLOC_MINALIGN
#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
#endif
#ifndef ARCH_SLAB_MINALIGN
#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
#endif
#define KMALLOC_HEADER (ARCH_KMALLOC_MINALIGN < sizeof(void *) ? \
			sizeof(void *) : ARCH_KMALLOC_MINALIGN)
static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
	if (__builtin_constant_p(size)) {
		struct kmem_cache *s;

		s = kmalloc_slab(size, flags);
		if (unlikely(ZERO_OR_NULL_PTR(s)))
			return s;

		return kmem_cache_alloc(s, flags);
	}
	return __kmalloc(size, flags);
}
#ifdef CONFIG_NUMA
void *__kmalloc_node(size_t size, gfp_t flags, int node);
void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);

static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
{
	if (__builtin_constant_p(size)) {
		struct kmem_cache *s;

		s = kmalloc_slab(size, flags);
		if (unlikely(ZERO_OR_NULL_PTR(s)))
			return s;

		return kmem_cache_alloc_node(s, flags, node);
	}
	return __kmalloc_node(size, flags, node);
}
#endif
#endif /* _LINUX_SLQB_DEF_H */