Commit 6d084197 authored by Nick Piggin, committed by Pekka Enberg

SLQB slab allocator

Introducing the SLQB slab allocator.

SLQB takes code and ideas from all other slab allocators in the tree.

The primary method for keeping lists of free objects within the allocator
is a singly-linked list, storing a pointer within the object memory itself
(or in a small additional space in the case of RCU-destroyed slabs). This is
like SLOB and SLUB, as opposed to SLAB, which uses arrays of objects and
separate metadata. This reduces memory consumption and makes smaller objects
more practical, as there is less per-object overhead.
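
As an illustration of the idea (a minimal user-space sketch, not SLQB's
internal API; the struct and function names here are hypothetical), the
"next" pointer of a free object lives in the first word of the object
itself, so free objects need no separate metadata:

#include <stddef.h>

/* Sketch only: LIFO freelist that threads its links through the objects. */
struct freelist {
	void *head;		/* first free object, or NULL */
	unsigned long nr;	/* number of free objects on the list */
};

static void freelist_push(struct freelist *l, void *object)
{
	*(void **)object = l->head;	/* the link lives inside the object */
	l->head = object;
	l->nr++;
}

static void *freelist_pop(struct freelist *l)
{
	void *object = l->head;

	if (object) {
		l->head = *(void **)object;	/* follow the in-object link */
		l->nr--;
	}
	return object;
}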

Using lists rather than arrays can reduce the cacheline footprint. When
moving objects around, SLQB can move a list of objects from one CPU to
another by simply manipulating a head pointer, whereas SLAB needs to memcpy
arrays. Some SLAB per-CPU arrays can be up to 1K in size, which is a lot of
cachelines that can be touched during alloc/free. Newly freed objects tend
to be cache hot, and newly allocated ones tend to be touched soon anyway, so
there is often little cost to keeping the metadata in the objects.
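
A rough sketch of why lists make bulk transfers cheap (the helper below is
hypothetical, but the head/tail/nr layout mirrors the struct kmlist
introduced later in this patch): moving N objects is a couple of pointer
assignments, independent of N, whereas an array-based scheme has to copy N
pointers.

#include <stddef.h>

struct kmlist {
	unsigned long nr;
	void **head;	/* first object; its first word holds the next link */
	void **tail;	/* last object on the list */
};

/* Sketch: splice every object from 'src' onto the tail of 'dst' in O(1). */
static void kmlist_splice(struct kmlist *dst, struct kmlist *src)
{
	if (!src->nr)
		return;

	if (!dst->nr)
		dst->head = src->head;
	else
		*dst->tail = (void *)src->head;	/* link old tail to new head */

	dst->tail = src->tail;
	dst->nr += src->nr;
	src->head = src->tail = NULL;
	src->nr = 0;
}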

SLQB has a per-CPU LIFO freelist of objects like SLAB (but using lists
rather than arrays). Freed objects are returned to this freelist if they
belong to the node that the freeing CPU belongs to, so objects allocated on
one CPU can be added to the freelist of another CPU on the same node. When
LIFO freelists need to be refilled or trimmed, SLQB takes objects from, or
returns them to, a list of slabs.
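
The trim side could look roughly like the sketch below, continuing the
freelist sketch above (the thresholds and helper names are illustrative
assumptions; the real per-cache values are the hiwater and freebatch fields
of struct kmem_cache in this patch):

#define HIWATER_SKETCH		256	/* assumed freelist high watermark */
#define FREEBATCH_SKETCH	64	/* assumed flush batch size */

/* Return up to 'count' objects from the freelist back to their slabs. */
static void flush_free_list(struct freelist *l, unsigned long count)
{
	while (count-- && l->head)
		freelist_pop(l);	/* real code hands each object back to its slab page */
}

static void object_free(struct freelist *l, void *object)
{
	freelist_push(l, object);
	if (l->nr > HIWATER_SKETCH)	/* freelist grew past the watermark */
		flush_free_list(l, FREEBATCH_SKETCH);
}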

SLQB has per-CPU lists of slabs (which use struct page as their metadata,
including the list head for this list). Each slab contains a singly-linked
list of the objects that are free in that slab (free, and not on a LIFO
freelist). Slabs are freed as soon as all their objects are freed, and new
slabs are only allocated when no partial slabs remain. Slabs are taken off
this list when they have no free objects left, so the slab lists only ever
contain "partial" slabs: those which are neither completely full nor
completely empty. These per-CPU slab lists can be manipulated with no
locking, unlike other allocators, which tend to use per-node locks. As the
number of threads per socket increases, this should help the scalability of
slab operations.
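
A simplified sketch of that partial-list bookkeeping (the struct below is a
hypothetical stand-in for the struct page fields SLQB reuses, and the list
handling is reduced to a singly-linked list for brevity):

#include <stddef.h>

struct slab_page {			/* stand-in for struct page metadata */
	void *freelist;			/* free objects within this slab */
	int inuse;			/* objects currently allocated from it */
	int objects;			/* total objects the slab holds */
	struct slab_page *next;		/* linkage on the per-CPU partial list */
};

static struct slab_page *partial_list;	/* per-CPU in SLQB, hence no locking */

static void slab_object_returned(struct slab_page *page)
{
	page->inuse--;
	if (page->inuse == 0) {
		/* Completely empty: unlink from the partial list and give
		 * the page back to the page allocator (omitted here). */
	} else if (page->inuse == page->objects - 1) {
		/* Was completely full, so it was on no list; it now has a
		 * free object, so it rejoins the partial list. */
		page->next = partial_list;
		partial_list = page;
	}
}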

Freeing objects to remote slab lists first batches up the objects on the
freeing CPU, then moves them over at once to a list on the allocating CPU.
The allocating CPU will then notice those objects and pull them onto the end
of its freelist. This remote freeing scheme is designed to minimise the
number of cross-CPU cachelines touched, short of going to a "crossbar"
arrangement like SLAB has. SLAB has "crossbars" of arrays of objects, that
is, NR_CPUS*MAX_NUMNODES arrays per cache, which can become very bloated on
huge systems (the kmem cache arrays could consume hundreds of GB on a
4096-CPU, 1024-node system).
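
Roughly, and with hypothetical names (the corresponding structures in the
patch are rlist, remote_free and remote_free_check), the freeing side of
this scheme could be sketched as below, building on the kmlist sketch above;
a userspace pthread spinlock stands in for the kernel spinlock:

#include <pthread.h>
#include <stddef.h>

struct remote_free_sketch {
	pthread_spinlock_t lock;	/* stand-in for the kernel spinlock */
	struct kmlist list;		/* objects freed back to us by other CPUs */
	int check;			/* set when the owner should claim the list */
};

#define REMOTE_BATCH_SKETCH	32	/* assumed batch size before handover */

/* Runs on the freeing CPU: queue locally, hand over a whole batch at once. */
static void remote_free_object(struct kmlist *rlist,
			       struct remote_free_sketch *owner, void *object)
{
	*(void **)object = NULL;	/* object becomes the new list tail */
	if (!rlist->nr)
		rlist->head = object;
	else
		*rlist->tail = object;
	rlist->tail = object;
	rlist->nr++;

	if (rlist->nr >= REMOTE_BATCH_SKETCH) {
		pthread_spin_lock(&owner->lock);
		kmlist_splice(&owner->list, rlist);	/* one O(1) transfer */
		owner->check = 1;			/* owner pulls these later */
		pthread_spin_unlock(&owner->lock);
	}
}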

SLQB also has similar freelist and slab-list structures per node, protected
by a lock and usable by any CPU in order to do node-specific allocations.
These allocations tend not to be too frequent (short-lived allocations
should be node-local, and long-lived allocations should not be too
frequent).
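
Continuing the sketches above, a node-targeted allocation simply takes the
per-node lock around the same freelist operations (again, the names are
illustrative, not the patch's functions):

#include <pthread.h>
#include <stddef.h>

struct node_list_sketch {
	pthread_spinlock_t lock;	/* stand-in for the per-node list_lock */
	struct freelist freelist;	/* same layout as the per-CPU list */
};

static void *alloc_on_node(struct node_list_sketch *n)
{
	void *object;

	pthread_spin_lock(&n->lock);	/* any CPU may allocate from this node */
	object = freelist_pop(&n->freelist);
	pthread_spin_unlock(&n->lock);

	return object;			/* NULL: refill from a slab or the page allocator */
}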

There is a good overview and illustration of the design here:

http://lwn.net/Articles/311502/

By using LIFO freelists like SLAB, SLQB tries to be very page-size agnostic.
It tries very hard to use order-0 pages. This is good for both page allocator
fragmentation, and slab fragmentation.

SLQB initialisation code attempts to be as simple and un-clever as possible.
There are no multiple phases where different things come up, and no weird
self-bootstrapping. It just statically allocates the structures required to
create the slabs that allocate the other slab structures.

SLQB reuses much of the debugging infrastructure and the fine-grained sysfs
statistics from SLUB. There is also Documentation/vm/slqbinfo.c, derived
from slabinfo.c, which can query the sysfs data.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
parent 0cc6d77e
@@ -49,6 +49,7 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
* virt_addr_valid(kaddr) returns true.
*/
#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
#define virt_to_page_fast(kaddr) pfn_to_page(((unsigned long)(kaddr) - PAGE_OFFSET) >> PAGE_SHIFT)
#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
extern bool __virt_addr_valid(unsigned long kaddr);
#define virt_addr_valid(kaddr) __virt_addr_valid((unsigned long) (kaddr))
......
@@ -305,7 +305,11 @@ static inline void get_page(struct page *page)
static inline struct page *virt_to_head_page(const void *x)
{
#ifdef virt_to_page_fast
	struct page *page = virt_to_page_fast(x);
#else
	struct page *page = virt_to_page(x);
#endif
	return compound_head(page);
}
......
#ifndef __LINUX_RCU_TYPES_H
#define __LINUX_RCU_TYPES_H

#ifdef __KERNEL__

/**
 * struct rcu_head - callback structure for use with RCU
 * @next: next update requests in a list
 * @func: actual update function to call after the grace period.
 */
struct rcu_head {
	struct rcu_head *next;
	void (*func)(struct rcu_head *head);
};

#endif

#endif
@@ -33,6 +33,7 @@
#ifndef __LINUX_RCUPDATE_H
#define __LINUX_RCUPDATE_H
#include <linux/rcu_types.h>
#include <linux/cache.h>
#include <linux/spinlock.h>
#include <linux/threads.h>
@@ -41,16 +42,6 @@
#include <linux/lockdep.h>
#include <linux/completion.h>
/**
 * struct rcu_head - callback structure for use with RCU
 * @next: next update requests in a list
 * @func: actual update function to call after the grace period.
 */
struct rcu_head {
	struct rcu_head *next;
	void (*func)(struct rcu_head *head);
};
/* Exported common interfaces */
extern void synchronize_rcu(void);
extern void synchronize_rcu_bh(void);
......
@@ -74,6 +74,10 @@
/* The following flags affect the page allocator grouping pages by mobility */
#define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */
#define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */
/* Following flags should only be used by allocator specific flags */
#define SLAB_ALLOC_PRIVATE 0x000000ffUL
/*
* ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
*
@@ -160,6 +164,8 @@ size_t ksize(const void *);
*/
#ifdef CONFIG_SLUB
#include <linux/slub_def.h>
#elif defined(CONFIG_SLQB)
#include <linux/slqb_def.h>
#elif defined(CONFIG_SLOB)
#include <linux/slob_def.h>
#else
@@ -262,7 +268,7 @@ static inline void *kmem_cache_alloc_node(struct kmem_cache *cachep,
* allocator where we care about the real place the memory allocation
* request comes from.
*/
#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB)
#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) || defined(CONFIG_SLQB_DEBUG)
extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long);
#define kmalloc_track_caller(size, flags) \
	__kmalloc_track_caller(size, flags, _RET_IP_)
@@ -280,7 +286,7 @@ extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long);
* standard allocator where we care about the real place the memory
* allocation request comes from.
*/
#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB)
#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) || defined(CONFIG_SLQB_DEBUG)
extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long);
#define kmalloc_node_track_caller(size, flags, node) \
	__kmalloc_node_track_caller(size, flags, node, \
......
#ifndef _LINUX_SLQB_DEF_H
#define _LINUX_SLQB_DEF_H

/*
 * SLQB : A slab allocator with object queues.
 *
 * (C) 2008 Nick Piggin <npiggin@suse.de>
 */
#include <linux/types.h>
#include <linux/gfp.h>
#include <linux/workqueue.h>
#include <linux/kobject.h>
#include <linux/rcu_types.h>
#include <linux/mm_types.h>
#include <linux/kernel.h>
#include <linux/kobject.h>

#define SLAB_NUMA	0x00000001UL	/* shortcut */
enum stat_item {
	ALLOC,				/* Allocation count */
	ALLOC_SLAB_FILL,		/* Fill freelist from page list */
	ALLOC_SLAB_NEW,			/* New slab acquired from page allocator */
	FREE,				/* Free count */
	FREE_REMOTE,			/* NUMA: freeing to remote list */
	FLUSH_FREE_LIST,		/* Freelist flushed */
	FLUSH_FREE_LIST_OBJECTS,	/* Objects flushed from freelist */
	FLUSH_FREE_LIST_REMOTE,		/* Objects flushed from freelist to remote */
	FLUSH_SLAB_PARTIAL,		/* Freeing moves slab to partial list */
	FLUSH_SLAB_FREE,		/* Slab freed to the page allocator */
	FLUSH_RFREE_LIST,		/* Rfree list flushed */
	FLUSH_RFREE_LIST_OBJECTS,	/* Rfree objects flushed */
	CLAIM_REMOTE_LIST,		/* Remote freed list claimed */
	CLAIM_REMOTE_LIST_OBJECTS,	/* Remote freed objects claimed */
	NR_SLQB_STAT_ITEMS
};
/*
 * Singly-linked list with head, tail, and nr
 */
struct kmlist {
	unsigned long	nr;
	void		**head;
	void		**tail;
};

/*
 * Every kmem_cache_list has a kmem_cache_remote_free structure, by which
 * objects can be returned to the kmem_cache_list from remote CPUs.
 */
struct kmem_cache_remote_free {
	spinlock_t	lock;
	struct kmlist	list;
} ____cacheline_aligned;
/*
 * A kmem_cache_list manages all the slabs and objects allocated from a given
 * source. Per-cpu kmem_cache_lists allow node-local allocations. Per-node
 * kmem_cache_lists allow off-node allocations (but require locking).
 */
struct kmem_cache_list {
	/* Fastpath LIFO freelist of objects */
	struct kmlist			freelist;

#ifdef CONFIG_SMP
	/* remote_free has reached a watermark */
	int				remote_free_check;
#endif

	/* kmem_cache corresponding to this list */
	struct kmem_cache		*cache;

	/* Number of partial slabs (pages) */
	unsigned long			nr_partial;

	/* Slabs which have some free objects */
	struct list_head		partial;

	/* Total number of slabs allocated */
	unsigned long			nr_slabs;

#ifdef CONFIG_SMP
	/*
	 * In the case of per-cpu lists, remote_free is for objects freed by
	 * non-owner CPU back to its home list. For per-node lists, remote_free
	 * is always used to free objects.
	 */
	struct kmem_cache_remote_free	remote_free;
#endif

#ifdef CONFIG_SLQB_STATS
	unsigned long			stats[NR_SLQB_STAT_ITEMS];
#endif
} ____cacheline_aligned;
/*
 * Primary per-cpu, per-kmem_cache structure.
 */
struct kmem_cache_cpu {
	struct kmem_cache_list	list;		/* List for node-local slabs */
	unsigned int		colour_next;	/* Next colour offset to use */

#ifdef CONFIG_SMP
	/*
	 * rlist is a list of objects that don't fit on list.freelist (ie.
	 * wrong node). The objects all correspond to a given kmem_cache_list,
	 * remote_cache_list. To free objects to another list, we must first
	 * flush the existing objects, then switch remote_cache_list.
	 *
	 * An NR_CPUS or MAX_NUMNODES array would be nice here, but then we
	 * get to O(NR_CPUS^2) memory consumption situation.
	 */
	struct kmlist		rlist;
	struct kmem_cache_list	*remote_cache_list;
#endif
} ____cacheline_aligned;
/*
 * Per-node, per-kmem_cache structure. Used for node-specific allocations.
 */
struct kmem_cache_node {
	struct kmem_cache_list	list;
	spinlock_t		list_lock;	/* protects access to list */
} ____cacheline_aligned;

/*
 * Management object for a slab cache.
 */
struct kmem_cache {
	unsigned long	flags;
	int		hiwater;	/* LIFO list high watermark */
	int		freebatch;	/* LIFO freelist batch flush size */
	int		objsize;	/* Size of object without meta data */
	int		offset;		/* Free pointer offset. */
	int		objects;	/* Number of objects in slab */

	int		size;		/* Size of object including meta data */
	int		order;		/* Allocation order */
	gfp_t		allocflags;	/* gfp flags to use on allocation */
	unsigned int	colour_range;	/* range of colour counter */
	unsigned int	colour_off;	/* offset per colour */
	void		(*ctor)(void *);

	const char	*name;		/* Name (only for display!) */
	struct list_head list;		/* List of slab caches */

	int		align;		/* Alignment */
	int		inuse;		/* Offset to metadata */

#ifdef CONFIG_SLQB_SYSFS
	struct kobject	kobj;		/* For sysfs */
#endif
#ifdef CONFIG_NUMA
	struct kmem_cache_node	*node[MAX_NUMNODES];
#endif
#ifdef CONFIG_SMP
	struct kmem_cache_cpu	*cpu_slab[NR_CPUS];
#else
	struct kmem_cache_cpu	cpu_slab;
#endif
};
/*
 * Kmalloc subsystem.
 */
#if defined(ARCH_KMALLOC_MINALIGN) && ARCH_KMALLOC_MINALIGN > 8
#define KMALLOC_MIN_SIZE ARCH_KMALLOC_MINALIGN
#else
#define KMALLOC_MIN_SIZE 8
#endif

#define KMALLOC_SHIFT_LOW	ilog2(KMALLOC_MIN_SIZE)
#define KMALLOC_SHIFT_SLQB_HIGH	(PAGE_SHIFT + 9)

extern struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_SLQB_HIGH + 1];
extern struct kmem_cache kmalloc_caches_dma[KMALLOC_SHIFT_SLQB_HIGH + 1];
/*
 * Constant size allocations use this path to find index into kmalloc caches
 * arrays. get_slab() function is used for non-constant sizes.
 */
static __always_inline int kmalloc_index(size_t size)
{
	if (unlikely(!size))
		return 0;
	if (unlikely(size > 1UL << KMALLOC_SHIFT_SLQB_HIGH))
		return 0;

	if (unlikely(size <= KMALLOC_MIN_SIZE))
		return KMALLOC_SHIFT_LOW;

#if L1_CACHE_BYTES < 64
	if (size > 64 && size <= 96)
		return 1;
#endif
#if L1_CACHE_BYTES < 128
	if (size > 128 && size <= 192)
		return 2;
#endif
	if (size <= 8) return 3;
	if (size <= 16) return 4;
	if (size <= 32) return 5;
	if (size <= 64) return 6;
	if (size <= 128) return 7;
	if (size <= 256) return 8;
	if (size <= 512) return 9;
	if (size <= 1024) return 10;
	if (size <= 2 * 1024) return 11;
	if (size <= 4 * 1024) return 12;
	if (size <= 8 * 1024) return 13;
	if (size <= 16 * 1024) return 14;
	if (size <= 32 * 1024) return 15;
	if (size <= 64 * 1024) return 16;
	if (size <= 128 * 1024) return 17;
	if (size <= 256 * 1024) return 18;
	if (size <= 512 * 1024) return 19;
	if (size <= 1024 * 1024) return 20;
	if (size <= 2 * 1024 * 1024) return 21;
	return -1;
}
#ifdef CONFIG_ZONE_DMA
#define SLQB_DMA __GFP_DMA
#else
/* Disable "DMA slabs" */
#define SLQB_DMA (__force gfp_t)0
#endif
/*
 * Find the kmalloc slab cache for a given combination of allocation flags and
 * size. Should really only be used for constant 'size' arguments, due to
 * bloat.
 */
static __always_inline struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
{
	int index;

	BUILD_BUG_ON(!__builtin_constant_p(size));

	index = kmalloc_index(size);
	if (unlikely(index == 0))
		return NULL;

	if (likely(!(flags & SLQB_DMA)))
		return &kmalloc_caches[index];
	else
		return &kmalloc_caches_dma[index];
}
void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
void *__kmalloc(size_t size, gfp_t flags);
#ifndef ARCH_KMALLOC_MINALIGN
#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
#endif
#ifndef ARCH_SLAB_MINALIGN
#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
#endif
#define KMALLOC_HEADER (ARCH_KMALLOC_MINALIGN < sizeof(void *) ? \
			sizeof(void *) : ARCH_KMALLOC_MINALIGN)
static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
	if (__builtin_constant_p(size)) {
		struct kmem_cache *s;

		s = kmalloc_slab(size, flags);
		if (unlikely(ZERO_OR_NULL_PTR(s)))
			return s;

		return kmem_cache_alloc(s, flags);
	}
	return __kmalloc(size, flags);
}
#ifdef CONFIG_NUMA
void *__kmalloc_node(size_t size, gfp_t flags, int node);
void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);

static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
{
	if (__builtin_constant_p(size)) {
		struct kmem_cache *s;

		s = kmalloc_slab(size, flags);
		if (unlikely(ZERO_OR_NULL_PTR(s)))
			return s;

		return kmem_cache_alloc_node(s, flags, node);
	}
	return __kmalloc_node(size, flags, node);
}
#endif
#endif /* _LINUX_SLQB_DEF_H */