/*
kmregion version 3.5, a region-based memory allocator
Copyright (C) 02021 Kragen Javier Sitaker
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program.  If not, see <https://www.gnu.org/licenses/>.
*/
/* A simple region allocator providing pointer-bumping allocation for
* C programs, similar to what modern languages use. This can be
* dramatically faster than malloc/free, typically a factor of 10.
*
*
* Example usage:
*
* #include <stdalign.h>
* #include <stddef.h>
* #include <setjmp.h>
* #include <stdlib.h>
* #include "kmregion.h"
*
* ...
*
* kmregion *p = km_start(km_libc_disc, NULL);
* if (!p) abort();
* struct n { int i; struct n *next; } *list = NULL;
* for (size_t i = 0; i < 5000; i++) {
* struct n *q = km_new(p, sizeof(*q));
* if (!q) abort();
* q->i = i;
* q->next = list;
* list = q;
* }
* km_end(p);
*
*
* On my laptop, *when it’s applicable*, in a simple test like the one
* above it's about an order of magnitude faster than glibc 2.13-38+deb7u12
* malloc, and it’s also simpler to use and free of fragmentation.
* But it's only applicable when you can deallocate a whole region at
* a time, rather than individual objects. (glibc’s obstack API is
* similar but more flexible.) Sometimes such an allocator is called
* an “arena allocator”, though that term is also used with other
* meanings. Many modern language implementations do all their small
* allocations this way by using a copying garbage collector.
*
* To use kmregion, you create a new region with km_start, passing it
* a “memory allocation discipline” to use for its allocation.
* Typically this is km_libc_disc. If km_start fails, it will return
* NULL. Otherwise, you can allocate any number of times in that
* region with km_new, and then efficiently deallocate all of those
* allocations with km_end.
*
* By default, km_new will handle allocation errors by returning NULL,
* but if you pass a pointer to a jmp_buf as km_start’s second
* parameter, it will instead longjmp to that jmp_buf when allocations
* in km_new fail. At this point you should probably call km_end if
* you intend to continue execution. km_start always reports its own
* allocation errors by returning NULL, because if it were to use
* longjmp rather than returning a pointer, you would be in the
* position of trying to handle the error by passing an uninitialized
* pointer to km_end. Don’t cross the streams.
*
* The above performance results were obtained as follows on an
* “Intel(R) Core(TM) i7-3840QM CPU @ 2.80GHz” CPU:
*
* $ gcc --version
* gcc (Debian 4.7.2-5) 4.7.2 ...
* $ gcc -O3 -Wall -std=gnu99 malloc_test.c kmregion.c -o malloc_test
* $ gcc -O3 -Wall -std=gnu99 kmregion_example.c kmregion.c -o kmregion_example
* $ \time ./kmregion_example 100000
* Total: 12497500
* 0.96user 0.00system 0:00.96elapsed 99%CPU (0avgtext+0avgdata 596maxresident)k
* 0inputs+0outputs (0major+188minor)pagefaults 0swaps
* $ \time ./malloc_test 100000
* Total: 12497500
* 9.72user 0.00system 0:09.74elapsed 99%CPU (0avgtext+0avgdata 668maxresident)k
* 0inputs+0outputs (0major+206minor)pagefaults 0swaps
*
* In both cases, that’s 100'000 iterations of allocating and freeing
* a 5000-node linked list, plus totaling up the numbers on it once,
* so kmregion_example is allocating and initializing 520
* million list nodes per wallclock second; malloc only manages 52
* million.
*
* On a different machine with a “Intel(R) Xeon(R) Gold 5218 CPU @
* 2.30GHz” processor, GCC 9.3.0-17ubuntu1~20.04, and glibc
* 2.31-0ubuntu9.1, malloc is considerably faster (70 million
* allocations per second) and kmregion about the same speed (515
* million allocations per second), closing the gap so kmregion is
* only 7× faster:
*
* $ \time ./malloc_test 100000
* Total: 12497500
* 7.06user 0.00system 0:07.06elapsed 100%CPU (0avgtext+0avgdata 1672maxresident)k
* 0inputs+0outputs (0major+102minor)pagefaults 0swaps
* $ \time ./kmregion_example 100000
* Total: 12497500
* 0.97user 0.00system 0:00.97elapsed 100%CPU (0avgtext+0avgdata 1644maxresident)k
* 0inputs+0outputs (0major+88minor)pagefaults 0swaps
*
* Performance results on that Xeon machine were not very consistent,
* perhaps because it was heavily loaded. On the Core i7,
* kmregion was getting 5.4 clock cycles per allocation, and on the
* Xeon, 4.5 clock cycles per allocation.
*/
/* Include guard.  (Renamed from __KMREGION_H: identifiers beginning
 * with two underscores are reserved for the implementation, so a
 * user-level header must not define them — C11 7.1.3.) */
#ifndef KMREGION_H
#define KMREGION_H
/* km_disc: A memory allocation discipline. Describes the interface
* to some underlying memory allocation system. Usually you use
* km_libc_disc. */
typedef struct {
/* Additional context pointer for malloc and free, as in Tcl.  It is
 * passed unexamined as the second argument to both function pointers
 * below.  Some allocators don’t need this. */
void *userdata;
/* Allocation function for getting blocks.  Returns NULL on failure.
 * Note that the argument order differs from libc malloc: the size
 * comes first, then the userdata pointer from this struct. */
void *(*malloc)(size_t nbytes, void *userdata);
/* Deallocation function for freeing them.  Cannot fail.  Must accept
 * any non-NULL pointer previously returned by the malloc member. */
void (*free)(void *block, void *userdata);
} km_disc;
/* The usual discipline, backed by the C library’s allocator; declared
 * here, defined by the library implementation (see the usage notes
 * above: “Typically this is km_libc_disc”). */
extern km_disc km_libc_disc;
/* This structure’s layout is, in theory, private to the kmregion
* library, but it’s exposed here to permit the C compiler to inline
* km_new. */
typedef struct {
/* Allocation pointer into current block, kept as a byte offset from
 * bp; km_new allocates from the top down, so this counts down toward
 * zero as space is carved off. */
size_t n;
char *bp; /* Start of current block; base pointer. */
/* Members below this line are not used in inline functions, so they
 * can change without breaking ABI compatibility.  (Members above it
 * are read and written directly by the inline km_new below, so their
 * layout is frozen.) */
km_disc disc; /* Discipline for allocating blocks. */
jmp_buf *on_error; /* If not NULL, longjmp here on error. */
void **blocks; /* Array of pointers to allocated blocks, so km_end can free them all. */
size_t block_capacity; /* Size of pointer array. */
size_t n_blocks; /* Number of allocated blocks within it. */
} kmregion;
/* Create a new, empty region whose memory comes from the given
 * discipline.  Returns NULL on allocation failure (always — even when
 * on_error is supplied; see the rationale in the header comment).  If
 * on_error is not NULL, later failures inside km_new longjmp to
 * *on_error instead of returning NULL. */
kmregion *km_start(km_disc, jmp_buf *on_error);
/* Free every allocation ever made in the region, and the region
 * itself.  The region pointer is invalid afterwards. */
void km_end(kmregion *);
/* Private to the library: invoked by km_new when the current block
 * lacks room for the request.  Not intended for direct use. */
void *km_slow_path_allocate(kmregion *, size_t);
static inline void *
km_new(kmregion *r, size_t n)
{
  /* Allocate n bytes in region r.  Returns a pointer aligned for any
   * built-in type, or NULL on failure (or, if the region was started
   * with a non-NULL on_error, longjmps there instead — see km_start).
   *
   * Round up to 8 bytes (or 4 on i386) so all allocations are
   * aligned.  For constant sizes, the compiler normally
   * constant-folds this away.  This results in poor density for
   * allocations under 8 bytes, but those are rare enough on modern
   * platforms that this shouldn’t matter.
   */
  size_t rounded = (n + alignof(void*) - 1) & ~(alignof(void*) - 1);
  if (rounded < n) {
    /* The rounding wrapped around because n was within a few bytes of
     * SIZE_MAX.  Previously this rounded such a request to 0 bytes and
     * “succeeded”; instead, hand the absurd request to the slow path,
     * whose underlying allocation will fail and report the error. */
    return km_slow_path_allocate(r, n);
  }
  n = rounded;
  /* I used to use uintptr_t here, but I’m not sure that was actually
     standards-compliant C (it’s apparently debated), and I was also
     having a hard time with underflow errors; keeping the arithmetic
     on the size_t offset r->n avoids both problems.
  */
  size_t p = r->n - n;
  /* If the subtraction underflowed (n > r->n), p wrapped around to a
   * value larger than r->n, so this single comparison also detects
   * “current block too small”. */
  if (p <= r->n) {
    r->n = p;
    return r->bp + p;
  }
  return km_slow_path_allocate(r, n);
}
#endif