/* kmregion version 3.5, a region-based memory allocator
 * Copyright (C) 02021 Kragen Javier Sitaker
 *
 * This program is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see
 * <https://www.gnu.org/licenses/>.
 */

/* A simple region allocator providing pointer-bumping allocation for
 * C programs, similar to what modern languages use.  This can be
 * dramatically faster than malloc/free, typically a factor of 10.
 *
 *
 * Example usage:
 *
 * #include <stdalign.h>
 * #include <stddef.h>
 * #include <setjmp.h>
 * #include <stdlib.h>
 * #include "kmregion.h"
 *
 * ...
 *
 * kmregion *p = km_start(km_libc_disc, NULL);
 * if (!p) abort();
 * struct n { int i; struct n *next; } *list = NULL;
 * for (size_t i = 0; i < 5000; i++) {
 *   struct n *q = km_new(p, sizeof(*q));
 *   if (!q) abort();
 *   q->i = i;
 *   q->next = list;
 *   list = q;
 * }
 * km_end(p);
 *
 *
 * On my laptop, *when it’s applicable*, in a simple test like the one
 * above it's about an order of magnitude faster than glibc 2.13-38+deb7u12
 * malloc, and it’s also simpler to use and free of fragmentation.
 * But it's only applicable when you can deallocate a whole region at
 * a time, rather than individual objects.  (glibc’s obstack API is
 * similar but more flexible.)  Sometimes such an allocator is called
 * an “arena allocator”, though that term is also used with other
 * meanings.  Many modern language implementations do all their small
 * allocations this way by using a copying garbage collector.
 *
 * To use kmregion, you create a new region with km_start, passing it
 * a “memory allocation discipline” to use for its allocation.
* Typically this is km_libc_disc. If km_start fails, it will return * NULL. Otherwise, you can allocate any number of times in that * region with km_new, and then efficiently deallocate all of those * allocations with km_end. * * By default, km_new will handle allocation errors by returning NULL, * but if you pass a pointer to a jmp_buf as km_start’s second * parameter, it will instead longjmp to that jmp_buf when allocations * in km_new fail. At this point you should probably call km_end if * you intend to continue execution. km_start always reports its own * allocation errors by returning NULL, because if it were to use * longjmp rather than returning a pointer, you would be in the * position of trying to handle the error by passing an uninitialized * pointer to km_end. Don’t cross the streams. * * The above performance results were obtained as follows on an * “Intel(R) Core(TM) i7-3840QM CPU @ 2.80GHz” CPU: * * $ gcc --version * gcc (Debian 4.7.2-5) 4.7.2 ... * $ gcc -O3 -Wall -std=gnu99 malloc_test.c kmregion.c -o malloc_test * $ gcc -O3 -Wall -std=gnu99 kmregion_example.c kmregion.c -o kmregion_example * $ \time ./kmregion_example 100000 * Total: 12497500 * 0.96user 0.00system 0:00.96elapsed 99%CPU (0avgtext+0avgdata 596maxresident)k * 0inputs+0outputs (0major+188minor)pagefaults 0swaps * $ \time ./malloc_test 100000 * Total: 12497500 * 9.72user 0.00system 0:09.74elapsed 99%CPU (0avgtext+0avgdata 668maxresident)k * 0inputs+0outputs (0major+206minor)pagefaults 0swaps * * In both cases, that’s 100'000 iterations of allocating and freeing * a 5000-node linked list, plus totaling up the numbers on it once, * so kmregion_example is allocating and initializing 520 * million list nodes per wallclock second; malloc only manages 52 * million. 
* * On a different machine with a “Intel(R) Xeon(R) Gold 5218 CPU @ * 2.30GHz” processor, GCC 9.3.0-17ubuntu1~20.04, and glibc * 2.31-0ubuntu9.1, malloc is considerably faster (70 million * allocations per second) and kmregion about the same speed (515 * million allocations per second), closing the gap so kmregion is * only 7× faster: * * $ \time ./malloc_test 100000 * Total: 12497500 * 7.06user 0.00system 0:07.06elapsed 100%CPU (0avgtext+0avgdata 1672maxresident)k * 0inputs+0outputs (0major+102minor)pagefaults 0swaps * $ \time ./kmregion_example 100000 * Total: 12497500 * 0.97user 0.00system 0:00.97elapsed 100%CPU (0avgtext+0avgdata 1644maxresident)k * 0inputs+0outputs (0major+88minor)pagefaults 0swaps * * Performance results on that Xeon machine were not very consistent, * perhaps because it was heavily loaded. On the Core i7, * kmregion was getting 5.4 clock cycles per allocation, and on the * Xeon, 4.5 clock cycles per allocation. */ #ifndef __KMREGION_H #define __KMREGION_H /* km_disc: A memory allocation discipline. Describes the interface * to some underlying memory allocation system. Usually you use * km_libc_disc. */ typedef struct { /* Additional context pointer for malloc and free, as in Tcl. Some * allocators don’t need this. */ void *userdata; /* Allocation function for getting blocks. Returns NULL on failure. */ void *(*malloc)(size_t nbytes, void *userdata); /* Deallocation function for freeing them. Cannot fail. */ void (*free)(void *block, void *userdata); } km_disc; extern km_disc km_libc_disc; /* This structure’s layout is, in theory, private to the kmregion * library, but it’s exposed here to permit the C compiler to inline * km_new. */ typedef struct { size_t n; /* Allocation pointer into current block. */ char *bp; /* Start of current block; base pointer. */ /* Members below this line are not used in inline functions, so they * can change without breaking ABI compatibility. */ km_disc disc; /* Discipline for allocating blocks. 
*/ jmp_buf *on_error; /* If not NULL, longjmp here on error. */ void **blocks; /* Array of pointers to allocated blocks. */ size_t block_capacity; /* Size of pointer array. */ size_t n_blocks; /* Number of allocated blocks within it. */ } kmregion; kmregion *km_start(km_disc, jmp_buf *on_error); void km_end(kmregion *); void *km_slow_path_allocate(kmregion *, size_t); static inline void * km_new(kmregion *r, size_t n) { /* Round up to 8 bytes (or 4 on i386) so all allocations are * aligned. For constant sizes, the compiler normally * constant-folds this away. This results in poor density for * allocations under 8 bytes, but those are rare enough on modern * platforms that this shouldn’t matter. */ n = (n + alignof(void*) - 1) & ~(alignof(void*) - 1); /* I used to use uintptr_t here, but I’m not sure that was actually standards-compliant C. It’s apparently being debated: And I was also having a hard time with underflow errors. */ size_t p = r->n - n; if (p <= r->n) { r->n = p; return r->bp + p; } return km_slow_path_allocate(r, n); } #endif