/*
kmregion version 3.5, a region-based memory allocator
Copyright (C) 02021 Kragen Javier Sitaker
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program.  If not, see <https://www.gnu.org/licenses/>.
*/
/* A simple region allocator providing pointer-bumping allocation for
* C programs, similar to what modern languages use. This can be
* dramatically faster than malloc/free, typically a factor of 10.
*
*
* Example usage:
*
* #include <stdalign.h>
* #include <stddef.h>
* #include <setjmp.h>
* #include <stdlib.h>
* #include "kmregion.h"
*
* ...
*
* kmregion *p = km_start(km_libc_disc, NULL);
* if (!p) abort();
* struct n { int i; struct n *next; } *list = NULL;
* for (size_t i = 0; i < 5000; i++) {
* struct n *q = km_new(p, sizeof(*q));
* if (!q) abort();
* q->i = i;
* q->next = list;
* list = q;
* }
* km_end(p);
*
*
* On my laptop, *when it’s applicable*, in a simple test like the one
* above it's about an order of magnitude faster than glibc 2.13-38+deb7u12
* malloc, and it’s also simpler to use and free of fragmentation.
* But it's only applicable when you can deallocate a whole region at
* a time, rather than individual objects. (glibc’s obstack API is
* similar but more flexible.) Sometimes such an allocator is called
* an “arena allocator”, though that term is also used with other
* meanings. Many modern language implementations do all their small
* allocations this way by using a copying garbage collector.
*
* To use kmregion, you create a new region with km_start, passing it
* a “memory allocation discipline” to use for its allocation.
* Typically this is km_libc_disc. If km_start fails, it will return
* NULL. Otherwise, you can allocate any number of times in that
* region with km_new, and then efficiently deallocate all of those
* allocations with km_end.
*
* By default, km_new will handle allocation errors by returning NULL,
* but if you pass a pointer to a jmp_buf as km_start’s second
* parameter, it will instead longjmp to that jmp_buf when allocations
* in km_new fail. At this point you should probably call km_end if
* you intend to continue execution. km_start always reports its own
* allocation errors by returning NULL, because if it were to use
* longjmp rather than returning a pointer, you would be in the
* position of trying to handle the error by passing an uninitialized
* pointer to km_end. Don’t cross the streams.
*
* The above performance results were obtained as follows on an
* “Intel(R) Core(TM) i7-3840QM CPU @ 2.80GHz” CPU:
*
* $ gcc --version
* gcc (Debian 4.7.2-5) 4.7.2 ...
* $ gcc -O3 -Wall -std=gnu99 malloc_test.c kmregion.c -o malloc_test
* $ gcc -O3 -Wall -std=gnu99 kmregion_example.c kmregion.c -o kmregion_example
* $ \time ./kmregion_example 100000
* Total: 12497500
* 0.96user 0.00system 0:00.96elapsed 99%CPU (0avgtext+0avgdata 596maxresident)k
* 0inputs+0outputs (0major+188minor)pagefaults 0swaps
* $ \time ./malloc_test 100000
* Total: 12497500
* 9.72user 0.00system 0:09.74elapsed 99%CPU (0avgtext+0avgdata 668maxresident)k
* 0inputs+0outputs (0major+206minor)pagefaults 0swaps
*
* In both cases, that’s 100'000 iterations of allocating and freeing
* a 5000-node linked list, plus totaling up the numbers on it once,
* so kmregion_example is allocating and initializing 520
* million list nodes per wallclock second; malloc only manages 52
* million.
*
* On a different machine with a “Intel(R) Xeon(R) Gold 5218 CPU @
* 2.30GHz” processor, GCC 9.3.0-17ubuntu1~20.04, and glibc
* 2.31-0ubuntu9.1, malloc is considerably faster (70 million
* allocations per second) and kmregion about the same speed (515
* million allocations per second), closing the gap so kmregion is
* only 7× faster:
*
* $ \time ./malloc_test 100000
* Total: 12497500
* 7.06user 0.00system 0:07.06elapsed 100%CPU (0avgtext+0avgdata 1672maxresident)k
* 0inputs+0outputs (0major+102minor)pagefaults 0swaps
* $ \time ./kmregion_example 100000
* Total: 12497500
* 0.97user 0.00system 0:00.97elapsed 100%CPU (0avgtext+0avgdata 1644maxresident)k
* 0inputs+0outputs (0major+88minor)pagefaults 0swaps
*
* Performance results on that Xeon machine were not very consistent,
* perhaps because it was heavily loaded. On the Core i7,
* kmregion was getting 5.4 clock cycles per allocation, and on the
* Xeon, 4.5 clock cycles per allocation.
*/
/* Include guard.  (Renamed from __KMREGION_H: identifiers beginning
 * with two underscores are reserved for the implementation, so a
 * user-level header must not define them — C11 7.1.3.) */
#ifndef KMREGION_H
#define KMREGION_H
/* km_disc: A memory allocation discipline. Describes the interface
* to some underlying memory allocation system. Usually you use
* km_libc_disc. */
typedef struct {
/* Additional context pointer for malloc and free, as in Tcl.  It is
 * passed unexamined as the second argument to both function pointers
 * below.  Some allocators don’t need this. */
void *userdata;
/* Allocation function for getting blocks.  Returns NULL on failure.
 * Note that the argument order differs from libc malloc: the size
 * comes first, then the userdata pointer from this struct. */
void *(*malloc)(size_t nbytes, void *userdata);
/* Deallocation function for freeing them.  Cannot fail.  Must accept
 * any non-NULL pointer previously returned by the malloc member. */
void (*free)(void *block, void *userdata);
} km_disc;
/* The usual discipline, backed by the C library’s allocator; declared
 * here, defined by the library implementation (see the usage notes
 * above: “Typically this is km_libc_disc”). */
extern km_disc km_libc_disc;
/* This structure’s layout is, in theory, private to the kmregion
* library, but it’s exposed here to permit the C compiler to inline
* km_new. */
typedef struct {
/* Allocation pointer into current block, kept as a byte offset from
 * bp; km_new allocates from the top down, so this counts down toward
 * zero as space is carved off. */
size_t n;
char *bp; /* Start of current block; base pointer. */
/* Members below this line are not used in inline functions, so they
 * can change without breaking ABI compatibility.  (Members above it
 * are read and written directly by the inline km_new below, so their
 * layout is frozen.) */
km_disc disc; /* Discipline for allocating blocks. */
jmp_buf *on_error; /* If not NULL, longjmp here on error. */
void **blocks; /* Array of pointers to allocated blocks, so km_end can free them all. */
size_t block_capacity; /* Size of pointer array. */
size_t n_blocks; /* Number of allocated blocks within it. */
} kmregion;
/* Create a new, empty region whose memory comes from the given
 * discipline.  Returns NULL on allocation failure (always — even when
 * on_error is supplied; see the rationale in the header comment).  If
 * on_error is not NULL, later failures inside km_new longjmp to
 * *on_error instead of returning NULL. */
kmregion *km_start(km_disc, jmp_buf *on_error);
/* Free every allocation ever made in the region, and the region
 * itself.  The region pointer is invalid afterwards. */
void km_end(kmregion *);
/* Private to the library: invoked by km_new when the current block
 * lacks room for the request.  Not intended for direct use. */
void *km_slow_path_allocate(kmregion *, size_t);
static inline void *
km_new(kmregion *r, size_t n)
{
  /* Allocate n bytes in region r.  Returns a pointer aligned for any
   * built-in type, or NULL on failure (or, if the region was started
   * with a non-NULL on_error, longjmps there instead — see km_start).
   *
   * Round up to 8 bytes (or 4 on i386) so all allocations are
   * aligned.  For constant sizes, the compiler normally
   * constant-folds this away.  This results in poor density for
   * allocations under 8 bytes, but those are rare enough on modern
   * platforms that this shouldn’t matter.
   */
  size_t rounded = (n + alignof(void*) - 1) & ~(alignof(void*) - 1);
  if (rounded < n) {
    /* The rounding wrapped around because n was within a few bytes of
     * SIZE_MAX.  Previously this rounded such a request to 0 bytes and
     * “succeeded”; instead, hand the absurd request to the slow path,
     * whose underlying allocation will fail and report the error. */
    return km_slow_path_allocate(r, n);
  }
  n = rounded;
  /* I used to use uintptr_t here, but I’m not sure that was actually
     standards-compliant C (it’s apparently debated), and I was also
     having a hard time with underflow errors; keeping the arithmetic
     on the size_t offset r->n avoids both problems.
  */
  size_t p = r->n - n;
  /* If the subtraction underflowed (n > r->n), p wrapped around to a
   * value larger than r->n, so this single comparison also detects
   * “current block too small”. */
  if (p <= r->n) {
    r->n = p;
    return r->bp + p;
  }
  return km_slow_path_allocate(r, n);
}
#endif