From 0005ff62f470dec86c1f5726b4b1f5246adc019c Mon Sep 17 00:00:00 2001 From: struct Date: Sun, 15 Mar 2026 14:46:28 -0400 Subject: [PATCH 1/2] performance and bug fixes --- Makefile | 17 +++++++---- PERFORMANCE.md | 52 ++++++++++++++++++++-------------- README.md | 5 +++- include/iso_alloc_ds.h | 13 +++++---- include/iso_alloc_internal.h | 8 ++++++ src/iso_alloc.c | 55 +++++++++++++++++++++--------------- src/iso_alloc_sanity.c | 29 +++++++++++++++++++ tests/bzero_sanity.c | 7 ++--- 8 files changed, 127 insertions(+), 59 deletions(-) diff --git a/Makefile b/Makefile index 6041ed4..d635553 100644 --- a/Makefile +++ b/Makefile @@ -229,9 +229,14 @@ PROTECT_FREE_BIG_ZONES = -DPROTECT_FREE_BIG_ZONES=0 ## incurs a small performance cost MASK_PTRS = -DMASK_PTRS=1 -## IsoAlloc uses ARM64 Neon instructions where possible. You can -## explicitly disable that here +## IsoAlloc uses ARM64 Neon instructions where possible. Automatically +## enabled on ARM/AArch64 hosts, disabled everywhere else. +ARCH := $(shell uname -m) +ifneq ($(filter aarch64 arm%,$(ARCH)),) +DONT_USE_NEON = -DDONT_USE_NEON=0 +else DONT_USE_NEON = -DDONT_USE_NEON=1 +endif ## We start with the standard C++ specifics but giving ## the liberty to choose the gnu++* variants and/or @@ -350,9 +355,9 @@ library: clean ## ABORT_ON_UNOWNED_PTR=0 silently drops pointers not owned by isoalloc ## (e.g. those allocated by libc before the isoalloc constructor fires) ## instead of aborting. All other flags are identical to 'library'. 
-library_perf: ABORT_ON_UNOWNED_PTR = -DABORT_ON_UNOWNED_PTR=0 -library_perf: clean - @echo "make library_perf" +library_less_strict: ABORT_ON_UNOWNED_PTR = -DABORT_ON_UNOWNED_PTR=0 +library_less_strict: clean + @echo "make library_less_strict" $(CC) $(CFLAGS) $(LIBRARY) $(OPTIMIZE) $(OS_FLAGS) $(C_SRCS) -o $(BUILD_DIR)/$(LIBNAME) $(STRIP) @@ -456,7 +461,7 @@ libc_sanity_tests: clean library_debug_unit_tests $(CC) $(CFLAGS) $(EXE_CFLAGS) $(DEBUG_LOG_FLAGS) $(GDB_FLAGS) $(OS_FLAGS) tests/memcpy_sanity.c $(ISO_ALLOC_PRINTF_SRC) -o $(BUILD_DIR)/memcpy_sanity $(LDFLAGS) $(CC) $(CFLAGS) $(EXE_CFLAGS) $(DEBUG_LOG_FLAGS) $(GDB_FLAGS) $(OS_FLAGS) tests/memmove_sanity.c $(ISO_ALLOC_PRINTF_SRC) -o $(BUILD_DIR)/memmove_sanity $(LDFLAGS) $(CC) $(CFLAGS) $(EXE_CFLAGS) $(DEBUG_LOG_FLAGS) $(GDB_FLAGS) $(OS_FLAGS) tests/bzero_sanity.c $(ISO_ALLOC_PRINTF_SRC) -o $(BUILD_DIR)/bzero_sanity $(LDFLAGS) - build/memset_sanity ; build/memcpy_sanity; build/memmove_sanity; build/bzero_sanity ; + LD_LIBRARY_PATH=build/ build/memset_sanity ; LD_LIBRARY_PATH=build/ build/memcpy_sanity; LD_LIBRARY_PATH=build/ build/memmove_sanity; LD_LIBRARY_PATH=build/ build/bzero_sanity fuzz_test: clean library_debug_unit_tests @echo "make fuzz_test" diff --git a/PERFORMANCE.md b/PERFORMANCE.md index d3f18f6..f6a8626 100644 --- a/PERFORMANCE.md +++ b/PERFORMANCE.md @@ -34,6 +34,16 @@ If you know your program will not require multi-threaded access to IsoAlloc you `DISABLE_CANARY` can be set to 1 to disable the creation and verification of canary chunks. This removes a useful security feature but will significantly improve performance and RSS. +`MASK_PTRS` is enabled by default and causes the `user_pages_start` and `bitmap_start` pointers stored in every zone's metadata to be XOR'd with a per-zone random secret between alloc and free operations. This protects against attackers who can read or corrupt zone metadata. Each alloc and free pays a small cost for these XOR operations. 
Setting `MASK_PTRS=0` removes this overhead at the cost of this security property. + +`CANARY_COUNT_DIV` in `conf.h` controls what fraction of chunks in a zone are reserved as canaries. It is used as a right-shift on the total chunk count: `chunk_count >> CANARY_COUNT_DIV`. The default value of 7 reserves less than 1% of chunks. Increasing this value reduces canary density and frees more chunks for user allocations; decreasing it increases security coverage at the cost of usable memory. + +`ZONE_ALLOC_RETIRE` in `conf.h` controls how frequently zones are retired and replaced. A zone is retired once it has completed `ZONE_ALLOC_RETIRE * max_chunk_count_for_zone` total alloc/free cycles. Lowering this value causes zones to be replaced more often, reducing the window for use-after-free exploitation but increasing the frequency of zone creation. `BIG_ZONE_ALLOC_RETIRE` is the equivalent for big zones. + +`SMALL_MEM_STARTUP` reduces the number and size of default zones created at startup. This decreases initial RSS at the cost of more frequent zone creation for programs with diverse allocation sizes. + +`STRONG_SIZE_ISOLATION` enforces stricter isolation by size class. When enabled, chunk sizes are rounded up to a smaller set of buckets which increases isolation between differently-sized allocations. This may increase per-allocation waste but reduces cross-size heap exploitation primitives. + By default IsoAlloc will attempt to use Huge Pages (for both Linux and Mac OS) for any allocations that are a multiple of 2 mb in size. This is the default huge page size on most systems but it might not be on yours. 
On Linux you can check the value for your system by running the following command: ``` @@ -145,33 +155,33 @@ The following benchmarks were collected from [mimalloc-bench](https://github.com ``` #------------------------------------------------------------------ # test alloc time rss user sys page-faults page-reclaims -cfrac je 02.99 4912 2.99 0.00 0 454 -cfrac mi 03.01 2484 3.00 0.00 0 346 -cfrac iso 05.84 26616 5.75 0.09 0 6502 +cfrac je 03.07 4560 3.06 0.00 0 455 +cfrac mi 02.92 2676 2.92 0.00 0 348 +cfrac iso 05.16 30764 5.08 0.08 0 7497 -espresso je 02.52 4872 2.50 0.01 0 538 -espresso mi 02.46 3060 2.45 0.01 0 3637 -espresso iso 03.65 69876 3.56 0.09 0 21695 +espresso je 02.49 5032 2.48 0.00 0 550 +espresso mi 02.47 3004 2.45 0.01 0 3636 +espresso iso 03.25 69124 3.16 0.09 0 30105 -barnes je 01.62 60268 1.59 0.02 0 16687 -barnes mi 01.71 57672 1.68 0.02 0 16550 -barnes iso 01.66 74628 1.62 0.03 0 20851 +barnes je 01.71 59916 1.68 0.02 0 16684 +barnes mi 01.64 57864 1.61 0.02 0 16550 +barnes iso 01.65 74968 1.61 0.03 0 20851 -gs je 00.16 37592 0.15 0.01 0 5808 -gs mi 00.16 32588 0.13 0.02 0 5109 -gs iso 00.23 71152 0.16 0.07 0 19698 +gs je 00.15 37756 0.13 0.01 0 5812 +gs mi 00.15 33668 0.14 0.01 0 5110 +gs iso 00.23 67960 0.16 0.06 0 18846 -larsonN je 1.171 266596 98.81 0.92 0 409842 -larsonN mi 1.016 299768 99.38 0.44 0 83755 -larsonN iso 918.582 126528 99.64 0.37 0 31368 +larsonN je 1.153 269184 98.81 1.00 0 419378 +larsonN mi 1.037 301044 99.34 0.41 0 83267 +larsonN iso 1304.061 121072 6.10 70.16 0 30031 -rocksdb je 02.48 162424 2.05 0.63 0 38384 -rocksdb mi 02.48 159812 2.04 0.66 0 37464 -rocksdb iso 02.74 197220 2.49 0.55 0 46815 +rocksdb je 02.49 162976 2.09 0.60 0 38215 +rocksdb mi 02.22 160392 1.86 0.54 0 37563 +rocksdb iso 02.87 197548 2.58 0.59 0 46899 -redis je 3.180 9496 0.14 0.02 0 1538 -redis mi 3.080 7088 0.12 0.03 0 1256 -redis iso 6.880 52816 0.31 0.05 0 16317 +redis je 3.319 9484 0.14 0.02 0 1540 +redis mi 2.840 7124 0.12 0.02 0 1254 +redis 
iso 7.340 49712 0.34 0.04 0 14959 ``` IsoAlloc isn't quite ready for performance sensitive server workloads. However it's more than fast enough for client side mobile/desktop applications with risky C/C++ attack surfaces. These environments have threat models similar to what IsoAlloc was designed for. diff --git a/README.md b/README.md index 9c79896..f60d73d 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ When enabled, the `CPU_PIN` feature will restrict allocations from a given zone * A chunk can be permanently free'd with a call to `iso_free_permanently`. * If `SANITIZE_CHUNKS` is set all user chunks are cleared when passed to `iso_free` with the constant `0xde`. * When freeing a chunk the canary in adjacent chunks above/below are verified. -* Some important zone metadata pointers are masked in-between `iso_alloc` and `iso_free` operations. +* When `MASK_PTRS` is enabled (default) the `user_pages_start` and `bitmap_start` pointers stored in zone metadata are XOR'd with a per-zone random secret between alloc and free operations, making them useless to an attacker who reads or corrupts zone metadata. * Passing a pointer to `iso_free` that was not allocated with `iso_alloc` will abort. * Pointers passed to `iso_free` must be 8 byte aligned, and a multiple of the zone chunk size. * The free bit slot cache provides a chunk quarantine or delayed free mechanism. @@ -76,6 +76,7 @@ When enabled, the `CPU_PIN` feature will restrict allocations from a given zone * Randomized hints are passed to `mmap` to ensure contiguous page ranges are not allocated. * When `ABORT_ON_NULL` is enabled IsoAlloc will abort instead of returning `NULL`. * By default `NO_ZERO_ALLOCATIONS` will return a pointer to a page marked `PROT_NONE` for all `0` sized allocations. +* When `ABORT_ON_UNOWNED_PTR` is enabled (default) IsoAlloc will abort whenever it is passed a pointer it does not own. * When `ABORT_NO_ENTROPY` is enabled IsoAlloc will abort when it can't gather enough entropy. 
* When `RANDOMIZE_FREELIST` is enabled IsoAlloc will randomize the free list upon creation. May have a perf hit. * Zones are retired and replaced after they've allocated and freed a specific number of chunks. This is calculated as `ZONE_ALLOC_RETIRE * max_chunk_count_for_zone`. @@ -94,6 +95,8 @@ The Makefile targets are very simple: `make library` - Builds a release version of the library without C++ support +`make library_less_strict` - Builds a release library with `ABORT_ON_UNOWNED_PTR=0`. Recommended when using IsoAlloc via `LD_PRELOAD`. + `make library_debug` - Builds a debug version of the library `make library_debug_no_output` - Builds a debug version of the library with no logging output diff --git a/include/iso_alloc_ds.h b/include/iso_alloc_ds.h index 8f7fc51..233f6b7 100644 --- a/include/iso_alloc_ds.h +++ b/include/iso_alloc_ds.h @@ -12,7 +12,7 @@ #define SZ_TO_ZONE_LOOKUP_IDX(size) size >> 4 #define CHUNK_TO_ZONE_TABLE_SZ (65535 * sizeof(uint16_t)) -#define ADDR_TO_CHUNK_TABLE(p) (((uintptr_t) p >> 32) & 0xffff) +#define ADDR_TO_CHUNK_TABLE(p) (((uintptr_t) p >> 22) & 0xffff) typedef int64_t bit_slot_t; typedef int64_t bitmap_index_t; @@ -36,7 +36,7 @@ typedef struct { int64_t next_free_bit_slot; /* The last bit slot returned by get_next_free_bit_slot */ uint64_t canary_secret; /* Each zone has its own canary secret */ uint64_t pointer_mask; /* Each zone has its own pointer protection secret */ - bitmap_index_t max_bitmap_idx; /* Max bitmap index for this bitmap */ + uint16_t max_bitmap_idx; /* Max bitmap index for this bitmap */ uint32_t chunk_size; /* Size of chunks managed by this zone */ free_bit_slot_t free_bit_slots_usable; /* The oldest members of the free cache are served first */ free_bit_slot_t free_bit_slots_index; /* Tracks how many entries in the cache are filled */ @@ -50,7 +50,7 @@ typedef struct { uint8_t cpu_core; /* What CPU core this zone is pinned to */ #endif /* Warm/cold fields: accessed less frequently */ - uint32_t bitmap_size; 
/* Size of the bitmap in bytes */ + uint16_t bitmap_size; /* Size of the bitmap in bytes */ uint32_t af_count; /* Increment/Decrement with each alloc/free operation */ uint32_t chunk_count; /* Total number of chunks in this zone */ uint32_t alloc_count; /* Total number of lifetime allocations */ @@ -133,9 +133,12 @@ typedef struct { * it can find the next zone that holds the same size * chunks. The lookup table helps us find the first zone * that holds a specific size in O(1) time */ - zone_lookup_table_t zone_lookup_table[ZONE_LOOKUP_TABLE_SZ]; + /* Array sized to cover indices 0..(SMALL_SIZE_MAX>>4) inclusive, then + * rounded to a multiple of 4 entries so the array occupies a whole + * number of 8-byte words and bitmaps[] remains naturally aligned. */ + zone_lookup_table_t zone_lookup_table[(SMALL_SIZE_MAX >> 4) + 4]; /* For chunk sizes >= 1024 our bitmap size is smaller - * than a page. This optimization preallocates pages to + * than a page. This optimization preallocates pages to * hold multiple bitmaps for these zones */ iso_alloc_bitmap_t bitmaps[sizeof(small_bitmap_sizes) / sizeof(int)]; uint64_t zone_handle_mask; diff --git a/include/iso_alloc_internal.h b/include/iso_alloc_internal.h index c8f87cb..1f9fa1a 100644 --- a/include/iso_alloc_internal.h +++ b/include/iso_alloc_internal.h @@ -300,6 +300,14 @@ extern uint32_t g_page_size_shift; static_assert(SMALLEST_CHUNK_SZ >= 16, "SMALLEST_CHUNK_SZ is too small, must be at least 16"); static_assert(SMALL_SIZE_MAX <= 131072, "SMALL_SIZE_MAX is too big, cannot exceed 131072"); +/* bitmap_size = (ZONE_USER_SIZE / SMALLEST_CHUNK_SZ) * BITS_PER_CHUNK / BITS_PER_BYTE + * max_bitmap_idx = bitmap_size / sizeof(uint64_t) + * Both fields are uint16_t in iso_alloc_zone_t, so verify they fit. 
*/ +static_assert((ZONE_USER_SIZE * BITS_PER_CHUNK / BITS_PER_BYTE / SMALLEST_CHUNK_SZ) <= UINT16_MAX, + "bitmap_size overflows uint16_t: SMALLEST_CHUNK_SZ is too small (must be > 16)"); +static_assert((ZONE_USER_SIZE * BITS_PER_CHUNK / BITS_PER_BYTE / SMALLEST_CHUNK_SZ / sizeof(uint64_t)) <= UINT16_MAX, + "max_bitmap_idx overflows uint16_t: SMALLEST_CHUNK_SZ is too small"); + #if THREAD_SUPPORT #if USE_SPINLOCK extern atomic_flag root_busy_flag; diff --git a/src/iso_alloc.c b/src/iso_alloc.c index aff0d77..c8177d4 100644 --- a/src/iso_alloc.c +++ b/src/iso_alloc.c @@ -1018,9 +1018,12 @@ INTERNAL_HIDDEN INLINE void populate_zone_cache(iso_alloc_zone_t *zone) { tzc[_zone_cache_count].chunk_size = zone->chunk_size; _zone_cache_count++; } else { - _zone_cache_count = 0; + /* Evict oldest entry (index 0) via FIFO: shift all entries down by one */ + memmove(&tzc[0], &tzc[1], (ZONE_CACHE_SZ - 1) * sizeof(_tzc)); + _zone_cache_count = ZONE_CACHE_SZ - 1; tzc[_zone_cache_count].zone = zone; tzc[_zone_cache_count].chunk_size = zone->chunk_size; + _zone_cache_count++; } zone_cache_count = _zone_cache_count; @@ -1073,6 +1076,24 @@ INTERNAL_HIDDEN ASSUME_ALIGNED void *_iso_alloc(iso_alloc_zone_t *zone, size_t s LOG_AND_ABORT("Private zone %d cannot hold chunks of size %d, only %d", zone->index, size, zone->chunk_size); } + /* Pre-lock hot path: scan the thread-local zone cache using only + * thread-local data (chunk_size comparison and pointer read). No + * zone struct fields are dereferenced here. Validation happens + * under the lock via is_zone_usable(). 
*/ + iso_alloc_zone_t *cached_zone = NULL; + + if(LIKELY(zone == NULL && size <= SMALL_SIZE_MAX)) { + size_t _zone_cache_count = zone_cache_count; + _tzc *tzc = zone_cache; + + for(size_t i = _zone_cache_count; i-- > 0;) { + if(tzc[i].chunk_size >= size) { + cached_zone = tzc[i].zone; + break; + } + } + } + LOCK_ROOT(); if(UNLIKELY(_root == NULL)) { @@ -1100,7 +1121,7 @@ INTERNAL_HIDDEN ASSUME_ALIGNED void *_iso_alloc(iso_alloc_zone_t *zone, size_t s #if ALLOC_SANITY /* We don't sample if we are allocating from a private zone */ - if(zone != NULL) { + if(zone != NULL && zone->internal == true) { if(size < g_page_size && _sane_sampled < MAX_SANE_SAMPLES) { /* If we chose to sample this allocation then * _iso_alloc_sample will call UNLOCK_ROOT() */ @@ -1124,23 +1145,12 @@ INTERNAL_HIDDEN ASSUME_ALIGNED void *_iso_alloc(iso_alloc_zone_t *zone, size_t s _verify_all_zones(); #endif if(LIKELY(zone == NULL)) { - /* Hot Path: Check the zone cache for a zone this - * thread recently used for an alloc/free operation. - * It's likely we are allocating a similar size chunk - * and this will speed up that operation. - * Scan newest-to-oldest (LIFO) since the most recently - * used zone is most likely to have free slots available. */ - size_t _zone_cache_count = zone_cache_count; - _tzc *tzc = zone_cache; - - for(size_t i = _zone_cache_count; i-- > 0;) { - if(tzc[i].chunk_size >= size) { - iso_alloc_zone_t *_zone = tzc[i].zone; - if(is_zone_usable(_zone, size) != NULL) { - zone = _zone; - break; - } - } + /* Hot Path: Validate the zone candidate selected pre-lock. + * The size comparison already happened outside the critical + * section; only the shared zone struct access (is_zone_usable) + * needs to be under the lock. 
*/ + if(cached_zone != NULL && is_zone_usable(cached_zone, size) != NULL) { + zone = cached_zone; } } @@ -2150,10 +2160,11 @@ INTERNAL_HIDDEN void _iso_alloc_destroy(void) { #endif for(uint16_t i = 0; i < zones_used; i++) { - iso_alloc_zone_t *zone = &_root->zones[i]; - _verify_zone(zone); +#if DEBUG || FUZZ_MODE + _verify_zone(&_root->zones[i]); +#endif #if ISO_DTOR_CLEANUP - _iso_alloc_destroy_zone_unlocked(zone, false, false); + _iso_alloc_destroy_zone_unlocked(&_root->zones[i], false, false); #endif } diff --git a/src/iso_alloc_sanity.c b/src/iso_alloc_sanity.c index 752d506..abe8e76 100644 --- a/src/iso_alloc_sanity.c +++ b/src/iso_alloc_sanity.c @@ -415,6 +415,15 @@ INTERNAL_HIDDEN INLINE void *__iso_memcpy(void *restrict dest, const void *restr char *p_dest = (char *) dest; char const *p_src = (char const *) src; +#if USE_NEON + while(n >= 16) { + vst1q_u8((uint8_t *) p_dest, vld1q_u8((const uint8_t *) p_src)); + p_dest += 16; + p_src += 16; + n -= 16; + } +#endif + while(n--) { *p_dest++ = *p_src++; } @@ -462,8 +471,19 @@ INTERNAL_HIDDEN INLINE void *__iso_memmove(void *dest, const void *src, size_t n } if(p_src < p_dest) { + /* Overlapping: copy backwards to avoid clobbering src */ p_dest += n; p_src += n; + +#if USE_NEON + while(n >= 16) { + p_dest -= 16; + p_src -= 16; + vst1q_u8((uint8_t *) p_dest, vld1q_u8((const uint8_t *) p_src)); + n -= 16; + } +#endif + while(n--) { *--p_dest = *--p_src; } @@ -508,6 +528,15 @@ INTERNAL_HIDDEN INLINE void *__iso_memset(void *dest, int b, size_t n) { #if MEMSET_SANITY char *p_dest = (char *) dest; +#if USE_NEON + uint8x16_t vec = vdupq_n_u8((uint8_t) b); + while(n >= 16) { + vst1q_u8((uint8_t *) p_dest, vec); + p_dest += 16; + n -= 16; + } +#endif + while(n--) { *p_dest++ = b; } diff --git a/tests/bzero_sanity.c b/tests/bzero_sanity.c index 6b461fc..460f905 100644 --- a/tests/bzero_sanity.c +++ b/tests/bzero_sanity.c @@ -8,10 +8,7 @@ #error "This test intended to be run with -DMEMSET_SANITY=1" #endif -#if 
!(__FreeBSD__ || __NetBSD__ || __OpenBSD__ || __DragonFly__) -#error "This test intended for BSD systems" -#endif - +#if(__FreeBSD__ || __NetBSD__ || __OpenBSD__ || __DragonFly__) int main(int argc, char *argv[]) { uint8_t *p = NULL; @@ -27,3 +24,5 @@ int main(int argc, char *argv[]) { return OK; } +#endif +int main(void) {} From b6276f842124f870398db27a5beeb1aa5b0f6a80 Mon Sep 17 00:00:00 2001 From: struct Date: Sun, 15 Mar 2026 15:56:32 -0400 Subject: [PATCH 2/2] performance and bug fixes --- Makefile | 13 +++++++++++ PERFORMANCE.md | 44 +++++++++++++++++++----------------- include/iso_alloc_internal.h | 6 ++--- src/iso_alloc.c | 33 +++++++++++++-------------- src/iso_alloc_interfaces.c | 6 ++--- src/iso_alloc_util.c | 16 +++++++++++++ 6 files changed, 74 insertions(+), 44 deletions(-) diff --git a/Makefile b/Makefile index d635553..5a474fe 100644 --- a/Makefile +++ b/Makefile @@ -361,6 +361,19 @@ library_less_strict: clean $(CC) $(CFLAGS) $(LIBRARY) $(OPTIMIZE) $(OS_FLAGS) $(C_SRCS) -o $(BUILD_DIR)/$(LIBNAME) $(STRIP) +## Build a performance-optimized library with the most expensive security +## features disabled. Intended for benchmarking and performance measurement. +## All other flags inherit from the top-level defaults. 
+library_benchmark: DISABLE_CANARY = -DDISABLE_CANARY=1 +library_benchmark: PRE_POPULATE_PAGES = -DPRE_POPULATE_PAGES=1 +library_benchmark: RANDOMIZE_FREELIST = -DRANDOMIZE_FREELIST=0 +library_benchmark: MASK_PTRS = -DMASK_PTRS=0 +library_benchmark: ABORT_ON_UNOWNED_PTR = -DABORT_ON_UNOWNED_PTR=0 +library_benchmark: clean + @echo "make library_benchmark" + $(CC) $(CFLAGS) $(LIBRARY) $(OPTIMIZE) $(OS_FLAGS) $(C_SRCS) -o $(BUILD_DIR)/$(LIBNAME) + $(STRIP) + ## Build a debug version of the library library_debug: clean @echo "make library debug" diff --git a/PERFORMANCE.md b/PERFORMANCE.md index f6a8626..0142ac8 100644 --- a/PERFORMANCE.md +++ b/PERFORMANCE.md @@ -153,35 +153,37 @@ iso_realloc/iso_free 1834784 tests completed in 0.901481 seconds The following benchmarks were collected from [mimalloc-bench](https://github.com/daanx/mimalloc-bench) with the default configuration of IsoAlloc. As you can see from the data IsoAlloc is competitive with other allocators for some benchmarks but clearly falls behind on others. For any benchmark that IsoAlloc scores poorly on I was able to tweak its build to improve the CPU time and memory consumption. It's worth noting that IsoAlloc was able to stay competitive even with performing many security checks not present in other allocators. Please note these are 'best case' measurements, not averages. 
``` +make library_benchmark + #------------------------------------------------------------------ # test alloc time rss user sys page-faults page-reclaims -cfrac je 03.07 4560 3.06 0.00 0 455 -cfrac mi 02.92 2676 2.92 0.00 0 348 -cfrac iso 05.16 30764 5.08 0.08 0 7497 +cfrac je 03.07 4552 3.06 0.00 0 454 +cfrac mi 02.97 2484 2.96 0.00 0 347 +cfrac iso 04.78 30612 4.69 0.09 0 7503 -espresso je 02.49 5032 2.48 0.00 0 550 -espresso mi 02.47 3004 2.45 0.01 0 3636 -espresso iso 03.25 69124 3.16 0.09 0 30105 +espresso je 02.51 4872 2.50 0.01 0 540 +espresso mi 02.43 3032 2.42 0.01 0 3630 +espresso iso 03.16 69608 3.07 0.07 0 30334 -barnes je 01.71 59916 1.68 0.02 0 16684 -barnes mi 01.64 57864 1.61 0.02 0 16550 -barnes iso 01.65 74968 1.61 0.03 0 20851 +barnes je 01.71 59900 1.67 0.03 0 16686 +barnes mi 01.65 57672 1.62 0.02 0 16550 +barnes iso 01.65 74812 1.62 0.03 0 20849 -gs je 00.15 37756 0.13 0.01 0 5812 -gs mi 00.15 33668 0.14 0.01 0 5110 -gs iso 00.23 67960 0.16 0.06 0 18846 +gs je 00.17 37748 0.15 0.01 0 5814 +gs mi 00.16 33888 0.14 0.01 0 5109 +gs iso 00.22 68136 0.15 0.06 0 18916 -larsonN je 1.153 269184 98.81 1.00 0 419378 -larsonN mi 1.037 301044 99.34 0.41 0 83267 -larsonN iso 1304.061 121072 6.10 70.16 0 30031 +larsonN je 1.188 261884 98.91 0.92 0 421848 +larsonN mi 1.016 299752 99.53 0.38 0 80202 +larsonN iso 1328.904 121096 6.15 69.78 0 30219 -rocksdb je 02.49 162976 2.09 0.60 0 38215 -rocksdb mi 02.22 160392 1.86 0.54 0 37563 -rocksdb iso 02.87 197548 2.58 0.59 0 46899 +rocksdb je 02.46 162340 2.05 0.63 0 38383 +rocksdb mi 02.33 160156 1.92 0.63 0 37585 +rocksdb iso 02.96 195948 2.64 0.66 0 46584 -redis je 3.319 9484 0.14 0.02 0 1540 -redis mi 2.840 7124 0.12 0.02 0 1254 -redis iso 7.340 49712 0.34 0.04 0 14959 +redis je 3.160 9492 0.13 0.02 0 1528 +redis mi 2.780 7084 0.12 0.02 0 1257 +redis iso 7.579 50516 0.35 0.05 0 15187 ``` IsoAlloc isn't quite ready for performance sensitive server workloads. 
However it's more than fast enough for client side mobile/desktop applications with risky C/C++ attack surfaces. These environments have threat models similar to what IsoAlloc was designed for. diff --git a/include/iso_alloc_internal.h b/include/iso_alloc_internal.h index 1f9fa1a..647f4d7 100644 --- a/include/iso_alloc_internal.h +++ b/include/iso_alloc_internal.h @@ -376,7 +376,7 @@ INTERNAL_HIDDEN INLINE void populate_zone_cache(iso_alloc_zone_t *zone); INTERNAL_HIDDEN INLINE void flush_chunk_quarantine(void); INTERNAL_HIDDEN INLINE void clear_zone_cache(void); INTERNAL_HIDDEN iso_alloc_big_zone_t *iso_find_big_zone(void *p, bool remove); -INTERNAL_HIDDEN iso_alloc_zone_t *is_zone_usable(iso_alloc_zone_t *zone, size_t size); +INTERNAL_HIDDEN FLATTEN iso_alloc_zone_t *is_zone_usable(iso_alloc_zone_t *zone, size_t size); INTERNAL_HIDDEN iso_alloc_zone_t *find_suitable_zone(size_t size); INTERNAL_HIDDEN iso_alloc_zone_t *iso_new_zone(size_t size, bool internal); INTERNAL_HIDDEN iso_alloc_zone_t *_iso_new_zone(size_t size, bool internal, int32_t index); @@ -385,7 +385,7 @@ INTERNAL_HIDDEN iso_alloc_zone_t *iso_find_zone_range(void *p); INTERNAL_HIDDEN iso_alloc_zone_t *search_chunk_lookup_table(const void *p); INTERNAL_HIDDEN bit_slot_t iso_scan_zone_free_slot_slow(iso_alloc_zone_t *zone); INTERNAL_HIDDEN bit_slot_t iso_scan_zone_free_slot(iso_alloc_zone_t *zone); -INTERNAL_HIDDEN bit_slot_t get_next_free_bit_slot(iso_alloc_zone_t *zone); +INTERNAL_HIDDEN INLINE bit_slot_t get_next_free_bit_slot(iso_alloc_zone_t *zone); INTERNAL_HIDDEN iso_alloc_root *iso_alloc_new_root(void); INTERNAL_HIDDEN bool is_pow2(uint64_t sz); INTERNAL_HIDDEN bool _is_zone_retired(iso_alloc_zone_t *zone); @@ -416,7 +416,7 @@ INTERNAL_HIDDEN void *_untag_ptr(void *p, iso_alloc_zone_t *zone); INTERNAL_HIDDEN void _free_big_zone_list(iso_alloc_big_zone_t *head); INTERNAL_HIDDEN ASSUME_ALIGNED void *_iso_big_alloc(size_t size); INTERNAL_HIDDEN ASSUME_ALIGNED void *_iso_alloc(iso_alloc_zone_t 
*zone, size_t size); -INTERNAL_HIDDEN ASSUME_ALIGNED void *_iso_alloc_bitslot_from_zone(bit_slot_t bitslot, iso_alloc_zone_t *zone); +INTERNAL_HIDDEN INLINE ASSUME_ALIGNED void *_iso_alloc_bitslot_from_zone(bit_slot_t bitslot, iso_alloc_zone_t *zone); INTERNAL_HIDDEN ASSUME_ALIGNED void *_iso_calloc(size_t nmemb, size_t size); INTERNAL_HIDDEN void *_iso_alloc_ptr_search(void *n, bool poison); INTERNAL_HIDDEN INLINE uint64_t us_rand_uint64(uint64_t *seed); diff --git a/src/iso_alloc.c b/src/iso_alloc.c index c8177d4..0a1d7ff 100644 --- a/src/iso_alloc.c +++ b/src/iso_alloc.c @@ -577,7 +577,7 @@ INTERNAL_HIDDEN void fill_free_bit_slots(iso_alloc_zone_t *zone) { } bit_slot_t *free_bit_slots = zone->free_bit_slots; - __iso_memset(free_bit_slots, BAD_BIT_SLOT, ZONE_FREE_LIST_SZ); + __iso_memset(free_bit_slots, BAD_BIT_SLOT, sizeof(zone->free_bit_slots)); zone->free_bit_slots_usable = 0; free_bit_slot_t free_bit_slots_index; @@ -668,9 +668,9 @@ INTERNAL_HIDDEN INLINE void insert_free_bit_slot(iso_alloc_zone_t *zone, int64_t zone->is_full = false; } -INTERNAL_HIDDEN bit_slot_t get_next_free_bit_slot(iso_alloc_zone_t *zone) { +INTERNAL_HIDDEN INLINE bit_slot_t get_next_free_bit_slot(iso_alloc_zone_t *zone) { if(zone->free_bit_slots_usable >= ZONE_FREE_LIST_SZ || - zone->free_bit_slots_usable > zone->free_bit_slots_index) { + zone->free_bit_slots_usable >= zone->free_bit_slots_index) { return BAD_BIT_SLOT; } @@ -783,7 +783,7 @@ INTERNAL_HIDDEN bit_slot_t iso_scan_zone_free_slot_slow(iso_alloc_zone_t *zone) return BAD_BIT_SLOT; } -INTERNAL_HIDDEN iso_alloc_zone_t *is_zone_usable(iso_alloc_zone_t *zone, size_t size) { +INTERNAL_HIDDEN FLATTEN iso_alloc_zone_t *is_zone_usable(iso_alloc_zone_t *zone, size_t size) { #if CPU_PIN if(zone->cpu_core != _iso_getcpu()) { return false; @@ -947,7 +947,7 @@ INTERNAL_HIDDEN iso_alloc_zone_t *find_suitable_zone(size_t size) { return NULL; } -INTERNAL_HIDDEN ASSUME_ALIGNED void *_iso_alloc_bitslot_from_zone(bit_slot_t bitslot, 
iso_alloc_zone_t *zone) { +INTERNAL_HIDDEN INLINE ASSUME_ALIGNED void *_iso_alloc_bitslot_from_zone(bit_slot_t bitslot, iso_alloc_zone_t *zone) { const bitmap_index_t dwords_to_bit_slot = (bitslot >> BITS_PER_QWORD_SHIFT); const int64_t which_bit = WHICH_BIT(bitslot); @@ -1019,7 +1019,7 @@ INTERNAL_HIDDEN INLINE void populate_zone_cache(iso_alloc_zone_t *zone) { _zone_cache_count++; } else { /* Evict oldest entry (index 0) via FIFO: shift all entries down by one */ - memmove(&tzc[0], &tzc[1], (ZONE_CACHE_SZ - 1) * sizeof(_tzc)); + __iso_memmove(&tzc[0], &tzc[1], (ZONE_CACHE_SZ - 1) * sizeof(_tzc)); _zone_cache_count = ZONE_CACHE_SZ - 1; tzc[_zone_cache_count].zone = zone; tzc[_zone_cache_count].chunk_size = zone->chunk_size; @@ -1030,7 +1030,7 @@ INTERNAL_HIDDEN INLINE void populate_zone_cache(iso_alloc_zone_t *zone) { } INTERNAL_HIDDEN ASSUME_ALIGNED void *_iso_calloc(size_t nmemb, size_t size) { - unsigned int res; + size_t res; if(size < SMALLEST_CHUNK_SZ) { size = SMALLEST_CHUNK_SZ; @@ -1038,24 +1038,22 @@ INTERNAL_HIDDEN ASSUME_ALIGNED void *_iso_calloc(size_t nmemb, size_t size) { size = ALIGN_SZ_UP(size); } - size_t sz = nmemb * size; - - if(sz < size || UNLIKELY(__builtin_mul_overflow(nmemb, size, &res))) { - LOG("Call to calloc() will overflow nmemb=%d size=%d = %u", nmemb, size, nmemb * size); + if(UNLIKELY(__builtin_mul_overflow(nmemb, size, &res)) || UNLIKELY(res > BIG_SZ_MAX)) { + LOG("Call to calloc() will overflow nmemb=%zu size=%zu", nmemb, size); return NULL; } - void *p = _iso_alloc(NULL, sz); + void *p = _iso_alloc(NULL, res); #if NO_ZERO_ALLOCATIONS /* Without this check we would immediately segfault in * the call to __iso_memset() to zeroize the chunk */ - if(UNLIKELY(sz == 0)) { + if(UNLIKELY(res == 0)) { return p; } #endif - __iso_memset(p, 0x0, sz); + __iso_memset(p, 0x0, res); return p; } @@ -1120,8 +1118,8 @@ INTERNAL_HIDDEN ASSUME_ALIGNED void *_iso_alloc(iso_alloc_zone_t *zone, size_t s } #if ALLOC_SANITY - /* We don't sample if we are 
allocating from a private zone */ - if(zone != NULL && zone->internal == true) { + /* We only sample if a zone was not directly passed */ + if(zone != NULL) { if(size < g_page_size && _sane_sampled < MAX_SANE_SAMPLES) { /* If we chose to sample this allocation then * _iso_alloc_sample will call UNLOCK_ROOT() */ @@ -1760,7 +1758,8 @@ INTERNAL_HIDDEN iso_alloc_big_zone_t *iso_find_big_zone(void *p, bool remove) { LOCK_BIG_ZONE_USED(); if(_root->big_zone_used == NULL) { - LOG_AND_ABORT("There are no big zones allocated"); + UNLOCK_BIG_ZONE_USED(); + return NULL; } iso_alloc_big_zone_t *big_zone = _root->big_zone_used; diff --git a/src/iso_alloc_interfaces.c b/src/iso_alloc_interfaces.c index 65d4c9b..f87afcb 100644 --- a/src/iso_alloc_interfaces.c +++ b/src/iso_alloc_interfaces.c @@ -95,13 +95,13 @@ EXTERNAL_API FLATTEN NO_DISCARD REALLOC_SIZE ASSUME_ALIGNED void *iso_realloc(vo } EXTERNAL_API FLATTEN NO_DISCARD MALLOC_ATTR REALLOC_SIZE ASSUME_ALIGNED void *iso_reallocarray(void *p, size_t nmemb, size_t size) { - unsigned int res; + size_t res; - if(__builtin_mul_overflow(nmemb, size, &res)) { + if(__builtin_mul_overflow(nmemb, size, &res) || res > BIG_SZ_MAX) { return NULL; } - return iso_realloc(p, nmemb * size); + return iso_realloc(p, res); } EXTERNAL_API FLATTEN NO_DISCARD ASSUME_ALIGNED char *iso_strdup(const char *str) { diff --git a/src/iso_alloc_util.c b/src/iso_alloc_util.c index 132780e..fadd5a3 100644 --- a/src/iso_alloc_util.c +++ b/src/iso_alloc_util.c @@ -153,6 +153,22 @@ void *mmap_pages(size_t size, bool populate, const char *name, int32_t prot) { p = mmap(p, sz, prot, flags, fd, 0); +#if __linux__ && MAP_HUGETLB && HUGE_PAGES + /* If the huge page allocation failed, retry with regular pages. + * This can happen when /proc/sys/vm/nr_hugepages is 0 or + * exhausted, which is common in LD_PRELOAD environments. 
*/ + if(p == MAP_FAILED && (flags & MAP_HUGETLB)) { + flags &= ~MAP_HUGETLB; + p = mmap(p, sz, prot, flags, fd, 0); + } +#elif __APPLE__ && VM_FLAGS_SUPERPAGE_SIZE_2MB && HUGE_PAGES + /* Same fallback for macOS superpage allocations */ + if(p == MAP_FAILED && fd == VM_FLAGS_SUPERPAGE_SIZE_2MB) { + fd = -1; + p = mmap(p, sz, prot, flags, fd, 0); + } +#endif + if(p == MAP_FAILED) { LOG_AND_ABORT("Failed to mmap rw pages"); }