From c92838aa345c75a5116bd54080b01ee93fac60be Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Mon, 25 May 2026 16:02:06 +0100 Subject: [PATCH 01/15] Fix double base-adjust in FlatPagemap::get_mut get_mut base-adjusted p before calling register_range, which then re-applied the base subtraction internally and tripped its out-of-range guard for legitimate in-range addresses. The path is reachable on PALs without LazyCommit (e.g. PALNoAlloc) when get/get_mut is called on an in-range address of a bounded pagemap. Move the register_range call before the p = p - base adjust so it sees the un-adjusted address that its bounds check expects. Add a regression test in func-pagemap that wraps DefaultPal with a stub stripping LazyCommit; this exercises the previously-broken path. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/snmalloc/ds/pagemap.h | 9 +++++++- src/test/func/pagemap/pagemap.cc | 38 ++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/src/snmalloc/ds/pagemap.h b/src/snmalloc/ds/pagemap.h index 2ee3cdd29..983b82e83 100644 --- a/src/snmalloc/ds/pagemap.h +++ b/src/snmalloc/ds/pagemap.h @@ -343,17 +343,24 @@ namespace snmalloc PAL::error("Internal error: Pagemap read access out of range."); } } - p = p - base; } // If this is potentially_out_of_range, then the pages will not have // been mapped. With Lazy commit they will at least be mapped read-only // Note that: this means external pointer on Windows will be slow. + // register_range takes an unadjusted address: it does its own + // base-relative arithmetic when has_bounds, so it must be called + // before the p = p - base adjustment below. 
if constexpr (potentially_out_of_range && !pal_supports) { register_range(p, 1); } + if constexpr (has_bounds) + { + p = p - base; + } + if constexpr (potentially_out_of_range) return body_opt[p >> SHIFT]; else diff --git a/src/test/func/pagemap/pagemap.cc b/src/test/func/pagemap/pagemap.cc index 7a03fa1a7..f93f64840 100644 --- a/src/test/func/pagemap/pagemap.cc +++ b/src/test/func/pagemap/pagemap.cc @@ -14,6 +14,17 @@ using namespace snmalloc; static constexpr size_t GRANULARITY_BITS = 20; +/** + * Test PAL that wraps DefaultPal but strips LazyCommit from pal_features. + * Used to exercise the get code path that calls register_range on + * a bounded pagemap — see test_get_potentially_out_of_range_bounded below. + */ +struct NoLazyCommitPal : public DefaultPal +{ + static constexpr uint64_t pal_features = + DefaultPal::pal_features & ~static_cast(LazyCommit); +}; + struct T { size_t v = 99; @@ -27,6 +38,9 @@ FlatPagemap pagemap_test_unbound; FlatPagemap pagemap_test_bound; +FlatPagemap + pagemap_test_bound_no_lazy; + size_t failure_count = 0; void check_get( @@ -158,6 +172,30 @@ int main(int argc, char** argv) test_pagemap(false); test_pagemap(true); + // Regression test for the bounded + !LazyCommit path of get. + // Previously, get_mut base-adjusted p before calling register_range, + // which double-subtracted base inside register_range and tripped the + // out-of-range guard for legitimate in-range addresses. + { + auto size = bits::one_at_bit(GRANULARITY_BITS + 4); + auto* base = NoLazyCommitPal::reserve(size); + NoLazyCommitPal::notify_using(base, size); + auto [heap_base, heap_size] = pagemap_test_bound_no_lazy.init(base, size); + auto low = address_cast(heap_base); + + pagemap_test_bound_no_lazy.set(low, T(7)); + + // get with has_bounds && !LazyCommit must not error on an in-range + // address: the underlying register_range call must see the un-adjusted + // address, since it performs its own base subtraction.
+ T value = pagemap_test_bound_no_lazy.get(low); + if (value.v != 7) + { + std::cout << "get bounded !LazyCommit: read " << value.v + << " expected 7" << std::endl; + failure_count++; + } + } + if (failure_count != 0) { std::cout << "Failure count: " << failure_count << std::endl; From eebd3d2531d093763ccc93d63e6e4805f75870d2 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Fri, 12 Jun 2026 14:16:28 +0100 Subject: [PATCH 02/15] Fix compile-time aligned alloc/dealloc asymmetry snmalloc::alloc() applies aligned_size(align, size) internally; snmalloc::dealloc(p) did not. When the alignment upgrade pushed the reservation into a different sizeclass than `size`, check_size fired under the check flavour. Reproducer: alloc<33*1024, _, 128*1024>(); dealloc<33*1024>(p) => "Dealloc rounded size mismatch: 0xa000 != 0x20000". Merge dealloc into a single template `dealloc` applying aligned_size(align, size) before check_size. The default align=1 preserves existing one-argument-template behaviour because aligned_size(1, size) == size. Move aligned_size from sizeclasstable.h to sizeclassstatic.h so the test library header can use it without pulling in the full runtime sizeclass machinery. Existing consumers still get it transitively via the pal.h -> ds_core.h -> sizeclassstatic.h include chain. Mirror the merge in the test library header: dealloc and alloc. Add aligned_dealloc to TESTLIB_ONLY_TESTS. Includes src/test/func/aligned_dealloc/ with the canonical reproducer and additional (S, A) pairs. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- CMakeLists.txt | 2 +- src/snmalloc/ds/sizeclasstable.h | 37 -------- src/snmalloc/ds_core/sizeclassstatic.h | 47 ++++++++++ src/snmalloc/global/globalalloc.h | 11 ++- .../func/aligned_dealloc/aligned_dealloc.cc | 90 +++++++++++++++++++ src/test/snmalloc_testlib.h | 21 +++-- 6 files changed, 162 insertions(+), 46 deletions(-) create mode 100644 src/test/func/aligned_dealloc/aligned_dealloc.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index f49447a8a..a349e9727 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -548,7 +548,7 @@ if(NOT SNMALLOC_HEADER_ONLY_LIBRARY) # These are mitigation-independent and can be compiled once, then linked # against both fast and check testlib variants. set(TESTLIB_ONLY_TESTS - bits first_operation memory memory_usage multi_atexit multi_threadatexit + aligned_dealloc bits first_operation memory memory_usage multi_atexit multi_threadatexit redblack statistics teardown contention external_pointer large_alloc lotsofthreads post_teardown singlethread startup diff --git a/src/snmalloc/ds/sizeclasstable.h b/src/snmalloc/ds/sizeclasstable.h index 5db3cb5fa..80b6c9211 100644 --- a/src/snmalloc/ds/sizeclasstable.h +++ b/src/snmalloc/ds/sizeclasstable.h @@ -510,41 +510,4 @@ namespace snmalloc return 1; return bits::one_at_bit(bits::ctz(rsize)); } - - constexpr SNMALLOC_FAST_PATH static size_t - aligned_size(size_t alignment, size_t size) - { - // Client responsible for checking alignment is not zero - SNMALLOC_ASSERT(alignment != 0); - // Client responsible for checking alignment is a power of two - SNMALLOC_ASSERT(bits::is_pow2(alignment)); - - // There are a class of corner cases to consider - // alignment = 0x8 - // size = 0xfff...fff7 - // for this result will be 0. This should fail an allocation, so we need to - // check for this overflow. - // However, - // alignment = 0x8 - // size = 0x0 - // will also result in 0, but this should be allowed to allocate. 
- // So we need to check for overflow, and return SIZE_MAX in this first case, - // and 0 in the second. - size_t result = ((alignment - 1) | (size - 1)) + 1; - // The following code is designed to fuse well with a subsequent - // sizeclass calculation. We use the same fast path constant to - // move the case where result==0 to the slow path, and then check for which - // case we are in. - if (is_small_sizeclass(result)) - return result; - - // We are in the slow path, so we need to check for overflow. - if (SNMALLOC_UNLIKELY(result == 0)) - { - // Check for overflow and return the maximum size. - if (SNMALLOC_UNLIKELY(result < size)) - return SIZE_MAX; - } - return result; - } } // namespace snmalloc diff --git a/src/snmalloc/ds_core/sizeclassstatic.h b/src/snmalloc/ds_core/sizeclassstatic.h index 011f69830..cf66851bc 100644 --- a/src/snmalloc/ds_core/sizeclassstatic.h +++ b/src/snmalloc/ds_core/sizeclassstatic.h @@ -74,4 +74,51 @@ namespace snmalloc return (size - 1) < sizeclass_to_size_const(smallsizeclass_t(NUM_SMALL_SIZECLASSES - 1)); } + + /** + * @brief Round `size` up so the resulting allocation can satisfy + * the requested `alignment`. `alignment` must be a non-zero power + * of two. + * + * Lives in sizeclassstatic.h (not sizeclasstable.h) so it is + * available to compile-time-only consumers — notably the test + * library header — without pulling in the full runtime sizeclass + * machinery. + */ + constexpr SNMALLOC_FAST_PATH size_t + aligned_size(size_t alignment, size_t size) + { + // Client responsible for checking alignment is not zero + SNMALLOC_ASSERT(alignment != 0); + // Client responsible for checking alignment is a power of two + SNMALLOC_ASSERT(bits::is_pow2(alignment)); + + // There are a class of corner cases to consider + // alignment = 0x8 + // size = 0xfff...fff7 + // for this result will be 0. This should fail an allocation, so we need to + // check for this overflow. 
+ // However, + // alignment = 0x8 + // size = 0x0 + // will also result in 0, but this should be allowed to allocate. + // So we need to check for overflow, and return SIZE_MAX in this first case, + // and 0 in the second. + size_t result = ((alignment - 1) | (size - 1)) + 1; + // The following code is designed to fuse well with a subsequent + // sizeclass calculation. We use the same fast path constant to + // move the case where result==0 to the slow path, and then check for which + // case we are in. + if (is_small_sizeclass(result)) + return result; + + // We are in the slow path, so we need to check for overflow. + if (SNMALLOC_UNLIKELY(result == 0)) + { + // Check for overflow and return the maximum size. + if (SNMALLOC_UNLIKELY(result < size)) + return SIZE_MAX; + } + return result; + } } // namespace snmalloc diff --git a/src/snmalloc/global/globalalloc.h b/src/snmalloc/global/globalalloc.h index 7607e582a..5b428e2f1 100644 --- a/src/snmalloc/global/globalalloc.h +++ b/src/snmalloc/global/globalalloc.h @@ -380,10 +380,17 @@ namespace snmalloc ThreadAlloc::get().dealloc(p); } - template + /** + * Compile-time sized dealloc. The optional `align` parameter mirrors + * the `align` parameter on `alloc` so the + * sized-dealloc sanity check sees the size that was actually + * reserved (post `aligned_size`), not the raw requested `size`. + */ + template SNMALLOC_FAST_PATH_INLINE void dealloc(void* p) { - check_size(p, size); + constexpr size_t sz = aligned_size(align, size); + check_size(p, sz); ThreadAlloc::get().dealloc(p); } diff --git a/src/test/func/aligned_dealloc/aligned_dealloc.cc b/src/test/func/aligned_dealloc/aligned_dealloc.cc new file mode 100644 index 000000000..51646e39c --- /dev/null +++ b/src/test/func/aligned_dealloc/aligned_dealloc.cc @@ -0,0 +1,90 @@ +/** + * Regression test for the compile-time aligned alloc/dealloc API. 
+ * + * `snmalloc::alloc()` applies + * `aligned_size(align, size)` internally so the underlying reservation + * is large enough to satisfy `align`. The matching + * `snmalloc::dealloc(p)` overload mirrors that: it applies + * the same `aligned_size` before `check_size`, so the size fed to the + * sized-dealloc sanity check is the size that was actually reserved. + * + * Without the aligned dealloc overload, callers either had to use the + * unsized `dealloc(p)` or manually pass `dealloc(p)`. Calling `dealloc(p)` instead trips `check_size` + * under `mitigations(sanity_checks)` whenever the alignment upgrade + * pushes the reservation into a different sizeclass than `size` + * itself (e.g. `S = 33 KiB`, `A = 128 KiB`: the reservation lives in + * a 128 KiB sizeclass but `check_size` evaluates + * `size_to_sizeclass_full(33 KiB)`, a smaller class). + */ + +#include "test/setup.h" +#include "test/snmalloc_testlib.h" + +#include + +using namespace snmalloc; + +namespace +{ + bool any_failures = false; + + void fail(const char* msg) + { + std::cout << "FAIL: " << msg << std::endl; + any_failures = true; + } + + template + void check_round_trip(const char* label) + { + void* p = snmalloc::alloc(); + if (p == nullptr) + { + fail(label); + return; + } + constexpr size_t reserved = aligned_size(align, size); + if (alloc_size(p) < reserved) + { + std::cout << " reservation too small: alloc_size=" << alloc_size(p) + << " expected>=" << reserved << std::endl; + fail(label); + return; + } + snmalloc::dealloc(p); + } +} // namespace + +int main(int, char**) +{ + setup(); + + // The canonical pre-existing reproducer: today's pow2 rounding maps + // 33 KiB to one large sizeclass while the alignment-driven + // reservation lands in a strictly larger one. + check_round_trip<33 * 1024, 128 * 1024>("S=33KiB A=128KiB"); + + // Small-to-large alignment upgrade. + check_round_trip<48, 64 * 1024>("S=48B A=64KiB"); + + // Wider gap between requested size and required alignment. 
+ check_round_trip<17 * 1024, 256 * 1024>("S=17KiB A=256KiB"); + + // align == size: alloc and dealloc see the same value pre- and + // post-aligned_size; serves as a baseline that the overload + // doesn't pessimise the simple case. + check_round_trip<64 * 1024, 64 * 1024>("S=64KiB A=64KiB"); + + // Small allocation, natural alignment. + check_round_trip<32, 32>("S=32B A=32B"); + + if (any_failures) + { + std::cout << "aligned_dealloc test FAILED" << std::endl; + return 1; + } + + std::cout << "aligned_dealloc test passed" << std::endl; + return 0; +} diff --git a/src/test/snmalloc_testlib.h b/src/test/snmalloc_testlib.h index 5b51ff7bd..00b0513e4 100644 --- a/src/test/snmalloc_testlib.h +++ b/src/test/snmalloc_testlib.h @@ -41,10 +41,18 @@ namespace snmalloc void dealloc(void* p, size_t size); void dealloc(void* p, size_t size, size_t align); - template + /** + * Compile-time sized dealloc with optional alignment. + * + * The `align` parameter mirrors the `align` parameter on the + * `alloc` overload below: it is applied via + * `aligned_size` so the size fed to the sized-dealloc sanity check + * matches the size that was actually reserved. + */ + template inline void dealloc(void* p) { - dealloc(p, size); + dealloc(p, aligned_size(align, size)); } void debug_teardown(); @@ -115,12 +123,13 @@ namespace snmalloc * goes straight to the sizeclass-based fast path. Otherwise falls back * to the dynamic alloc.
*/ - template + template inline void* alloc() { - if constexpr (is_small_sizeclass(size)) + constexpr size_t sz = aligned_size(align, size); + if constexpr (is_small_sizeclass(sz)) { - constexpr auto sc = size_to_sizeclass_const(size); + constexpr auto sc = size_to_sizeclass_const(sz); if constexpr (zero_mem == ZeroMem::YesZero) { return libc::malloc_small_zero(sc); @@ -132,7 +141,7 @@ namespace snmalloc } else { - return alloc(size); + return alloc(sz); } } } // namespace snmalloc From 5a5a0c1b6f412c09566995d2cc6dedf0a0a19b54 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Wed, 20 May 2026 11:08:12 +0100 Subject: [PATCH 03/15] Add ArenaBins: bin scheme, per-sc tables, and bitmap Introduces src/snmalloc/backend_helpers/arenabins.h, which owns the chunk-unit size-class scheme and the non-empty-bins bitmap that later commits will use to drive bin selection inside Arena. Public surface (the integration contract for later commits): * range_t, carve_t, carve(block, n_chunks), max_supported_chunks(). * Nested Bitmap with add(block), find_for_request(n_chunks), clear(bin_id), and TOTAL_BINS. Everything else (the size-class encoding, the per-SC tables, the free-side classifier bin_index) is private. The unit test reaches it via a friend struct ArenaBinsTestAccess that is only forward-declared in the header and defined in the test translation unit, so the production header carries no test-only surface. Implementation: * Two power-of-two-sized rodata tables indexed by raw sc id with shift+add. bitmap_info_t (4 words via alignas) feeds Bitmap::find_for_request; carve_info_t (2 words) feeds carve and the free-side cascade-fit predicate. * bitmap_info_t fields (start_word, first_mask, second_mask) are pre-shifted into the bitmap's word layout so find_for_request is two ANDs on the hot word + word-boundary fall-through. 
* Tables are populated at constexpr build time by BinTable() consuming the canonical bin_subsets table; the strict-chain invariant on bin_subsets is checked at compile time via throw in the constexpr constructor. * Fast path uses the runtime CLZ intrinsic via the new bits::to_exp_mant (paired with the existing to_exp_mant_const); the _const variant is restricted to constexpr table construction and test static_asserts. bits::prev_pow2_bits / prev_pow2_bits_const are added alongside for symmetric runtime / constexpr access. The new test cross-checks bin classification, carve, and find_for_request against a brute-force scanner derived directly from bin_subsets, for B in {1, 2, 3}. Exhaustive single-bit and multi-bit randomised bitmap states are covered, plus word-boundary straddle cases enumerated automatically from the table. No production code path is changed: ArenaBins is unused in the build until later commits compose it into Arena. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- CMakeLists.txt | 3 +- src/snmalloc/backend_helpers/arenabins.h | 756 ++++++++++++++ src/snmalloc/ds_core/bits.h | 44 + src/test/func/arenabins/arenabins.cc | 1220 ++++++++++++++++++++++ 4 files changed, 2022 insertions(+), 1 deletion(-) create mode 100644 src/snmalloc/backend_helpers/arenabins.h create mode 100644 src/test/func/arenabins/arenabins.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index a349e9727..009efebb3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -548,7 +548,8 @@ if(NOT SNMALLOC_HEADER_ONLY_LIBRARY) # These are mitigation-independent and can be compiled once, then linked # against both fast and check testlib variants. 
set(TESTLIB_ONLY_TESTS - aligned_dealloc bits first_operation memory memory_usage multi_atexit multi_threadatexit + aligned_dealloc arenabins + bits first_operation memory memory_usage multi_atexit multi_threadatexit redblack statistics teardown contention external_pointer large_alloc lotsofthreads post_teardown singlethread startup diff --git a/src/snmalloc/backend_helpers/arenabins.h b/src/snmalloc/backend_helpers/arenabins.h new file mode 100644 index 000000000..a213650ed --- /dev/null +++ b/src/snmalloc/backend_helpers/arenabins.h @@ -0,0 +1,756 @@ +#pragma once + +#include "../ds_core/bits.h" +#include "../ds_core/helpers.h" + +#include + +namespace snmalloc +{ + template + struct ArenaBinsTestAccess; + + /** + * Chunk size class enumeration and bin classification used by the + * Arena. + * + * Template parameter B (mantissa-bit width of snmalloc's + * non-power-of-two size class scheme) determines the number of + * RB-trees per exponent — the count of distinct servable subsets a + * free block can occupy at that exponent: B=1 -> 2; B=2 -> 5; + * B=3 -> 13. The canonical within-exponent bin numbering matches + * `prototype/skip_analysis.py`. All bin-scheme metadata derives + * constexpr from a single per-bin subsets table, `bin_subsets`. + * + * Public surface: + * - `range_t`, `carve_t`: chunk-count ranges and carve output. + * - `carve(block, n_chunks)`: split a block into pre-pad / aligned + * request / post-pad. + * - `max_supported_chunks()`: upper bound on legal request sizes. + * - nested `Bitmap`: per-arena non-empty-bins bitmap with + * `add` / `find_for_request` / `clear`. + * + * Everything else is private; tests reach it via + * `ArenaBinsTestAccess`. + */ + template + class ArenaBins + { + static_assert( + INTERMEDIATE_BITS >= 1 && INTERMEDIATE_BITS <= 3, + "ArenaBins currently supports B in {1, 2, 3}"); + + public: + /// (base, size) chunk-count range. `size == 0` means empty (base + /// is unspecified). 
+ struct range_t + { + size_t base; + size_t size; + }; + + /// Output of `carve`: pre-pad / aligned request / post-pad. + /// Either or both of `pre`/`post` may be empty. + struct carve_t + { + range_t pre; + range_t req; + range_t post; + }; + + private: + friend struct ArenaBinsTestAccess; + + static constexpr size_t B = INTERMEDIATE_BITS; + + /// Number of mantissa positions per regular exponent (= 2^B). + static constexpr size_t MANTISSAS_PER_EXP = size_t(1) << B; + + /// Number of distinct servable-subset bins per exponent + /// (from prototype/skip_analysis.py). + static constexpr size_t BINS_PER_EXP = (B == 1) ? 2 : + (B == 2) ? 5 : + (B == 3) ? 13 : + 0; + + /// Size of the per-sc info tables. One past the largest raw id from + /// `bits::to_exp_mant_const` whose decoded size fits in + /// `size_t` (the architectural max raw id decodes to `2^bits::BITS`, + /// which overflows). + static constexpr size_t MAX_SC = + ((bits::BITS - B) << B) + ((size_t(1) << B) - 1); + + /** + * Per-SC bitmap-scan record, read by `Bitmap::find_for_request`. + * Fields are pre-shifted into the bitmap's word layout so the + * search hot path is two ANDs. + * + * - `start_word`: bitmap word containing this SC's start bin. + * - `first_mask`: serve mask pre-shifted into `start_word`. Bit + * `i` set iff `words_[start_word]` bit `i` serves this SC. + * - `second_mask`: serve mask carried into `start_word + 1`. When + * the start bin is word-aligned there is no within-exp carry + * and bits there are all higher-exponent, so `second_mask == ~0`. + * + * `alignas(4 * sizeof(size_t))` rounds `sizeof(bitmap_info_t)` up + * to a power of two so `table_.bitmap_info[sc]` indexes with a + * single shift+add. + * + * A *bin* (single bit in `Bitmap`) has no size/alignment of its + * own; it may be set on behalf of any SC whose subset includes it. 
+ */ + struct alignas(4 * sizeof(size_t)) bitmap_info_t + { + size_t start_word; + size_t first_mask; + size_t second_mask; + }; + + static_assert( + sizeof(bitmap_info_t) == 4 * sizeof(size_t), + "bitmap_info_t must be 4*size_t so table_.bitmap_info[sc] indexes " + "with a single shift+add; revisit the alignas if fields change"); + + /** + * Per-SC carve record, read by `carve` and by `bin_offset_at`'s + * `fits` predicate (free-side cascade walk via `bin_index`). + * + * - `size_chunks`: size this SC promises on allocation. + * - `align_chunks`: natural alignment (a power of two, derived + * from `size_chunks`). + */ + struct carve_info_t + { + size_t size_chunks; + size_t align_chunks; + }; + + static_assert( + sizeof(carve_info_t) == 2 * sizeof(size_t), + "carve_info_t must be 2*size_t so table_.carve_info[sc] indexes " + "with a single shift+add"); + + /** + * Map a request size to its bitmap-scan record. + * + * `n_chunks` must be in `[1, max_supported_chunks()]`. + * Not `constexpr`: uses `bits::clz` intrinsic via `bits::to_exp_mant` + * to stay single-cycle on the fast path. + */ + SNMALLOC_FAST_PATH static const bitmap_info_t& + bitmap_info_for_request(size_t n_chunks) + { + SNMALLOC_ASSERT(n_chunks >= 1); + SNMALLOC_ASSERT(n_chunks <= max_supported_chunks()); + size_t raw = bits::to_exp_mant(n_chunks); + SNMALLOC_ASSERT(raw < MAX_SC); + return table_.bitmap_info[raw]; + } + + /// Map a request size to its carve record. Preconditions and + /// properties as `bitmap_info_for_request`. + SNMALLOC_FAST_PATH static const carve_info_t& + carve_info_for_request(size_t n_chunks) + { + SNMALLOC_ASSERT(n_chunks >= 1); + SNMALLOC_ASSERT(n_chunks <= max_supported_chunks()); + size_t raw = bits::to_exp_mant(n_chunks); + SNMALLOC_ASSERT(raw < MAX_SC); + return table_.carve_info[raw]; + } + + /** + * Bin id of `block`. Operates on arbitrary chunk counts, not just + * exact size classes. `block.size` must be >= 1. 
+ * + * A bin id at exponent `e` identifies the *servable set*: the + * subset of SCs at `e` that `block` could serve. Two blocks with + * the same servable set at the same exponent share a bin id. + * + * The natural exponent is `e = prev_pow2_bits(block.size)`. If + * alignment padding eats every SC there, we drop to `e - 1`, + * which is guaranteed to fit: its smallest SC has size and + * alignment `2^(e-1)`, so worst-case `size + pad < 2^e <= + * block.size`. One drop is always enough. + * + * Not `constexpr`: uses `bits::clz` via `bits::prev_pow2_bits`. + */ + SNMALLOC_FAST_PATH static size_t bin_index(range_t block) + { + SNMALLOC_ASSERT(block.size >= 1); + + size_t e = bits::prev_pow2_bits(block.size); + size_t offset = bin_offset_at(block.base, block.size, e); + if (SNMALLOC_UNLIKELY(offset == BINS_PER_EXP)) + { + // Padding ate the natural exponent. Drop one and retry. Proof + // of single-step termination is in the doc comment above. + SNMALLOC_ASSERT(e > 0); + e--; + offset = bin_offset_at(block.base, block.size, e); + SNMALLOC_ASSERT(offset != BINS_PER_EXP); + } + return table_.exp_bin_base[e] + offset; + } + + public: + /// Largest `n_chunks` legal for `carve` / `Bitmap::find_for_request`. + static constexpr size_t max_supported_chunks() + { + return bits::from_exp_mant(MAX_SC - 1); + } + + /** + * Carve a free block into pre-pad / aligned request / post-pad. + * + * Preconditions (caller must have used `Bitmap::find_for_request` + * to locate a servable bin): + * - `block.size > 0`, `n_chunks` in `[1, max_supported_chunks()]`, + * `block` large enough to fit the SC after aligning up. + * - `block.base + block.size` does not wrap. + * + * Pure: does not touch the bitmap or any tree. Either or both + * `pre` / `post` may have `size == 0`; their `base` is still set + * to the natural address so `pre.base + pre.size == req.base` and + * `req.base + req.size == post.base` (keeps caller adjacency + * checks simple). 
+ */ + SNMALLOC_FAST_PATH static carve_t carve(range_t block, size_t n_chunks) + { + SNMALLOC_ASSERT(n_chunks >= 1); + SNMALLOC_ASSERT(n_chunks <= max_supported_chunks()); + SNMALLOC_ASSERT(block.size > 0); + // Combined with the servability precondition, non-wrapping end + // ensures the alignment-up below does not wrap either. + SNMALLOC_ASSERT(block.base + block.size >= block.base); + + const carve_info_t& info = carve_info_for_request(n_chunks); + + size_t req_base = + (block.base + (info.align_chunks - 1)) & ~(info.align_chunks - 1); + size_t pre_size = req_base - block.base; + + SNMALLOC_ASSERT(pre_size <= block.size); + SNMALLOC_ASSERT(block.size - pre_size >= info.size_chunks); + + size_t post_base = req_base + info.size_chunks; + size_t post_size = (block.base + block.size) - post_base; + + carve_t result; + result.pre = {block.base, pre_size}; + result.req = {req_base, info.size_chunks}; + result.post = {post_base, post_size}; + return result; + } + + /** + * Bitmap of non-empty per-arena bins. One bit per bin id + * (`bin_index`'s output); set iff the corresponding RB-tree is + * non-empty. + * + * Three-method API: + * - `add(range_t)`: classify a block and set its bin's bit + * (idempotent on the bit; returns the bin id). + * - `find_for_request(n_chunks)`: smallest set bin whose blocks + * all serve `n_chunks`, or `SIZE_MAX` if none. + * - `clear(bin_id)`: mark empty. Caller must ensure the bin's + * tree is actually empty; the bitmap does not track contents. + * + * Not thread-safe: callers sharing an arena must serialise the + * add / find / clear sequence under an external mutex. + */ + class Bitmap + { + friend struct ArenaBinsTestAccess; + + public: + /// Strict upper bound on bin ids `bin_index` produces. Exposed + /// so callers can size parallel arrays (one RB-tree per bin id). 
+ static constexpr size_t TOTAL_BINS = BINS_PER_EXP * bits::BITS; + + constexpr Bitmap() : words_{} {} + + /** + * Classify `block`, set its bin's bit, return the bin id. + * + * Idempotent on bitmap state: if the bit is already set, this + * is a no-op (the bin id is still returned). + * + * The bitmap does NOT track which `(base, size)` ranges live in + * each bin's tree — the caller is responsible for inserting + * `block` into the appropriate tree. + */ + SNMALLOC_FAST_PATH size_t add(range_t block) + { + SNMALLOC_ASSERT(block.size >= 1); + SNMALLOC_ASSERT(block.size <= max_supported_chunks()); + size_t bin_id = bin_index(block); + SNMALLOC_ASSERT(bin_id < TOTAL_BINS); + words_[bin_id / bits::BITS] |= + (size_t(1) << (bin_id & (bits::BITS - 1))); + return bin_id; + } + + /// Mark bin `bin_id` empty. Caller must ensure the bin's tree + /// is actually empty; the bitmap does not consult the trees. + SNMALLOC_FAST_PATH void clear(size_t bin_id) + { + SNMALLOC_ASSERT(bin_id < TOTAL_BINS); + words_[bin_id / bits::BITS] &= + ~(size_t(1) << (bin_id & (bits::BITS - 1))); + } + + /** + * Smallest bin id whose set blocks all serve `n_chunks`, or + * `SIZE_MAX` if none. `n_chunks` in `[1, max_supported_chunks()]`. + * + * Invariant (static_assert below): `BINS_PER_EXP <= bits::BITS`, + * so the within-exponent range fits inside one word and the + * search straddles at most one word boundary. After the second + * word, every remaining word is purely higher-exponent. + */ + SNMALLOC_FAST_PATH size_t find_for_request(size_t n_chunks) const + { + const bitmap_info_t& info = bitmap_info_for_request(n_chunks); + SNMALLOC_ASSERT(info.start_word < NUM_BITMAP_WORDS); + SNMALLOC_ASSUME(info.start_word < NUM_BITMAP_WORDS); + + // First word: start bin + any within-exp neighbours in same word. 
+ size_t word = info.start_word; + size_t bits = words_[word] & info.first_mask; + if (bits != 0) + return word * bits::BITS + bits::ctz(bits); + ++word; + if (word == NUM_BITMAP_WORDS) + return SIZE_MAX; + + // Second word: within-exp carry plus any higher-exp bits. + bits = words_[word] & info.second_mask; + if (bits != 0) + return word * bits::BITS + bits::ctz(bits); + + // Remaining words: purely higher-exponent, any bit serves. + while (++word < NUM_BITMAP_WORDS) + if (words_[word] != 0) + return word * bits::BITS + bits::ctz(words_[word]); + return SIZE_MAX; + } + + private: + /// Number of size_t words backing the bitmap. Internal layout. + static constexpr size_t NUM_BITMAP_WORDS = + (TOTAL_BINS + bits::BITS - 1) / bits::BITS; + + static_assert( + TOTAL_BINS == BINS_PER_EXP * bits::BITS, + "Bitmap layout: TOTAL_BINS must be BINS_PER_EXP * bits::BITS so it " + "divides evenly into bits::BITS-sized words."); + static_assert( + NUM_BITMAP_WORDS == BINS_PER_EXP, + "Bitmap layout: with the canonical TOTAL_BINS, the word count is " + "exactly BINS_PER_EXP."); + static_assert( + TOTAL_BINS < SIZE_MAX, + "find_for_request returns SIZE_MAX as the 'no match' sentinel; " + "TOTAL_BINS must be strictly less than SIZE_MAX so no valid bin " + "id can collide with the sentinel."); + static_assert( + BINS_PER_EXP <= bits::BITS, + "find_for_request assumes the within-exponent range (at most " + "BINS_PER_EXP bins) fits inside a single word, so the search " + "straddles at most one word boundary. If a future B pushes " + "BINS_PER_EXP above bits::BITS, the two-word body must be " + "generalised to handle a multi-word straddle."); + + size_t words_[NUM_BITMAP_WORDS]; + }; + + private: + // Vocabulary used in the rest of the private implementation: + // + // exponent (e) : the bin-scheme exponent of a size; one axis of + // the size class grid. + // mantissa (m) : the within-exponent position, in + // [0, MANTISSAS_PER_EXP). The other axis. 
+    //                  When passed as a single argument it is named `m`
+    //                  (e.g. `start_bin_offset_for_m(m)`).
+    //   subset       : a bitmask of mantissas. `bin_subsets[b]` is the
+    //                  set of mantissas bin offset `b` can serve.
+    //   m_top        : when discussing a particular bin, the maximum
+    //                  element of its subset. Used as the bucketing
+    //                  axis for the cascade (see `bin_offset_at`).
+    //   m_test       : a single-mantissa probe in a cascade step;
+    //                  chosen so the probe's outcome disambiguates
+    //                  one candidate bin from the rest.
+
+    /**
+     * Single source of truth for the bin scheme.
+     *
+     * `bin_subsets[b]` is a bitmask of the mantissas bin offset `b`
+     * can serve: bit `m` set iff bin offset `b`'s servable subset
+     * contains mantissa `m`. The canonical bin numbering matches
+     * `prototype/skip_analysis.py`. Everything else in this file --
+     * `start_bin_offset_for_m`, `serve_mask_for_m`, the per-SC
+     * `start_word` / `first_mask` / `second_mask`, and the per-m_top
+     * decision lists in `BinTable::cascade_steps` -- is derived
+     * (constexpr) from this table.
+     *
+     * Required invariant (checked at constexpr build time in
+     * `BinTable::BinTable`; violating it fails the build): for every
+     * `m_top`, the bins whose subset has `m_top` as max element form a
+     * strict containment chain when sorted by subset size descending.
+     * That is, the largest such subset properly contains the next,
+     * which properly contains the one after, and so on. The chain
+     * property is what makes the single-mantissa-probe cascade in
+     * `bin_offset_at` sufficient to disambiguate among them.
+     *
+     * If you edit the literals below, re-run
+     * `prototype/skip_analysis.py` to verify they still match the
+     * canonical numbering and chain property.
+     */
+    static constexpr ModArray bin_subsets = []() {
+      // Reading the binary literals: the least significant bit is
+      // mantissa 0, so e.g. 0b0110 is the subset {1,2}.
+      ModArray r{};
+      if constexpr (B == 1)
+      {
+        // bin 0: {0}
+        // bin 1: {0,1}
+        r[0] = 0b01;
+        r[1] = 0b11;
+      }
+      else if constexpr (B == 2)
+      {
+        // bin 0: {0}          bin 3: {0,1,2}
+        // bin 1: {1}          bin 4: {0,1,2,3}
+        // bin 2: {0,1}
+        r[0] = 0b0001;
+        r[1] = 0b0010;
+        r[2] = 0b0011;
+        r[3] = 0b0111;
+        r[4] = 0b1111;
+      }
+      else /* B == 3 */
+      {
+        // bin 0: {0}          bin 7:  {1,2,3,5}
+        // bin 1: {1}          bin 8:  {0,1,2,3,4}
+        // bin 2: {0,1}        bin 9:  {0,1,2,3,5}
+        // bin 3: {1,2}        bin 10: {0,1,2,3,4,5}
+        // bin 4: {0,1,2}      bin 11: {0,1,2,3,4,5,6}
+        // bin 5: {1,2,3}      bin 12: {0,1,2,3,4,5,6,7}
+        // bin 6: {0,1,2,3}
+        r[0] = 0b00000001;
+        r[1] = 0b00000010;
+        r[2] = 0b00000011;
+        r[3] = 0b00000110;
+        r[4] = 0b00000111;
+        r[5] = 0b00001110;
+        r[6] = 0b00001111;
+        r[7] = 0b00101110;
+        r[8] = 0b00011111;
+        r[9] = 0b00101111;
+        r[10] = 0b00111111;
+        r[11] = 0b01111111;
+        r[12] = 0b11111111;
+      }
+      return r;
+    }();
+
+    /**
+     * First within-exponent bin offset whose subset contains mantissa
+     * `m`. Derived from `bin_subsets`.
+     *
+     * Combined with the per-exponent base, this is an SC's absolute
+     * start bin index: `start_bit = exp_bin_base[e] +
+     * start_bin_offset_for_m(m)`. The bitmap stores its low and high
+     * halves pre-shifted into the `bitmap_info_t::first_mask` /
+     * `second_mask` fields.
+     */
+    static constexpr size_t start_bin_offset_for_m(size_t m)
+    {
+      size_t mask = size_t(1) << m;
+      // Scan in ascending bin-offset order; the first subset containing
+      // `m` is the start bin.
+      for (size_t b = 0; b < BINS_PER_EXP; b++)
+        if (bin_subsets[b] & mask)
+          return b;
+      return BINS_PER_EXP; // unreachable: every m is in some subset
+    }
+
+    /**
+     * Bitmask, relative to `start_bin_offset_for_m(m)`, of bins that
+     * serve `m`. Bit `k` is set iff bin offset
+     * `start_bin_offset_for_m(m) + k` serves a request whose
+     * within-exponent position is `m`.
+     * The start bin always serves (bit 0 set), within-exponent bins
+     * serve iff their subset contains `m`, and bins above the
+     * within-exponent range belong to higher exponents and always
+     * serve (high bits all 1).
+     *
+     * Built positively (set bit = "serve") rather than as a "skip"
+     * mask: the hot path in `Bitmap::find_for_request` AND's this
+     * mask (pre-shifted into `bitmap_info_t::first_mask` / `second_mask`)
+     * against the bitmap word without an intermediate NOT.
+     */
+    static constexpr size_t serve_mask_for_m(size_t m)
+    {
+      size_t mask = size_t(1) << m;
+      size_t start = start_bin_offset_for_m(m);
+      // Begin with every bit set and clear only the within-exponent
+      // bins above `start` whose subset lacks `m`; bit 0 and all bits at
+      // or beyond `BINS_PER_EXP - start` are therefore left set.
+      size_t result = ~size_t(0);
+      for (size_t b = start + 1; b < BINS_PER_EXP; b++)
+        if (!(bin_subsets[b] & mask))
+          result &= ~(size_t(1) << (b - start));
+      return result;
+    }
+
+    /// Constexpr popcount: small loop, used only at BinTable build time.
+    /// Returns the number of set bits in `x`.
+    static constexpr size_t popcount_const(size_t x)
+    {
+      size_t n = 0;
+      while (x != 0)
+      {
+        n += (x & 1);
+        x >>= 1;
+      }
+      return n;
+    }
+
+    /// One step of a per-m_top decision list used by `bin_offset_at`.
+    /// If `m_test == NO_TEST` (see below) or `fits(m_test)` is true,
+    /// return `bin`.
+    struct CascadeStep
+    {
+      /// Mantissa to probe with `fits`, or `NO_TEST` for the default step.
+      size_t m_test;
+      /// Within-exponent bin offset returned when this step matches.
+      size_t bin;
+    };
+
+    /// Sentinel for `CascadeStep::m_test` meaning "take this bin
+    /// unconditionally". Any value `>= MANTISSAS_PER_EXP` would do; the
+    /// fits() lambda would short-circuit it on `first + m >= past`, but
+    /// the explicit sentinel makes the walker's intent obvious and
+    /// avoids one unnecessary comparison.
+    static constexpr size_t NO_TEST = MANTISSAS_PER_EXP;
+
+    /**
+     * Maximum decision-list length per `m_top`. Derived from
+     * `bin_subsets`: the largest number of bins sharing the same max
+     * subset element. Used to size `cascade_steps[m_top][]`; some
+     * `m_top` values have fewer candidates, leaving default-initialised
+     * slots at the end. Those slots are never reached because the
+     * preceding NO_TEST entry always returns.
+     */
+    static constexpr size_t MAX_CASCADE_STEPS = []() {
+      size_t mx = 0;
+      for (size_t m_top = 0; m_top < MANTISSAS_PER_EXP; m_top++)
+      {
+        // Count the bins whose subset has `m_top` as its maximum.
+        size_t cnt = 0;
+        for (size_t b = 0; b < BINS_PER_EXP; b++)
+        {
+          // Bit m_top set and no higher bit set <=> max element is m_top.
+          if ((bin_subsets[b] >> m_top) == 1)
+            cnt++;
+        }
+        if (cnt > mx)
+          mx = cnt;
+      }
+      return mx;
+    }();
+
+    /**
+     * Within-exponent bin offset for a block at `addr_chunks` of length
+     * `n_chunks` at exponent `e`. Returns `BINS_PER_EXP` (sentinel) if
+     * no mantissa at this exponent fits.
+     *
+     * Walks `m_top` from `MANTISSAS_PER_EXP - 1` down. The first
+     * fitting `m_top` is the largest mantissa this block can serve;
+     * it is also the natural bucketing axis, because the bins whose
+     * subset has `m_top` as max element are exactly the candidates we
+     * still need to disambiguate among. `table_.cascade_steps[m_top]`
+     * (a constexpr-built decision list, derived from `bin_subsets`)
+     * disambiguates among them with at most a couple of secondary
+     * `fits` checks.
+     *
+     * Worst case: `MANTISSAS_PER_EXP + MAX_CASCADE_STEPS - 1` fit
+     * checks — the inner loop's last entry is the NO_TEST default and
+     * returns without calling `fits`. Typical: 1-2 at the natural
+     * exponent and 1 at the fallback exponent.
+     */
+    SNMALLOC_FAST_PATH static size_t
+    bin_offset_at(size_t addr_chunks, size_t n_chunks, size_t e)
+    {
+      // Raw size-class id range [first, past) belonging to exponent `e`.
+      size_t first = table_.exp_first_sc[e];
+      size_t past = table_.exp_first_sc[e + 1];
+
+      // fits(m): can the block hold one aligned object of the size class
+      // with mantissa `m` at this exponent?
+      auto fits = [&](size_t m) SNMALLOC_FAST_PATH_LAMBDA -> bool {
+        // Safety: mantissa m may not exist at this exponent (low
+        // regime -- exponents 0..B-1 have fewer than 2^B mantissas;
+        // for any B the very first exponent has only 1). Without this
+        // check we would index past `past` into the carve_info table.
+        if (first + m >= past)
+          return false;
+        const carve_info_t& ci = table_.carve_info[first + m];
+        // Optimisation: near the bottom of n_chunks's exponent range
+        // the higher-mantissa sizes already exceed n_chunks and cannot
+        // fit regardless of alignment. Skips the align_up below.
+        if (n_chunks < ci.size_chunks)
+          return false;
+        // `pad` is the number of chunks skipped to reach the first
+        // suitably aligned address; the object fits iff what remains
+        // after the object itself covers that padding.
+        size_t pad = bits::align_up(addr_chunks, ci.align_chunks) - addr_chunks;
+        return n_chunks - ci.size_chunks >= pad;
+      };
+
+      // Counts m_top down from MANTISSAS_PER_EXP - 1 to 0; testing
+      // `m_top-- > 0` before the body avoids unsigned wrap-around.
+      for (size_t m_top = MANTISSAS_PER_EXP; m_top-- > 0;)
+      {
+        if (fits(m_top))
+        {
+          // Walk this m_top's decision list. The list always ends with
+          // a NO_TEST entry that acts as the default, so the loop is
+          // guaranteed to return.
+          for (size_t j = 0; j < MAX_CASCADE_STEPS; j++)
+          {
+            const CascadeStep& step = table_.cascade_steps[m_top][j];
+            if (step.m_test == NO_TEST || fits(step.m_test))
+              return step.bin;
+          }
+          SNMALLOC_ASSERT(false); // unreachable per the invariant above
+        }
+      }
+      return BINS_PER_EXP;
+    }
+
+    /**
+     * Constexpr-populated rodata tables.
+     *
+     * `bitmap_info[sc]` is the bitmap-scan record for each in-range
+     * sc (consumed by `Bitmap::find_for_request`).
+     * `carve_info[sc]` is the size/alignment record for each in-range
+     * sc (consumed by `carve` and by `bin_offset_at`'s `fits`
+     * predicate during free-side classification).
+     * `exp_first_sc[e]` is the first raw sc id at ArenaBins
+     * exponent e (with `exp_first_sc[bits::BITS] = MAX_SC` as a sentinel
+     * so `[exp_first_sc[e], exp_first_sc[e + 1])` is a valid raw range
+     * for every `e < bits::BITS`).
+     * `exp_bin_base[e]` is `e * BINS_PER_EXP`, precomputed so the
+     * `bin_index` fast path never performs a runtime multiply.
+     * `cascade_steps[m_top]` is the decision list `bin_offset_at` walks
+     * once it knows `m_top` is the largest fitting mantissa at the
+     * current exponent. The list always ends with a NO_TEST entry that
+     * acts as the default.
+     */
+    struct BinTable
+    {
+      ModArray bitmap_info{};
+      ModArray carve_info{};
+      ModArray exp_first_sc{};
+      ModArray exp_bin_base{};
+      ModArray>
+        cascade_steps{};
+
+      /// Fills every table at compile time from the size-class encoding
+      /// and `bin_subsets`; see the per-section comments below.
+      constexpr BinTable()
+      {
+        // Boundary tables: keep all (e -> raw sc range) and (e -> bin id
+        // base) knowledge in two small ROM arrays. `to_exp_mant_const` is
+        // the only place that knows the size class encoding; once we've
+        // pinned down the raw boundaries, everything else is table lookup.
+        //
+        // Note: `exp_first_sc` does NOT have a uniform stride. At the
+        // bottom of the encoding the low regime (no leading-1 bit; the
+        // `b = (e == 0) ? 0 : 1` branch in `to_exp_mant_const`) squashes
+        // multiple ArenaBins exponents into encoded-exponent 0.
+        // For `B = 2` the counts are 1, 2, 4, 4, 4, ...
+        for (size_t e = 0; e < bits::BITS; e++)
+        {
+          exp_first_sc[e] = bits::to_exp_mant_const(size_t(1) << e);
+          exp_bin_base[e] = e * BINS_PER_EXP;
+        }
+        // Sentinel entries one past the last exponent, so `[e]` / `[e + 1]`
+        // pairs are valid for every `e < bits::BITS`.
+        exp_first_sc[bits::BITS] = MAX_SC;
+        exp_bin_base[bits::BITS] = bits::BITS * BINS_PER_EXP;
+
+        // Per-sc records. Size and alignment come straight from the
+        // size-class scheme (via from_exp_mant); start_word, first_mask,
+        // second_mask are derived from bin_subsets via the constexpr
+        // helpers above, pre-shifted into the bitmap's word layout so
+        // the search hot path is two ANDs.
+        for (size_t sc = 0; sc < MAX_SC; sc++)
+        {
+          size_t size = bits::from_exp_mant(sc);
+          size_t e = bits::prev_pow2_bits_const(size);
+          size_t m = sc - exp_first_sc[e];
+          size_t start_bit = exp_bin_base[e] + start_bin_offset_for_m(m);
+          size_t mask = serve_mask_for_m(m);
+          // Bit position of `start_bit` inside its bitmap word.
+          size_t shift = start_bit & (bits::BITS - 1);
+          carve_info[sc].size_chunks = size;
+          // `size & (~size + 1)` isolates the lowest set bit of `size`,
+          // i.e. the largest power of two that divides it.
+          carve_info[sc].align_chunks = size & (~size + 1);
+          bitmap_info[sc].start_word = start_bit / bits::BITS;
+          bitmap_info[sc].first_mask = mask << shift;
+          // shift == 0: no within-exponent carry; the second word is
+          // entirely higher-exponent.
+          // shift > 0: the low `shift` bits
+          // receive the top of mask (within-exp carry plus its all-1s
+          // tail), and bits [shift, BITS) are higher-exp and always
+          // serve.
+          bitmap_info[sc].second_mask = (shift == 0) ?
+            ~size_t(0) :
+            ((mask >> (bits::BITS - shift)) | (~size_t(0) << shift));
+        }
+
+        // cascade_steps: for each m_top, build a decision list of
+        // (m_test, bin) pairs derived from bin_subsets. Candidates are
+        // bins whose subset has m_top as max element; sort descending
+        // by subset size. The strict-chain invariant on `bin_subsets`
+        // (see its doc comment) guarantees each non-default
+        // candidate's subset properly contains the next candidate's,
+        // so the discriminator for candidate `i` is one of the
+        // mantissas in `bin_subsets[b_i] & ~bin_subsets[b_{i+1}]`.
+        for (size_t m_top = 0; m_top < MANTISSAS_PER_EXP; m_top++)
+        {
+          ModArray candidates{};
+          size_t n_cand = 0;
+          for (size_t b = 0; b < BINS_PER_EXP; b++)
+          {
+            // bin_subsets[b] >> m_top == 1 <=> bit m_top set and no
+            // higher bit set <=> max element of subset is m_top.
+            if ((bin_subsets[b] >> m_top) == 1)
+            {
+              candidates[n_cand] = b;
+              n_cand++;
+            }
+          }
+          // Insertion sort, descending by popcount of subset.
+          for (size_t i = 1; i < n_cand; i++)
+          {
+            size_t b = candidates[i];
+            size_t pcb = popcount_const(bin_subsets[b]);
+            size_t j = i;
+            // Shift smaller-subset candidates right until `b`'s slot is
+            // found. The comparison is strict, so candidates with equal
+            // popcount keep their relative order.
+            while (j > 0 &&
+                   popcount_const(bin_subsets[candidates[j - 1]]) < pcb)
+            {
+              candidates[j] = candidates[j - 1];
+              j--;
+            }
+            candidates[j] = b;
+          }
+          // Non-default candidates: pick a discriminating mantissa.
+          // Under the strict-chain invariant on `bin_subsets`, each
+          // candidate's subset properly contains the next candidate's,
+          // so `bin_subsets[b] & ~bin_subsets[b_next]` is the
+          // (non-empty) set of mantissas unique to this candidate.
+ for (size_t i = 0; i + 1 < n_cand; i++) + { + size_t b = candidates[i]; + size_t b_next = candidates[i + 1]; + size_t discrim_set = bin_subsets[b] & ~bin_subsets[b_next]; + // If this fires, `bin_subsets` violates the strict-chain + // invariant: candidate `b`'s subset does not properly + // contain candidate `b_next`'s, so the cascade can't be + // expressed as single-mantissa probes. `throw` makes the + // constexpr evaluation non-constant and surfaces the + // violation as a compile error. + if (discrim_set == 0) + throw "bin_subsets violates strict-chain invariant"; + cascade_steps[m_top][i].m_test = bits::ctz_const(discrim_set); + cascade_steps[m_top][i].bin = b; + } + // Default (last) candidate. + cascade_steps[m_top][n_cand - 1].m_test = NO_TEST; + cascade_steps[m_top][n_cand - 1].bin = candidates[n_cand - 1]; + } + } + }; + + static constexpr BinTable table_{}; + }; +} // namespace snmalloc diff --git a/src/snmalloc/ds_core/bits.h b/src/snmalloc/ds_core/bits.h index 3391e70f7..57a5a0e73 100644 --- a/src/snmalloc/ds_core/bits.h +++ b/src/snmalloc/ds_core/bits.h @@ -288,6 +288,21 @@ namespace snmalloc return BITS - clz_const(x - 1); } + /** + * Returns `floor(log2(x))`, i.e. the bit index of the highest set bit + * of `x`. Correct for `x >= 1`; calling with `x == 0` is UB (it would + * call `clz(0)`, whose precondition is `x != 0`). + */ + inline SNMALLOC_FAST_PATH size_t prev_pow2_bits(size_t x) + { + return BITS - 1 - clz(x); + } + + constexpr size_t prev_pow2_bits_const(size_t x) + { + return BITS - 1 - clz_const(x); + } + constexpr SNMALLOC_FAST_PATH size_t align_down(size_t value, size_t alignment) { @@ -352,6 +367,35 @@ namespace snmalloc return (e << MANTISSA_BITS) + m; } + /** + * Runtime counterpart of `to_exp_mant_const`. Identical semantics, but + * uses the `clz` intrinsic instead of the 64-iteration `clz_const` + * loop, which makes it suitable for the allocation fast path. 
+     *
+     * Requires `MANTISSA_BITS + LOW_BITS > 0` so that `value | LEADING_BIT`
+     * is never zero, satisfying `clz`'s precondition.
+     */
+    template
+    inline SNMALLOC_FAST_PATH size_t to_exp_mant(size_t value)
+    {
+      static_assert(
+        MANTISSA_BITS + LOW_BITS > 0,
+        "to_exp_mant requires MANTISSA_BITS + LOW_BITS > 0 so that "
+        "value | LEADING_BIT is non-zero (clz precondition)");
+
+      constexpr size_t LEADING_BIT = one_at_bit(MANTISSA_BITS + LOW_BITS) >> 1;
+      constexpr size_t MANTISSA_MASK = mask_bits(MANTISSA_BITS);
+
+      value = value - 1;
+
+      size_t e =
+        bits::BITS - MANTISSA_BITS - LOW_BITS - clz(value | LEADING_BIT);
+      size_t b = (e == 0) ? 0 : 1; // low regime (e == 0): no leading-1 bit
+      size_t m = (value >> (LOW_BITS + e - b)) & MANTISSA_MASK;
+
+      return (e << MANTISSA_BITS) + m; // exponent above, mantissa below
+    }
+
     template
     constexpr size_t from_exp_mant(size_t m_e)
     {
diff --git a/src/test/func/arenabins/arenabins.cc b/src/test/func/arenabins/arenabins.cc
new file mode 100644
index 000000000..c432048b9
--- /dev/null
+++ b/src/test/func/arenabins/arenabins.cc
@@ -0,0 +1,1220 @@
+/**
+ * Unit tests for ArenaBins.
+ *
+ * Exercises:
+ *   - the chunk size class encoding (via `ArenaBinsTestAccess`),
+ *   - the private bin classification (`bin_index`),
+ *   - the narrow public surface: `Bitmap::add` / `find_for_request` /
+ *     `clear`, and the pure `carve(range_t, n)` decomposition.
+ *
+ * Strategy: brute force. For each (addr_chunks, n_chunks) on a small grid
+ * we directly check whether a block can serve every candidate size class
+ * (by finding an aligned sub-range that fits via `can_serve`, and
+ * consulting the canonical `bin_subsets` table via `serves`), and
+ * compare against what `bin_index` predicts. Bitmap behaviour is
+ * cross-checked against a slow reference scanner that formulates
+ * "bin b serves request n" directly in terms of the canonical
+ * `bin_subsets` table; raw word access for tests goes through
+ * `ArenaBinsTestAccess::raw_*`.
+ */
+
+#include "test/setup.h"
+#include "test/snmalloc_testlib.h"
+
+// NOTE(review): the five system header names below are missing in this
+// copy of the patch -- restore them from the original before use.
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace snmalloc
+{
+  /**
+   * Friend struct exposing private internals of `ArenaBins`
+   * (and its nested `Bitmap`) for unit tests. Forward-declared in
+   * `arenabins.h`; defined here so the production header
+   * carries no test-only surface.
+   */
+  template
+  struct ArenaBinsTestAccess
+  {
+    using Bins = ArenaBins;
+
+    using Bitmap = typename Bins::Bitmap;
+    using range_t = typename Bins::range_t;
+    using carve_t = typename Bins::carve_t;
+    using bitmap_info_t = typename Bins::bitmap_info_t;
+    using carve_info_t = typename Bins::carve_info_t;
+
+    static constexpr size_t B = Bins::B;
+    static constexpr size_t MANTISSAS_PER_EXP = Bins::MANTISSAS_PER_EXP;
+    static constexpr size_t BINS_PER_EXP = Bins::BINS_PER_EXP;
+    static constexpr size_t MAX_SC = Bins::MAX_SC;
+
+    // --- Thin forwarders to the corresponding `Bins` members ---
+
+    SNMALLOC_FAST_PATH static carve_t carve(range_t block, size_t n)
+    {
+      return Bins::carve(block, n);
+    }
+
+    SNMALLOC_FAST_PATH static const bitmap_info_t&
+    bitmap_info_for_request(size_t n)
+    {
+      return Bins::bitmap_info_for_request(n);
+    }
+
+    SNMALLOC_FAST_PATH static const carve_info_t&
+    carve_info_for_request(size_t n)
+    {
+      return Bins::carve_info_for_request(n);
+    }
+
+    SNMALLOC_FAST_PATH static size_t bin_index(range_t block)
+    {
+      return Bins::bin_index(block);
+    }
+
+    static constexpr size_t max_supported_chunks()
+    {
+      return Bins::max_supported_chunks();
+    }
+
+    // --- Raw size-class id access ---
+    //
+    // The bin scheme assigns a dense raw id in `[0, MAX_SC)` to each
+    // size class. Production code never names these (the fast path
+    // goes straight from request size to the bitmap-scan / carve
+    // record). Tests cross-check the encoding via the helpers below;
+    // the alias `chunk_sc_t = size_t` preserves the existing test
+    // naming.
+
+    using chunk_sc_t = size_t;
+
+    /// Raw id of the smallest size class >= n_chunks.
+    /// Asserts `1 <= n <= max_supported_chunks()`.
+    SNMALLOC_FAST_PATH static chunk_sc_t request(size_t n)
+    {
+      SNMALLOC_ASSERT(n >= 1);
+      SNMALLOC_ASSERT(n <= Bins::max_supported_chunks());
+      return bits::to_exp_mant(n);
+    }
+
+    /// Size, in chunks, recorded for raw size-class id `sc`.
+    static constexpr size_t size_chunks(chunk_sc_t sc)
+    {
+      return Bins::table_.carve_info[sc].size_chunks;
+    }
+
+    /// Alignment, in chunks, recorded for raw size-class id `sc`.
+    static constexpr size_t align_chunks(chunk_sc_t sc)
+    {
+      return Bins::table_.carve_info[sc].align_chunks;
+    }
+
+    /// Bitmap-scan record for raw id `sc`; asserts `sc < MAX_SC`.
+    SNMALLOC_FAST_PATH static const bitmap_info_t& bitmap_info(chunk_sc_t sc)
+    {
+      SNMALLOC_ASSERT(sc < Bins::MAX_SC);
+      return Bins::table_.bitmap_info[sc];
+    }
+
+    /// Size/alignment record for raw id `sc`; asserts `sc < MAX_SC`.
+    SNMALLOC_FAST_PATH static const carve_info_t& carve_info(chunk_sc_t sc)
+    {
+      SNMALLOC_ASSERT(sc < Bins::MAX_SC);
+      return Bins::table_.carve_info[sc];
+    }
+
+    /// `bitmap_info_for_request`, constexpr (uses `to_exp_mant_const`).
+    /// Only used in `static_assert`s.
+    static constexpr const bitmap_info_t&
+    bitmap_info_for_request_const(size_t n)
+    {
+      return Bins::table_
+        .bitmap_info[bits::to_exp_mant_const(n)];
+    }
+
+    /// `carve_info_for_request`, constexpr (uses `to_exp_mant_const`).
+    /// Only used in `static_assert`s.
+    static constexpr const carve_info_t& carve_info_for_request_const(size_t n)
+    {
+      return Bins::table_
+        .carve_info[bits::to_exp_mant_const(n)];
+    }
+
+    // The canonical source of truth for what each within-exponent bin
+    // offset can serve. Tests express the conceptual "bin b serves
+    // request n" predicate directly in terms of this table so they do
+    // not depend on the bitmap's pre-shifted layout.
+    static constexpr const auto& bin_subsets = Bins::bin_subsets;
+
+    // --- Bitmap raw-word access ---
+    //
+    // The public Bitmap API is narrow (add / find_for_request / clear).
+    // Tests need to:
+    //   - set up arbitrary bitmap states (single bit, exhaustive patterns)
+    //     without going through `add` (which classifies a (base, size)
+    //     range and so is constrained by what classifications exist).
+    //   - inspect bitmap state after operations (test "exactly this bit is
+    //     set" and "no other bit changed").
+    // These accessors expose the raw word storage to do that.
+
+    static constexpr size_t NUM_BITMAP_WORDS = Bitmap::NUM_BITMAP_WORDS;
+
+    /// Set bit `bin_id` directly in the bitmap, bypassing
+    /// classification. For exhaustive bit-pattern tests.
+    static void raw_set(Bitmap& b, size_t bin_id)
+    {
+      SNMALLOC_ASSERT(bin_id < Bitmap::TOTAL_BINS);
+      // Word index is `bin_id / BITS`; bit within the word is
+      // `bin_id & (BITS - 1)`.
+      b.words_[bin_id / bits::BITS] |=
+        (size_t(1) << (bin_id & (bits::BITS - 1)));
+    }
+
+    /// Test whether bit `bin_id` is set in the bitmap.
+    static bool raw_has(const Bitmap& b, size_t bin_id)
+    {
+      SNMALLOC_ASSERT(bin_id < Bitmap::TOTAL_BINS);
+      return (b.words_[bin_id / bits::BITS] >> (bin_id & (bits::BITS - 1))) &
+        size_t(1);
+    }
+
+    /// Whether the bitmap has no bits set.
+    static bool raw_empty(const Bitmap& b)
+    {
+      for (size_t i = 0; i < Bitmap::NUM_BITMAP_WORDS; i++)
+        if (b.words_[i] != 0)
+          return false;
+      return true;
+    }
+
+    /// Read a raw word of the bitmap; for assertions like "only this
+    /// word is non-zero" or "the words round-trip exactly".
+    static size_t raw_word(const Bitmap& b, size_t word_idx)
+    {
+      SNMALLOC_ASSERT(word_idx < Bitmap::NUM_BITMAP_WORDS);
+      return b.words_[word_idx];
+    }
+  };
+} // namespace snmalloc
+
+using snmalloc::ArenaBinsTestAccess;
+
+// Compile-time checks: a few size-class encoding properties that we want
+// to fail the build (not the runtime) if regressed.
+namespace static_checks
+{
+  // One test-access alias per supported B.
+  using B1 = ArenaBinsTestAccess<1>;
+  using B2 = ArenaBinsTestAccess<2>;
+  using B3 = ArenaBinsTestAccess<3>;
+
+  // Bin counts per exponent for each B.
+  static_assert(B1::BINS_PER_EXP == 2, "B=1 BINS_PER_EXP");
+  static_assert(B2::BINS_PER_EXP == 5, "B=2 BINS_PER_EXP");
+  static_assert(B3::BINS_PER_EXP == 13, "B=3 BINS_PER_EXP");
+
+  // MAX_SC == ((BITS - B) << B) + ((1 << B) - 1) for each B.
+  static_assert(
+    B1::MAX_SC == ((snmalloc::bits::BITS - 1) << 1) + ((1 << 1) - 1),
+    "B=1 MAX_SC");
+  static_assert(
+    B2::MAX_SC == ((snmalloc::bits::BITS - 2) << 2) + ((1 << 2) - 1),
+    "B=2 MAX_SC");
+  static_assert(
+    B3::MAX_SC == ((snmalloc::bits::BITS - 3) << 3) + ((1 << 3) - 1),
+    "B=3 MAX_SC");
+
+  // Sizes that are powers of two have align == size.
+  static_assert(
+    B2::carve_info_for_request_const(4).align_chunks == 4, "size 4 align");
+  static_assert(
+    B3::carve_info_for_request_const(8).align_chunks == 8, "size 8 align");
+
+  // size_chunks at request(s) must be >= s: requests that are not
+  // themselves class sizes round up to the next class.
+  static_assert(
+    B2::carve_info_for_request_const(9).size_chunks == 10, "B=2 round-up");
+  static_assert(
+    B3::carve_info_for_request_const(17).size_chunks == 18, "B=3 round-up");
+} // namespace static_checks
+
+namespace
+{
+  /// Conceptual predicate, expressed directly in terms of the canonical
+  /// `bin_subsets` table (the single source of truth for the bin
+  /// scheme). Bin `b` serves a request of size `n` iff `b`'s exponent
+  /// strictly exceeds `n`'s (any higher-exponent block is big enough),
+  /// or they share an exponent and `b`'s within-exponent subset
+  /// includes `n`'s mantissa.
+  ///
+  /// This is the reference both for what `find_for_request` must
+  /// return and for what `bin_index` must classify into.
+  template
+  constexpr bool serves(size_t bin, size_t n)
+  {
+    using Bins = ArenaBinsTestAccess;
+    // Split the absolute bin id into (exponent, within-exponent offset).
+    size_t e_b = bin / Bins::BINS_PER_EXP;
+    size_t o_b = bin % Bins::BINS_PER_EXP;
+    // Round the request up to its size class, then take that class
+    // size's exponent.
+    size_t raw = snmalloc::bits::to_exp_mant_const(n);
+    size_t size_n = snmalloc::bits::from_exp_mant(raw);
+    size_t e_n = snmalloc::bits::prev_pow2_bits_const(size_n);
+    if (e_b < e_n)
+      return false;
+    if (e_b > e_n)
+      return true;
+    // Same exponent: the mantissa is the raw id's distance from the
+    // first raw id of that exponent.
+    size_t exp_first =
+      snmalloc::bits::to_exp_mant_const(size_t(1) << e_n);
+    size_t m_n = raw - exp_first;
+    return ((Bins::bin_subsets[o_b] >> m_n) & size_t(1)) != 0;
+  }
+
+  /// Return true iff a block of `n` chunks starting at chunk-aligned address
+  /// `addr` can serve a size class of size `s` chunks with natural alignment
+  /// `a` chunks. Brute-force search for an aligned sub-range that fits.
+  /// The mask arithmetic below presumes `a` is a power of two.
+  bool can_serve(size_t addr, size_t n, size_t s, size_t a)
+  {
+    if (s == 0 || s > n)
+      return false;
+    // Find first a-aligned address in [addr, addr + n - s].
+    size_t mod = addr & (a - 1);
+    size_t first = (mod == 0) ? addr : (addr + (a - mod));
+    return first + s <= addr + n;
+  }
+
+  /// Check that `request` rounds every size in [1, 4096] to the smallest
+  /// class that holds it; aborts with a message on the first violation.
+  template
+  void check_chunk_sc_roundtrip()
+  {
+    using Bins = ArenaBinsTestAccess;
+
+    // Properties (together these imply request is the smallest size class
+    // with size >= s):
+    //   1. size_chunks(request(s)) >= s for all s >= 1.
+    //   2. Idempotence: request(size_chunks(sc)) == sc.
+    //   3. Monotonicity: s1 <= s2 implies request(s1) <= request(s2).
+    auto prev_sc = Bins::request(1);
+    for (size_t s = 1; s <= 4096; s++)
+    {
+      auto sc = Bins::request(s);
+      size_t cs = Bins::size_chunks(sc);
+      if (cs < s)
+      {
+        std::printf(
+          "B=%zu request(%zu) gave class with size %zu < %zu\n", B, s, cs, s);
+        std::abort();
+      }
+      if (Bins::request(cs) != sc)
+      {
+        std::printf("B=%zu request(size_chunks(sc))!=sc for cs=%zu\n", B, cs);
+        std::abort();
+      }
+      if (sc < prev_sc)
+      {
+        std::printf("B=%zu request not monotone at s=%zu\n", B, s);
+        std::abort();
+      }
+      prev_sc = sc;
+    }
+  }
+
+  /// Check that, for every request size in [1, 4096], the class alignment
+  /// is the largest power of two dividing the class size.
+  template
+  void check_align_chunks()
+  {
+    using Bins = ArenaBinsTestAccess;
+
+    for (size_t s = 1; s <= 4096; s++)
+    {
+      auto sc = Bins::request(s);
+      size_t cs = Bins::size_chunks(sc);
+      size_t a = Bins::align_chunks(sc);
+      // a must be a power of two.
+      if (a == 0 || (a & (a - 1)) != 0)
+      {
+        std::printf("B=%zu size %zu: align_chunks %zu not pow2\n", B, cs, a);
+        std::abort();
+      }
+      // a must divide cs.
+      if (cs % a != 0)
+      {
+        std::printf(
+          "B=%zu size %zu: align_chunks %zu does not divide size\n", B, cs, a);
+        std::abort();
+      }
+      // a should be the LARGEST power of two dividing cs.
+      // (`(a << 1) != 0` skips the case where doubling `a` overflows.)
+      if ((a << 1) != 0 && cs % (a << 1) == 0)
+      {
+        std::printf(
+          "B=%zu size %zu: align_chunks %zu not the largest pow2 divisor\n",
+          B,
+          cs,
+          a);
+        std::abort();
+      }
+    }
+  }
+
+  /// Collect all chunk_sc_t classes whose size fits in the test grid.
+  /// Returns the raw ids, in ascending size order, of every class whose
+  /// size is at most `max_size`.
+  template
+  std::vector::chunk_sc_t>
+  collect_classes(size_t max_size)
+  {
+    using Bins = ArenaBinsTestAccess;
+    using sc_t = typename Bins::chunk_sc_t;
+
+    std::vector v;
+    sc_t prev{};
+    bool have_prev = false;
+    for (size_t s = 1; s <= max_size; s++)
+    {
+      sc_t sc = Bins::request(s);
+      if (Bins::size_chunks(sc) != s)
+        continue; // s is not a class size
+      if (!have_prev || sc != prev)
+      {
+        v.push_back(sc);
+        prev = sc;
+        have_prev = true;
+      }
+    }
+    return v;
+  }
+
+  /// For every block `(addr, n)` on the grid and every collected size
+  /// class, check that what `bin_index` implies via `serves` matches the
+  /// brute-force `can_serve` answer; aborts on the first mismatch.
+  template
+  void check_bin_classification(size_t max_addr, size_t max_n)
+  {
+    using Bins = ArenaBinsTestAccess;
+    auto classes = collect_classes(max_n);
+
+    for (size_t addr = 0; addr < max_addr; addr++)
+    {
+      for (size_t n = 1; n <= max_n; n++)
+      {
+        size_t bin = Bins::bin_index({addr, n});
+
+        for (auto sc : classes)
+        {
+          size_t s = Bins::size_chunks(sc);
+          size_t a = Bins::align_chunks(sc);
+          bool actually = can_serve(addr, n, s, a);
+          bool predicted = serves(bin, s);
+
+          if (predicted != actually)
+          {
+            std::printf(
+              "B=%zu addr=%zu n=%zu bin=%zu sc.size=%zu sc.align=%zu: "
+              "predicted=%d actually=%d\n",
+              B,
+              addr,
+              n,
+              bin,
+              s,
+              a,
+              (int)predicted,
+              (int)actually);
+            std::abort();
+          }
+        }
+      }
+    }
+  }
+
+  /// Check the upper bound on `bin_index`: a block of `n` chunks has
+  /// natural exponent `e = floor(log2(n))` and must classify into a bin
+  /// id strictly below `BINS_PER_EXP * (e + 1)`.
+  template
+  void check_bin_id_range()
+  {
+    using Bins = ArenaBinsTestAccess;
+
+    // bin_index always returns a value in [0, BINS_PER_EXP * (e+1)) for the
+    // block's natural exponent e.
+    for (size_t addr = 0; addr < 32; addr++)
+    {
+      for (size_t n = 1; n <= 64; n++)
+      {
+        size_t bin = Bins::bin_index({addr, n});
+        // The previous form compared `bin % BINS_PER_EXP` against
+        // `BINS_PER_EXP`, which can never fail; compare the absolute
+        // bin id against the bound for the natural exponent instead.
+        size_t e = snmalloc::bits::prev_pow2_bits(n);
+        size_t limit = Bins::BINS_PER_EXP * (e + 1);
+        if (bin >= limit)
+        {
+          std::printf(
+            "B=%zu addr=%zu n=%zu bin=%zu: bin id >= limit %zu for natural "
+            "exponent %zu\n",
+            B,
+            addr,
+            n,
+            bin,
+            limit,
+            e);
+          std::abort();
+        }
+      }
+    }
+  }
+
+  /// Verify that `*_info_for_request(n)` agrees with the per-sc
+  /// accessors for every n in a range.
+  template
+  void check_info_consistency()
+  {
+    using Bins = ArenaBinsTestAccess;
+
+    for (size_t s = 1; s <= 4096; s++)
+    {
+      auto sc = Bins::request(s);
+
+      // carve_info_for_request(s) must match the per-sc accessors and
+      // must alias the carve_info(request(s)) record (single table
+      // indirection, no copy).
+      const auto& ci = Bins::carve_info_for_request(s);
+      if (ci.size_chunks != Bins::size_chunks(sc))
+      {
+        std::printf(
+          "B=%zu carve_info_for_request(%zu).size_chunks mismatch\n", B, s);
+        std::abort();
+      }
+      if (ci.align_chunks != Bins::align_chunks(sc))
+      {
+        std::printf(
+          "B=%zu carve_info_for_request(%zu).align_chunks mismatch\n", B, s);
+        std::abort();
+      }
+      // Address comparison: both accessors must return the same record.
+      if (&ci != &Bins::carve_info(sc))
+      {
+        std::printf(
+          "B=%zu carve_info_for_request(%zu) and carve_info(request) "
+          "point at different records\n",
+          B,
+          s);
+        std::abort();
+      }
+
+      // bitmap_info_for_request(s) must alias bitmap_info(request(s)).
+      const auto& bi = Bins::bitmap_info_for_request(s);
+      if (&bi != &Bins::bitmap_info(sc))
+      {
+        std::printf(
+          "B=%zu bitmap_info_for_request(%zu) and bitmap_info(request) "
+          "point at different records\n",
+          B,
+          s);
+        std::abort();
+      }
+    }
+  }
+
+  /// to_exp_mant runtime / _const equivalence across a representative
+  /// range of values, including edges near max_supported_chunks. The
+  /// runtime variant uses the intrinsic; we cross-check against the
+  /// constexpr reference that's already exercised at compile time.
+  template
+  void check_to_exp_mant_equivalence()
+  {
+    using Bins = ArenaBinsTestAccess;
+
+    // Abort with a diagnostic if the two variants disagree for `n`.
+    auto check_one = [&](size_t n) {
+      size_t r = snmalloc::bits::to_exp_mant(n);
+      size_t c = snmalloc::bits::to_exp_mant_const(n);
+      if (r != c)
+      {
+        std::printf("B=%zu to_exp_mant(%zu) = %zu, _const = %zu\n", B, n, r, c);
+        std::abort();
+      }
+    };
+
+    // Small values.
+    for (size_t n = 1; n <= 4096; n++)
+      check_one(n);
+
+    // Powers of two and ±1, up to the largest representable.
+    for (size_t e = 0; e < snmalloc::bits::BITS; e++)
+    {
+      size_t pow = size_t(1) << e;
+      if (pow == 0)
+        continue;
+      if (pow >= 1 && pow <= Bins::max_supported_chunks())
+        check_one(pow);
+      if (pow + 1 <= Bins::max_supported_chunks())
+        check_one(pow + 1);
+      // `pow >= 2` keeps `pow - 1` at least 1.
+      if (pow >= 2)
+        check_one(pow - 1);
+    }
+
+    // The upper boundary itself.
+    check_one(Bins::max_supported_chunks());
+    if (Bins::max_supported_chunks() > 1)
+      check_one(Bins::max_supported_chunks() - 1);
+
+    // A handful of stride values across the full range.
+    size_t step = Bins::max_supported_chunks() / 257;
+    if (step == 0)
+      step = 1;
+    // `n > 0` stops the loop if the increment wraps `n` around to zero.
+    for (size_t n = 1; n <= Bins::max_supported_chunks() && n > 0;
+         n += step + 1)
+      check_one(n);
+  }
+
+  /// Reference implementation of find_for_request: brute-force scan
+  /// over every bin id, applying the canonical `serves` predicate
+  /// (defined directly in terms of `bin_subsets`).
+  /// Returns the lowest set bin id that serves `n_chunks`, or SIZE_MAX.
+  template
+  size_t reference_find(
+    size_t n_chunks, const typename ArenaBinsTestAccess::Bitmap& bm)
+  {
+    using Bins = ArenaBinsTestAccess;
+    using Bitmap = typename Bins::Bitmap;
+    for (size_t b = 0; b < Bitmap::TOTAL_BINS; b++)
+    {
+      if (!Bins::raw_has(bm, b))
+        continue;
+      if (serves(b, n_chunks))
+        return b;
+    }
+    return SIZE_MAX;
+  }
+
+  /// Smoke test: a fresh bitmap is empty, `raw_set` / `raw_has` agree on
+  /// the first and last bin, and `clear` on both bins empties it again.
+  template
+  void check_bitmap_smoke()
+  {
+    using Bins = ArenaBinsTestAccess;
+    using Bitmap = typename Bins::Bitmap;
+    Bitmap bm;
+    if (!Bins::raw_empty(bm))
+      std::abort();
+    Bins::raw_set(bm, 0);
+    if (Bins::raw_empty(bm))
+      std::abort();
+    if (!Bins::raw_has(bm, 0))
+      std::abort();
+    if (Bins::raw_has(bm, 1))
+      std::abort();
+    Bins::raw_set(bm, Bitmap::TOTAL_BINS - 1);
+    if (!Bins::raw_has(bm, Bitmap::TOTAL_BINS - 1))
+      std::abort();
+    bm.clear(0);
+    if (Bins::raw_has(bm, 0))
+      std::abort();
+    bm.clear(Bitmap::TOTAL_BINS - 1);
+    if (!Bins::raw_empty(bm))
+      std::abort();
+  }
+
+  /// Iterate over every `chunk_sc_t` raw id in `[0, MAX_SC)`.
+  /// For each one, decode its request size, look up its
+  /// `bitmap_info_t`, and run `body(n_chunks, bitmap_info)`. Multiple
+  /// raw ids can share the same
+  /// `(start_word, first_mask, second_mask)` triple; callers that
+  /// want a unique-deposit view are responsible for deduplicating.
+  template
+  void for_each_class_info(F body)
+  {
+    using Bins = ArenaBinsTestAccess;
+    for (size_t raw = 0; raw < Bins::MAX_SC; raw++)
+    {
+      size_t s = snmalloc::bits::from_exp_mant(raw);
+      const auto& info = Bins::bitmap_info_for_request(s);
+      body(s, info);
+    }
+  }
+
+  /// On an empty bitmap, `find_for_request` must return SIZE_MAX for
+  /// every size class.
+  template
+  void check_bitmap_find_empty()
+  {
+    using Bins = ArenaBinsTestAccess;
+    using Bitmap = typename Bins::Bitmap;
+    Bitmap bm;
+    for_each_class_info([&](size_t n, const auto& /*info*/) {
+      if (bm.find_for_request(n) != SIZE_MAX)
+        std::abort();
+    });
+  }
+
+  /// For each B and each bin id in [0, TOTAL_BINS): set exactly that
+  /// bit, then for every distinct request info cross-check
+  /// find_for_request against the reference scanner.
+  template
+  void check_bitmap_exhaustive_single_bit()
+  {
+    using Bins = ArenaBinsTestAccess;
+    using Bitmap = typename Bins::Bitmap;
+
+    // Gather a representative set of entries (one per distinct bitmap
+    // deposit, i.e. distinct (start_word, first_mask, second_mask)
+    // triple, with a request size that maps to it).
+    struct Entry
+    {
+      size_t n_chunks; // a request size that maps to `info`
+      typename Bins::bitmap_info_t info;
+    };
+
+    std::vector entries;
+    for_each_class_info([&](size_t n, const auto& info) {
+      // Linear de-duplication: skip `info` if an equal triple is
+      // already recorded.
+      for (const auto& e : entries)
+      {
+        if (
+          e.info.start_word == info.start_word &&
+          e.info.first_mask == info.first_mask &&
+          e.info.second_mask == info.second_mask)
+          return;
+      }
+      entries.push_back({n, info});
+    });
+
+    for (size_t bin_id = 0; bin_id < Bitmap::TOTAL_BINS; bin_id++)
+    {
+      Bitmap bm;
+      Bins::raw_set(bm, bin_id);
+      for (const auto& e : entries)
+      {
+        size_t got = bm.find_for_request(e.n_chunks);
+        size_t want = reference_find(e.n_chunks, bm);
+        if (got != want)
+        {
+          std::printf(
+            "B=%zu single-bit: bin=%zu n=%zu: got=%zu want=%zu\n",
+            B,
+            bin_id,
+            e.n_chunks,
+            got,
+            want);
+          std::abort();
+        }
+      }
+    }
+  }
+
+  /// Randomised multi-bit arena states cross-checked against the
+  /// reference scanner.
+  template
+  void check_bitmap_multi_bit_random()
+  {
+    using Bins = ArenaBinsTestAccess;
+    using Bitmap = typename Bins::Bitmap;
+
+    struct Entry
+    {
+      size_t n_chunks; // a request size that maps to `info`
+      typename Bins::bitmap_info_t info;
+    };
+
+    // One entry per distinct (start_word, first_mask, second_mask) triple.
+    std::vector entries;
+    for_each_class_info([&](size_t n, const auto& info) {
+      for (const auto& e : entries)
+      {
+        if (
+          e.info.start_word == info.start_word &&
+          e.info.first_mask == info.first_mask &&
+          e.info.second_mask == info.second_mask)
+          return;
+      }
+      entries.push_back({n, info});
+    });
+
+    // Deterministic xorshift64 PRNG so failures are reproducible.
+    auto xorshift = [](uint64_t& s) -> uint64_t {
+      s ^= s << 13;
+      s ^= s >> 7;
+      s ^= s << 17;
+      return s;
+    };
+
+    // Fixed seed, offset by B so each B gets its own sequence.
+    uint64_t rng_state = 0x9E3779B97F4A7C15ull + B;
+    for (size_t trial = 0; trial < 2000; trial++)
+    {
+      Bitmap bm;
+      // Density varies per trial: choose how many bits to set.
+ size_t target = (size_t)(xorshift(rng_state) % (Bitmap::TOTAL_BINS + 1)); + for (size_t i = 0; i < target; i++) + { + size_t b = (size_t)(xorshift(rng_state) % Bitmap::TOTAL_BINS); + Bins::raw_set(bm, b); + } + for (const auto& e : entries) + { + size_t got = bm.find_for_request(e.n_chunks); + size_t want = reference_find(e.n_chunks, bm); + if (got != want) + { + std::printf( + "B=%zu trial=%zu n=%zu: got=%zu want=%zu\n", + B, + trial, + e.n_chunks, + got, + want); + std::abort(); + } + } + } + } + + /// Targeted word-boundary cases: enumerate real table entries, pick + /// out those whose within-exp range straddles a bitmap word, and + /// drive each through a four-way sub-case grid: + /// (i) bit set in first word's considered region only + /// (ii) bit set as within-exp continuation in second word + /// (iii) bit set as higher-exp candidate in second word + /// (iv) bit set only in word 3 or beyond + template + void check_bitmap_word_boundary() + { + using Bins = ArenaBinsTestAccess; + using Bitmap = typename Bins::Bitmap; + + auto check_predicted = + [&](const Bitmap& bm, size_t n_chunks, const char* label) { + size_t got = bm.find_for_request(n_chunks); + size_t want = reference_find(n_chunks, bm); + if (got != want) + { + std::printf( + "B=%zu word-boundary [%s] n=%zu: got=%zu want=%zu\n", + B, + label, + n_chunks, + got, + want); + std::abort(); + } + }; + + bool found_straddle = false; + bool found_aligned = false; + for (size_t raw = 0; raw < Bins::MAX_SC; raw++) + { + size_t s = snmalloc::bits::from_exp_mant(raw); + const auto& info = Bins::bitmap_info_for_request(s); + // Recover the absolute start bin from the precomputed layout: + // the start bin always serves, so bit 0 of the conceptual + // serve_mask is set, which means `first_mask`'s lowest set bit + // is at position `shift = start_bit & (BITS - 1)`. 
+ size_t shift = snmalloc::bits::ctz(info.first_mask); + size_t start_bit = info.start_word * snmalloc::bits::BITS + shift; + size_t state = start_bit % Bins::BINS_PER_EXP; + size_t r = Bins::BINS_PER_EXP - state; + bool straddles = (shift + r) > snmalloc::bits::BITS; + bool aligned = (shift == 0); + + if (straddles) + found_straddle = true; + if (aligned) + found_aligned = true; + if (!(straddles || aligned)) + continue; + + // (i) Single bit at the very start_bit. + { + Bitmap bm; + Bins::raw_set(bm, start_bit); + check_predicted(bm, s, "case-i-start_bit"); + } + + // (ii) Single bit in the second word's within-exp continuation + // (only meaningful for straddling cases). + if (straddles) + { + size_t carry_bin = start_bit + (snmalloc::bits::BITS - shift); + if (carry_bin < Bitmap::TOTAL_BINS) + { + Bitmap bm; + Bins::raw_set(bm, carry_bin); + check_predicted(bm, s, "case-ii-continuation"); + } + } + + // (iii) Bit in second word's higher-exp region. + { + size_t second_word = info.start_word + 1; + if (second_word < Bins::NUM_BITMAP_WORDS) + { + // Pick a bin that is higher-exponent: at least + // start_bit + BINS_PER_EXP - state (i.e. into next exponent). + size_t higher_bin = start_bit + r; + if (higher_bin < Bitmap::TOTAL_BINS) + { + Bitmap bm; + Bins::raw_set(bm, higher_bin); + check_predicted(bm, s, "case-iii-higher-exp"); + } + } + } + + // (iv) Bit only in word 3 or beyond. + { + size_t target_word = info.start_word + 2; + if (target_word < Bins::NUM_BITMAP_WORDS) + { + size_t target_bin = target_word * snmalloc::bits::BITS; + if (target_bin < Bitmap::TOTAL_BINS) + { + Bitmap bm; + Bins::raw_set(bm, target_bin); + check_predicted(bm, s, "case-iv-later-word"); + } + } + } + } + + // Sanity: for B that actually places entries near word boundaries, + // at least one straddling case must exist on 64-bit. We don't assert + // straddle exists for all B (B=1's bins-per-exp = 2 might not + // straddle on 64-bit), but aligned cases must. 
+ if (!found_aligned) + { + std::printf("B=%zu: no aligned start_bit found!\n", B); + std::abort(); + } + (void)found_straddle; + } + + /// Integration test: set bits by `bin_index(addr, n)`, then probe via + /// `find_for_request(req)`. The bitmap result must equal + /// `bin_index(addr, n)` whenever `can_serve` says the block satisfies + /// the request, and `SIZE_MAX` otherwise. + template + void check_bitmap_bin_index_integration() + { + using Bins = ArenaBinsTestAccess; + using Bitmap = typename Bins::Bitmap; + + auto classes = collect_classes(64); + for (size_t addr = 0; addr < 32; addr++) + { + for (size_t n = 1; n <= 64; n++) + { + Bitmap bm; + size_t bin = Bins::bin_index({addr, n}); + Bins::raw_set(bm, bin); + for (auto sc : classes) + { + size_t s = Bins::size_chunks(sc); + size_t a = Bins::align_chunks(sc); + bool actually = can_serve(addr, n, s, a); + size_t got = bm.find_for_request(s); + size_t want = actually ? bin : size_t(SIZE_MAX); + if (got != want) + { + std::printf( + "B=%zu integration: addr=%zu n=%zu bin=%zu sc.size=%zu " + "sc.align=%zu: got=%zu want=%zu actually=%d\n", + B, + addr, + n, + bin, + s, + a, + got, + want, + (int)actually); + std::abort(); + } + } + } + } + } + + /// Verify that Bitmap::add classifies (base, size) ranges to the same + /// bin id as `bin_index`, sets the corresponding bit, and is + /// idempotent on both the returned id and the underlying word state. 
+  template<size_t B>
+  void check_bitmap_add()
+  {
+    using Bins = ArenaBinsTestAccess<B>;
+    using Bitmap = typename Bins::Bitmap;
+    using range_t = typename Bins::range_t;
+
+    // Sweep every (base, size) pair with base in [0, 32) and size in
+    // [1, 64], each against a fresh, empty bitmap.
+    for (size_t addr = 0; addr < 32; addr++)
+    {
+      for (size_t n = 1; n <= 64; n++)
+      {
+        Bitmap bm;
+        // add() must report the same bin id as the pure classifier.
+        size_t expected = Bins::bin_index({addr, n});
+        size_t got = bm.add(range_t{addr, n});
+        if (got != expected)
+        {
+          std::printf(
+            "B=%zu add: addr=%zu n=%zu got=%zu expected=%zu\n",
+            B,
+            addr,
+            n,
+            got,
+            expected);
+          std::abort();
+        }
+        // The bit for that bin must be observable after the add.
+        if (!Bins::raw_has(bm, expected))
+        {
+          std::printf(
+            "B=%zu add: addr=%zu n=%zu bin %zu not set after add\n",
+            B,
+            addr,
+            n,
+            expected);
+          std::abort();
+        }
+
+        // Snapshot every word, call add again, verify nothing changed
+        // and we get the same id back. Idempotence on state.
+        // NOTE(review): element type assumed to be size_t, matching the
+        // bitmap word type - confirm against Bins::raw_word.
+        std::vector<size_t> snapshot;
+        for (size_t w = 0; w < Bins::NUM_BITMAP_WORDS; w++)
+          snapshot.push_back(Bins::raw_word(bm, w));
+        size_t got2 = bm.add(range_t{addr, n});
+        if (got2 != expected)
+        {
+          std::printf(
+            "B=%zu add idempotent: addr=%zu n=%zu second add returned "
+            "%zu (first returned %zu)\n",
+            B,
+            addr,
+            n,
+            got2,
+            expected);
+          std::abort();
+        }
+        for (size_t w = 0; w < Bins::NUM_BITMAP_WORDS; w++)
+        {
+          if (Bins::raw_word(bm, w) != snapshot[w])
+          {
+            std::printf(
+              "B=%zu add idempotent: addr=%zu n=%zu word %zu changed\n",
+              B,
+              addr,
+              n,
+              w);
+            std::abort();
+          }
+        }
+      }
+    }
+  }
+
+  /// With multiple blocks added, `find_for_request` must return the
+  /// *minimum* bin id whose blocks all serve the request, not just any
+  /// such bin id.
+  template<size_t B>
+  void check_bitmap_find_min()
+  {
+    using Bins = ArenaBinsTestAccess<B>;
+    using Bitmap = typename Bins::Bitmap;
+
+    struct Entry
+    {
+      size_t n_chunks;
+      typename Bins::bitmap_info_t info;
+    };
+
+    // Keep one representative request size per distinct
+    // (start_word, first_mask, second_mask) triple.
+    std::vector<Entry> entries;
+    for_each_class_info<B>([&](size_t n, const auto& info) {
+      for (const auto& e : entries)
+      {
+        if (
+          e.info.start_word == info.start_word &&
+          e.info.first_mask == info.first_mask &&
+          e.info.second_mask == info.second_mask)
+          return;
+      }
+      entries.push_back({n, info});
+    });
+
+    // For each request entry: pick three bin ids that all serve this
+    // request (the start_bit itself; a higher-exp bin; the topmost
+    // bin), set all three, and verify find_for_request returns the
+    // smallest of the three.
+    for (const auto& e : entries)
+    {
+      // Recover the absolute start bin from the precomputed layout.
+      size_t start_bit = e.info.start_word * snmalloc::bits::BITS +
+        snmalloc::bits::ctz(e.info.first_mask);
+      size_t a = start_bit;
+      size_t b =
+        start_bit + (Bins::BINS_PER_EXP - (start_bit % Bins::BINS_PER_EXP));
+      size_t c = Bitmap::TOTAL_BINS - 1;
+      if (a >= Bitmap::TOTAL_BINS)
+        continue;
+      if (b >= Bitmap::TOTAL_BINS)
+        continue;
+      // a < b always holds (b - a is in [1, BINS_PER_EXP]), but b < c only
+      // holds when start_bit sits far enough below the topmost bin; skip
+      // the entries where the three ids are not strictly increasing.
+      if (!(a < b && b < c))
+        continue;
+
+      Bitmap bm;
+      Bins::raw_set(bm, a);
+      Bins::raw_set(bm, b);
+      Bins::raw_set(bm, c);
+      size_t got = bm.find_for_request(e.n_chunks);
+      if (got != a)
+      {
+        std::printf(
+          "B=%zu find_min: n=%zu bits set {%zu,%zu,%zu} "
+          "got=%zu (expected min %zu)\n",
+          B,
+          e.n_chunks,
+          a,
+          b,
+          c,
+          got,
+          a);
+        std::abort();
+      }
+    }
+  }
+
+  /// Verify carve(): pre.base+pre.size == req.base; req.base aligned;
+  /// req.size == sc.size_chunks; post.base == req.end; spans equal.
+ template + void check_carve() + { + using Bins = ArenaBinsTestAccess; + using range_t = typename Bins::range_t; + + auto classes = collect_classes(64); + for (size_t addr = 0; addr < 32; addr++) + { + for (size_t n = 1; n <= 64; n++) + { + for (auto sc : classes) + { + size_t s = Bins::size_chunks(sc); + size_t a = Bins::align_chunks(sc); + if (!can_serve(addr, n, s, a)) + continue; + + auto cv = Bins::carve(range_t{addr, n}, s); + + // pre starts at the block's base. + if (cv.pre.base != addr) + { + std::printf( + "B=%zu carve pre.base != addr (addr=%zu n=%zu s=%zu)\n", + B, + addr, + n, + s); + std::abort(); + } + // pre.end == req.base. + if (cv.pre.base + cv.pre.size != cv.req.base) + { + std::printf("B=%zu carve pre.end != req.base\n", B); + std::abort(); + } + // req aligned. + if ((cv.req.base & (a - 1)) != 0) + { + std::printf( + "B=%zu carve req.base %zu not aligned to %zu\n", + B, + cv.req.base, + a); + std::abort(); + } + // req.size == sc.size_chunks. + if (cv.req.size != s) + { + std::printf( + "B=%zu carve req.size %zu != s %zu\n", B, cv.req.size, s); + std::abort(); + } + // req.end == post.base. + if (cv.req.base + cv.req.size != cv.post.base) + { + std::printf("B=%zu carve req.end != post.base\n", B); + std::abort(); + } + // post.end == block.end. + if (cv.post.base + cv.post.size != addr + n) + { + std::printf("B=%zu carve post.end != block.end\n", B); + std::abort(); + } + // pre.size + req.size + post.size == block.size. 
+ if (cv.pre.size + cv.req.size + cv.post.size != n) + { + std::printf("B=%zu carve sizes don't sum to n\n", B); + std::abort(); + } + } + } + } + } + + template + void run_all() + { + std::printf("--- Running ArenaBinsTestAccess<%zu> tests ---\n", B); + check_chunk_sc_roundtrip(); + std::printf(" chunk_sc_t round-trip: OK\n"); + check_align_chunks(); + std::printf(" align_chunks: OK\n"); + check_to_exp_mant_equivalence(); + std::printf(" to_exp_mant runtime/_const equivalence: OK\n"); + check_info_consistency(); + std::printf(" *_info_for_request consistency: OK\n"); + check_bin_id_range(); + std::printf(" bin_index within-exp range: OK\n"); + check_bin_classification(/*max_addr=*/128, /*max_n=*/64); + std::printf(" bin classification vs bin_subsets predicate: OK\n"); + check_bitmap_smoke(); + std::printf(" Bitmap smoke: OK\n"); + check_bitmap_find_empty(); + std::printf(" Bitmap empty find returns SIZE_MAX: OK\n"); + check_bitmap_exhaustive_single_bit(); + std::printf(" Bitmap exhaustive single-bit find: OK\n"); + check_bitmap_multi_bit_random(); + std::printf(" Bitmap multi-bit random find: OK\n"); + check_bitmap_word_boundary(); + std::printf(" Bitmap word-boundary cases: OK\n"); + check_bitmap_bin_index_integration(); + std::printf(" Bitmap bin_index integration: OK\n"); + check_bitmap_add(); + std::printf(" Bitmap add classify+set+idempotent: OK\n"); + check_bitmap_find_min(); + std::printf(" Bitmap find_for_request returns minimum: OK\n"); + check_carve(); + std::printf(" carve splits aligned/unaligned blocks: OK\n"); + } + + /// A few concrete expected values, derived from the prototype's output, to + /// catch silent breakage of the canonical numbering. + void check_known_values() + { + using B2 = ArenaBinsTestAccess<2>; + + // size 1 -> raw 0, size 2 -> raw 1, size 3 -> raw 2, size 4 -> raw 3, + // size 5 -> raw 4, ..., size 8 -> raw 7, size 10 -> raw 8. 
+ if (B2::size_chunks(B2::request(1)) != 1) + std::abort(); + if (B2::size_chunks(B2::request(8)) != 8) + std::abort(); + if (B2::size_chunks(B2::request(9)) != 10) + std::abort(); + if (B2::size_chunks(B2::request(11)) != 12) + std::abort(); + + // align_chunks: size 4 -> 4, size 5 -> 1, size 6 -> 2, size 8 -> 8, + // size 10 -> 2, size 12 -> 4, size 14 -> 2. + if (B2::align_chunks(B2::request(4)) != 4) + std::abort(); + if (B2::align_chunks(B2::request(5)) != 1) + std::abort(); + if (B2::align_chunks(B2::request(6)) != 2) + std::abort(); + if (B2::align_chunks(B2::request(8)) != 8) + std::abort(); + if (B2::align_chunks(B2::request(10)) != 2) + std::abort(); + + // BINS_PER_EXP must be 5 for B=2. + if (B2::BINS_PER_EXP != 5) + std::abort(); + + using B3 = ArenaBinsTestAccess<3>; + + if (B3::BINS_PER_EXP != 13) + std::abort(); + + using B1 = ArenaBinsTestAccess<1>; + if (B1::BINS_PER_EXP != 2) + std::abort(); + } +} // namespace + +int main(int, char**) +{ + setup(); + + check_known_values(); + std::printf("Known concrete values: OK\n"); + + run_all<1>(); + run_all<2>(); + run_all<3>(); + + std::printf("All ArenaBins tests passed.\n"); + return 0; +} From e8389c18e0e8eed7b6f7245e19efe0e14763b8f3 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Wed, 20 May 2026 13:20:28 +0100 Subject: [PATCH 04/15] Add RBTree::neighbours probe helper Adds a public RBTree method that returns the strict neighbours of a probe value K in a single root-to-leaf descent: - every left turn (parent > K) records the parent as the current successor candidate - every right turn (parent < K) records the parent as the current predecessor candidate At loop exit the tightest neighbours are returned as `stl::Pair{pred, succ}`; either component is `Rep::null` when no such neighbour exists. The "K not in tree" precondition is asserted via SNMALLOC_ASSERT and expands to nothing in Release. Arena, the planned caller, relies on the invariant that two free blocks cannot share a starting address. 
test_neighbours exercises the algorithm against std::set::lower_bound / upper_bound as oracle. Boundary probes (K=0, K=size+1) plus random probes that skip oracle hits keep every call within the precondition. The sweep reuses the existing test()'s size range but caps to the first few seeds per size to keep the per-test time budget in check. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/snmalloc/ds_core/redblacktree.h | 49 ++++++++++- src/test/func/redblack/redblack.cc | 123 ++++++++++++++++++++++++++++ 2 files changed, 171 insertions(+), 1 deletion(-) diff --git a/src/snmalloc/ds_core/redblacktree.h b/src/snmalloc/ds_core/redblacktree.h index e6ce73c24..0eaee84e1 100644 --- a/src/snmalloc/ds_core/redblacktree.h +++ b/src/snmalloc/ds_core/redblacktree.h @@ -1,6 +1,7 @@ #pragma once #include "snmalloc/stl/array.h" +#include "snmalloc/stl/utility.h" #include #include @@ -456,7 +457,7 @@ namespace snmalloc // insufficient to accurately display the tree, but it will still be // memory safe as the search code is bounded by the string size. static constexpr size_t max_depth = 128; - char s_indent[max_depth]; + char s_indent[max_depth] = {}; size_t end = 0; for (; end < max_depth - 1; end++) { @@ -787,6 +788,52 @@ namespace snmalloc return true; } + /** + * Return the strict neighbours of `value` in the tree: + * `(largest key < value, smallest key > value)`. Either component is + * `Rep::null` when no such neighbour exists. + * + * **Precondition**: `value` is not present in the tree. A single + * root-to-leaf descent then records both neighbours: every left + * turn (parent key > value) updates the successor candidate to the + * parent's key, every right turn updates the predecessor candidate. + * `SNMALLOC_CHECK` aborts in any build if a non-null `value` is + * encountered on the descent: a duplicate key would make + * `neighbours` return an arbitrary neighbour pair that the + * caller would consume as valid, corrupting dependent state. 
The + * check uses only one post-descent comparison because a duplicate + * key is always recorded into `pred` on the right-going branch + * (`compare(k, value)` is false when `k == value`). `Rep::null` + * can never be present in the tree, so probing with it is benign + * and exempt from the check. + */ + stl::Pair neighbours(K value) + { + K pred = Rep::null; + K succ = Rep::null; + + ChildRef cur = get_root(); + while (!cur.is_null()) + { + K k = cur; + if (Rep::compare(k, value)) + { + // k > value: go left; k is the tightest successor seen so far. + succ = k; + cur = get_dir(true, k); + } + else + { + pred = k; + cur = get_dir(false, k); + } + } + + SNMALLOC_CHECK(Rep::equal(pred, Rep::null) || !Rep::equal(pred, value)); + + return {pred, succ}; + } + RBPath get_root_path() { return RBPath(H{&root}); diff --git a/src/test/func/redblack/redblack.cc b/src/test/func/redblack/redblack.cc index 61fccb6d3..e47138be4 100644 --- a/src/test/func/redblack/redblack.cc +++ b/src/test/func/redblack/redblack.cc @@ -5,6 +5,7 @@ #include #include +#include #include #ifndef SNMALLOC_TRACING @@ -207,6 +208,122 @@ void test(size_t size, unsigned int seed) } } +template +void test_neighbours(size_t size, unsigned int seed) +{ + xoroshiro::p64r32 rand(seed); + snmalloc::RBTree tree; + std::set oracle; + // Parallel vector keeps random-pick on remove O(1) instead of paying + // O(n) for std::advance over a std::set iterator. 
+ std::vector entries; + + auto probe = [&](Rep::key k_probe) { + auto result = tree.neighbours(k_probe); + + Rep::key expected_pred = Rep::null; + Rep::key expected_succ = Rep::null; + auto it = oracle.lower_bound(k_probe); + if (it != oracle.begin()) + { + auto prev = it; + --prev; + expected_pred = *prev; + } + if (it != oracle.end()) + expected_succ = *it; + + if (result.first != expected_pred || result.second != expected_succ) + { + std::cout << "neighbours(" << k_probe << ") mismatch:" + << " got (" << result.first << ", " << result.second << ")" + << " expected (" << expected_pred << ", " << expected_succ + << ")" << std::endl; + abort(); + } + }; + + auto do_probes = [&]() { + // Boundary probes. Key 0 is Rep::null and is never inserted (insert + // keys are 1 + rand % size), and size + 1 is one above the maximum + // possible insert; both are guaranteed not to be in the tree. + probe(Rep::key(0)); + if (size + 1 <= 0xFFFF) + probe(Rep::key(size + 1)); + // Two random probes, skipping any that collide with the tree. + for (size_t p = 0; p < 2; p++) + { + Rep::key k = Rep::key(rand.next() % (size + 2)); + if (oracle.count(k) == 0) + probe(k); + } + }; + + // Empty tree: every probe must report (null, null). 
+ do_probes(); + + bool first = true; + for (size_t i = 0; i < 20 * size; i++) + { + auto batch = 1 + rand.next() % (3 + (size / 2)); + auto op = rand.next() % 4; + if (op < 2 || first) + { + first = false; + for (auto j = batch; j > 0; j--) + { + auto k = Rep::key(1 + rand.next() % size); + if (tree.insert_elem(k)) + { + oracle.insert(k); + entries.push_back(k); + } + } + } + else if (op == 3) + { + for (auto j = batch; j > 0; j--) + { + if (entries.empty()) + break; + auto index = rand.next() % entries.size(); + Rep::key elem = entries[index]; + if (!tree.remove_elem(elem)) + { + std::cout << "Failed to remove element: " << elem << std::endl; + abort(); + } + entries.erase(entries.begin() + static_cast(index)); + oracle.erase(elem); + } + } + else + { + for (auto j = batch; j > 0; j--) + { + if (entries.empty()) + break; + auto min = tree.remove_min(); + Rep::key expected = *oracle.begin(); + if (min != expected) + { + std::cout << "remove_min mismatch: tree=" << min + << " oracle=" << expected << std::endl; + abort(); + } + oracle.erase(oracle.begin()); + entries.erase( + std::remove(entries.begin(), entries.end(), min), entries.end()); + } + } + + do_probes(); + + if (entries.empty()) + break; + } +} + int main(int argc, char** argv) { setup(); @@ -222,6 +339,11 @@ int main(int argc, char** argv) for (seed = 1; seed < 5 + (8 * size); seed++) { test(size, seed); + // Run the neighbours oracle on a handful of seeds per size: the + // full size range gives good tree-shape coverage, the seed cap + // keeps the extra cost from blowing the per-test time budget. 
+ if (seed < 5) + test_neighbours(size, seed); } return 0; @@ -235,5 +357,6 @@ int main(int argc, char** argv) // Trace particular example test(size, seed); + test_neighbours(size, seed); return 0; } From c1f81522401f6daf2739d881890146f1a873687d Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Thu, 21 May 2026 15:02:02 +0100 Subject: [PATCH 05/15] Add Arena: dual-tree free-range manager with consolidation Introduce Arena, a free-range allocator that manages chunks within a bounded arena using a dual red-black-tree scheme: - Bin trees: one per size-class bin, for best-fit allocation lookups driven by a non-empty-bins bitmap. - Range tree: keyed by address, for O(log n) neighbour lookup during consolidation of adjacent free blocks. Key design decisions: - Single-chunk (min-size) blocks live only in bin tree 0, not the range tree, keeping range-tree overhead proportional to multi-chunk blocks. The min-size bin is probed as a fallback during consolidation. - Three-variant encoding (Min/TwoMin/Large) in pagemap metadata bits avoids a range-tree lookup for the common 1-chunk and 2-chunk cases. - WordRef handle and TreeRep template follow the existing BackendStateWordRef / BuddyChunkRep patterns from largebuddyrange.h. - Consolidation in add_block checks predecessor then successor, merging adjacent blocks and re-inserting the result. - remove_block uses Bins::carve to split oversized blocks, re-inserting remainders. Also: - Add neighbours() to RBTree: single-descent strict-neighbour query. - Add for_each() to RBTree: in-order traversal for invariant checking. - Make ArenaBins::bin_index public (sole consumer is Arena). - Add ArenaBins::Bitmap::test() for invariant verification. - Five-clause structural invariant gated on bool parameter (defaults to Debug), checked at entry/exit of add_block and remove_block. 
- Comprehensive test suite: word-level round-trips, tree operations, empty-state invariant, add/remove without consolidation, consolidation case matrix (8 pred/succ combinations), overflow detection, and randomised stress test with oracle validation (50 seeds x 500 ops). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- CMakeLists.txt | 2 +- src/snmalloc/backend_helpers/arena.h | 501 +++++++++ src/snmalloc/backend_helpers/arenabins.h | 11 +- src/snmalloc/ds_core/redblacktree.h | 21 + src/test/func/arena/arena.cc | 1283 ++++++++++++++++++++++ 5 files changed, 1816 insertions(+), 2 deletions(-) create mode 100644 src/snmalloc/backend_helpers/arena.h create mode 100644 src/test/func/arena/arena.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 009efebb3..a11c6182c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -548,7 +548,7 @@ if(NOT SNMALLOC_HEADER_ONLY_LIBRARY) # These are mitigation-independent and can be compiled once, then linked # against both fast and check testlib variants. set(TESTLIB_ONLY_TESTS - aligned_dealloc arenabins + aligned_dealloc arena arenabins bits first_operation memory memory_usage multi_atexit multi_threadatexit redblack statistics teardown contention external_pointer large_alloc lotsofthreads post_teardown diff --git a/src/snmalloc/backend_helpers/arena.h b/src/snmalloc/backend_helpers/arena.h new file mode 100644 index 000000000..0047b8d10 --- /dev/null +++ b/src/snmalloc/backend_helpers/arena.h @@ -0,0 +1,501 @@ +#pragma once + +#include "../ds_core/redblacktree.h" +#include "../ds_core/sizeclassconfig.h" +#include "../stl/array.h" +#include "../stl/utility.h" +#include "arenabins.h" + +#include +#include + +namespace snmalloc +{ + struct ArenaTestAccess; + + /** + * Size encoding for a free block's first pagemap entry. + * Min: exactly 1 chunk (no range-tree entry). + * EvenTwo: exactly 2 chunks, 2-aligned; can serve size-2 requests. 
+ * OddTwo: exactly 2 chunks, NOT 2-aligned; in range tree but + * placed in a size-1 bin (cannot serve aligned size-2 requests). + * Large: 3+ chunks; precise size stored in a separate entry. + */ + enum class ArenaVariant : uint8_t + { + Min = 0, + EvenTwo = 1, + OddTwo = 2, + Large = 3 + }; + + /** + * Manages free ranges within a single bounded arena using a dual-tree + * scheme (bin trees for allocation, range tree for consolidation). + * + * `Rep` provides word-level pagemap access: + * - `ref_word(direction, addr) -> uintptr_t*`: bin-tree child slot + * (left/right pointer in the first pagemap entry). + * - `ref_range_word(direction, addr) -> uintptr_t*`: range-tree + * child slot (left/right pointer in the second pagemap entry). + * - `get_variant(addr)` / `set_variant(addr, v)` + * - `get_large_size_chunks(addr)` / `set_large_size_chunks(addr, n)` + * + * `MIN_CHUNKS_BITS`: log2 of minimum allocation unit in chunks (0 for + * this phase — 1-chunk minimum). + * + * `MAX_CHUNKS_BITS`: log2 of the arena size in chunks. Blocks that + * reach this size overflow and are returned to the caller. + */ + template + class Arena + { + static_assert(MIN_CHUNKS_BITS == 0, "Only MIN_CHUNKS_BITS == 0 supported"); + static_assert(MAX_CHUNKS_BITS > MIN_CHUNKS_BITS); + static_assert(MAX_CHUNKS_BITS < bits::BITS); + + static constexpr size_t B = 2; + using Bins = ArenaBins; + + static_assert( + bits::one_at_bit(MAX_CHUNKS_BITS) - 1 <= Bins::max_supported_chunks()); + + // Bit layout constants. 
+ static constexpr uintptr_t RED_BIT = uintptr_t(1) << 8; + static constexpr uintptr_t VARIANT_MASK = uintptr_t(0x3) << 9; + static constexpr uintptr_t META_MASK = RED_BIT | VARIANT_MASK; + static constexpr uintptr_t BACKEND_RESERVED_MASK = 0xFF; + + static_assert((META_MASK & BACKEND_RESERVED_MASK) == 0); + static_assert(META_MASK < MIN_CHUNK_SIZE); + + // ---- Handle: thin proxy around uintptr_t* ---- + // + // Matches BackendStateWordRef's interface: wraps a pointer to a + // word slot (tree root field or pagemap word). Constructed from + // &root or from Rep::ref_word / Rep::ref_range_word. + struct WordRef + { + uintptr_t* val{nullptr}; + + constexpr WordRef() = default; + + constexpr WordRef(uintptr_t* p) : val(p) {} + + uintptr_t get() const + { + return *val; + } + + WordRef& operator=(uintptr_t v) + { + *val = v; + return *this; + } + + bool operator!=(const WordRef& other) const + { + return val != other.val; + } + + uintptr_t printable_address() const + { + return reinterpret_cast(val); + } + }; + + // ---- TreeRep: RBTree Rep parameterised on which word accessor to use ---- + // + // `RefFn` selects the pagemap entry: ref_word for the bin tree, + // ref_range_word for the range tree. 
+ template + struct TreeRep + { + using Handle = WordRef; + using Contents = uintptr_t; + + static constexpr Contents null = 0; + static constexpr Contents root = 0; + + static Contents get(Handle h) + { + return h.get() & ~META_MASK; + } + + static void set(Handle h, Contents v) + { + h = v | (h.get() & META_MASK); + } + + static Handle ref(bool direction, Contents k) + { + static const Contents null_entry = 0; + if (SNMALLOC_UNLIKELY(k == 0)) + return Handle{const_cast(&null_entry)}; + return Handle{RefFn(direction, k)}; + } + + static bool is_red(Contents k) + { + return (ref(true, k).get() & RED_BIT) == RED_BIT; + } + + static void set_red(Contents k, bool new_is_red) + { + if (new_is_red != is_red(k)) + { + auto h = ref(true, k); + h = h.get() ^ RED_BIT; + } + } + + static bool compare(Contents k1, Contents k2) + { + return k1 > k2; + } + + static bool equal(Contents k1, Contents k2) + { + return k1 == k2; + } + + static uintptr_t printable(Contents k) + { + return k; + } + + static uintptr_t printable(Handle h) + { + return h.printable_address(); + } + + static const char* name() + { + return "TreeRep"; + } + }; + + using BinRep = TreeRep; + using RangeRep = TreeRep; + + using BinTree = RBTree; + using RangeTree = RBTree; + + stl::Array bin_trees{}; + RangeTree range_tree{}; + typename Bins::Bitmap bitmap{}; + + // ---- Address-unit helpers ---- + + static size_t addr_to_chunk(uintptr_t a) + { + return a >> MIN_CHUNK_BITS; + } + + static uintptr_t chunk_to_addr(size_t c) + { + return static_cast(c) << MIN_CHUNK_BITS; + } + + // ---- Metadata helpers ---- + + static ArenaVariant + variant_of(size_t size_chunks, size_t chunk_index) + { + if (size_chunks == 1) + return ArenaVariant::Min; + if (size_chunks == 2) + return (chunk_index & 1) == 0 ? 
ArenaVariant::EvenTwo : + ArenaVariant::OddTwo; + return ArenaVariant::Large; + } + + static stl::Pair range_from_addr(uintptr_t a) + { + if (a == 0) + return {0, 0}; + auto v = Rep::get_variant(a); + switch (v) + { + case ArenaVariant::Min: + return {a, 1}; + case ArenaVariant::EvenTwo: + case ArenaVariant::OddTwo: + return {a, 2}; + case ArenaVariant::Large: + return {a, Rep::get_large_size_chunks(a)}; + } + SNMALLOC_ASSERT(false); + return {0, 0}; + } + + bool contains_min(uintptr_t a) + { + auto path = bin_trees[0].get_root_path(); + return bin_trees[0].find(path, a) && + Rep::get_variant(a) == ArenaVariant::Min; + } + + void insert_block(uintptr_t addr, size_t size_chunks) + { + Rep::set_variant(addr, variant_of(size_chunks, addr_to_chunk(addr))); + if (size_chunks >= 3) + Rep::set_large_size_chunks(addr, size_chunks); + + auto chunk_range = + typename Bins::range_t{addr_to_chunk(addr), size_chunks}; + size_t bin = bitmap.add(chunk_range); + bin_trees[bin].insert_elem(addr); + if (size_chunks >= 2) + range_tree.insert_elem(addr); + } + + void unlink_block(uintptr_t addr, size_t size_chunks) + { + auto chunk_range = + typename Bins::range_t{addr_to_chunk(addr), size_chunks}; + size_t bin = bitmap.add(chunk_range); + bin_trees[bin].remove_elem(addr); + if (size_chunks >= 2) + range_tree.remove_elem(addr); + if (bin_trees[bin].is_empty()) + bitmap.clear(bin); + } + + friend struct ArenaTestAccess; + + public: + using addr_t = uintptr_t; + + constexpr Arena() = default; + + /** + * Add a free block at `addr` with `size_chunks` chunks. The block + * is consolidated with any adjacent free neighbours. Returns + * `{0, 0}` on success. If consolidation produces a block spanning + * the entire arena (`>= 2^MAX_CHUNKS_BITS` chunks), returns + * `{consolidated_addr, consolidated_size}` and the arena is empty. 
+ */ + stl::Pair add_block(addr_t addr, size_t size_chunks) + { + check_invariant(); + SNMALLOC_ASSERT(addr != 0); + SNMALLOC_ASSERT((addr & (MIN_CHUNK_SIZE - 1)) == 0); + SNMALLOC_ASSERT(size_chunks > 0); + SNMALLOC_ASSERT(size_chunks < bits::one_at_bit(MAX_CHUNKS_BITS)); + + uintptr_t c_addr = addr; + size_t c_size = size_chunks; + + auto merge = [&](uintptr_t n_addr, size_t n_size) { + unlink_block(n_addr, n_size); + if (n_addr < c_addr) + c_addr = n_addr; + c_size += n_size; + }; + + // Check range tree for non-min neighbours. + auto [p_key, s_key] = range_tree.neighbours(addr); + + // Predecessor: check range tree, then fall back to min-size bin. + auto [pa, ps] = range_from_addr(p_key); + if (pa + ps * MIN_CHUNK_SIZE == addr) + merge(pa, ps); + else if (addr >= MIN_CHUNK_SIZE && contains_min(addr - MIN_CHUNK_SIZE)) + merge(addr - MIN_CHUNK_SIZE, 1); + + // Successor: check range tree, then fall back to min-size bin. + auto [sa, ss] = range_from_addr(s_key); + uintptr_t succ_addr = addr + size_chunks * MIN_CHUNK_SIZE; + if (sa == succ_addr) + merge(sa, ss); + else if (succ_addr > addr && contains_min(succ_addr)) + merge(succ_addr, 1); + + // Arena-scale overflow: consolidated block spans the full arena. + if (c_size >= bits::one_at_bit(MAX_CHUNKS_BITS)) + return {c_addr, c_size}; + + // Insert consolidated block. + insert_block(c_addr, c_size); + + check_invariant(); + return {0, 0}; + } + + /** + * Remove a block of at least `n_chunks` chunks. Returns + * `{addr, actual_size}` on success, `{0, 0}` if nothing fits. + * Any leftover from carving is re-inserted via `add_block`. + */ + stl::Pair remove_block(size_t n_chunks) + { + check_invariant(); + if (n_chunks == 0) + return {0, 0}; + + if (n_chunks > Bins::max_supported_chunks()) + return {0, 0}; + + size_t bin_id = bitmap.find_for_request(n_chunks); + if (bin_id == SIZE_MAX) + return {0, 0}; + + // remove_min returns the lowest-address entry (since compare + // is k1 > k2). 
Read metadata after removal — remove_elem + // does not clear node contents (redblacktree.h:535). + uintptr_t block_addr = bin_trees[bin_id].remove_min(); + auto [_, block_size] = range_from_addr(block_addr); + (void)_; + + if (block_size >= 2) + range_tree.remove_elem(block_addr); + + if (bin_trees[bin_id].is_empty()) + bitmap.clear(bin_id); + + // Carve the requested chunk count from the block. + auto carved = + Bins::carve({addr_to_chunk(block_addr), block_size}, n_chunks); + + // Re-insert non-empty remainders. By the maximally-consolidated + // invariant, these remainders have no adjacent free neighbours. + if (carved.pre.size != 0) + { + insert_block(chunk_to_addr(carved.pre.base), carved.pre.size); + } + + if (carved.post.size != 0) + { + insert_block(chunk_to_addr(carved.post.base), carved.post.size); + } + + check_invariant(); + return {chunk_to_addr(carved.req.base), carved.req.size}; + } + + /** + * Structural invariant. Runs when `enabled` is true; defaults to + * `Debug` so release tests can pass `true` explicitly. + * + * Five clauses are verified: + * 1. Maximally consolidated — no adjacent free blocks could be + * merged: (a) no two non-min range-tree entries touch across + * a consolidatable boundary, (b) no non-min entry touches a + * min entry, (c) no two min entries are adjacent. + * 2. Cross-tree consistency — every range-tree entry appears in + * exactly one bin tree, and every non-min bin-tree entry + * appears in the range tree. + * 3. Bin classification — every bin-tree entry sits in the bin + * its size selects. + * 4. Bitmap consistency — the non-empty bin bit is set iff the + * corresponding bin tree has entries. + * 5. Variant-tag consistency — each entry's pagemap variant tag + * matches the tag implied by its address and size, and Large + * variant entries carry the correct stored size. + */ + void check_invariant(bool enabled = Debug) + { + if (!enabled) + return; + + // 1a. No two adjacent non-min blocks. 
+ { + uintptr_t prev_addr = 0; + size_t prev_size = 0; + bool prev_valid = false; + range_tree.for_each([&](uintptr_t node) { + auto [a, s] = range_from_addr(node); + if (prev_valid) + SNMALLOC_ASSERT(prev_addr + prev_size * MIN_CHUNK_SIZE != a); + prev_addr = a; + prev_size = s; + prev_valid = true; + }); + } + + // 1b. No non-min block adjacent to a min block. + range_tree.for_each([&](uintptr_t node) { + auto [a, s] = range_from_addr(node); + if (a >= MIN_CHUNK_SIZE) + SNMALLOC_ASSERT(!contains_min(a - MIN_CHUNK_SIZE)); + SNMALLOC_ASSERT(!contains_min(a + s * MIN_CHUNK_SIZE)); + }); + + // 1c. No two adjacent min blocks. + { + uintptr_t prev = 0; + bool prev_valid = false; + bin_trees[0].for_each([&](uintptr_t node) { + if (Rep::get_variant(node) != ArenaVariant::Min) + return; + if (prev_valid) + SNMALLOC_ASSERT(prev + MIN_CHUNK_SIZE != node); + prev = node; + prev_valid = true; + }); + } + + // 2. Cross-tree consistency. + { + size_t range_tree_count = 0; + size_t bin_tree_nonmin_count = 0; + + for (size_t bin = 0; bin < Bins::Bitmap::TOTAL_BINS; bin++) + { + bin_trees[bin].for_each([&](uintptr_t node) { + auto [a, s] = range_from_addr(node); + if (s >= 2) + { + auto path = range_tree.get_root_path(); + SNMALLOC_ASSERT(range_tree.find(path, node)); + bin_tree_nonmin_count++; + } + }); + } + + range_tree.for_each([&](uintptr_t node) { + range_tree_count++; + auto [a, s] = range_from_addr(node); + auto chunk_range = typename Bins::range_t{addr_to_chunk(a), s}; + size_t expected_bin = Bins::bin_index(chunk_range); + auto path = bin_trees[expected_bin].get_root_path(); + SNMALLOC_ASSERT(bin_trees[expected_bin].find(path, node)); + }); + + SNMALLOC_ASSERT(bin_tree_nonmin_count == range_tree_count); + } + + // 3. Bin classification correctness. 
+ for (size_t bin = 0; bin < Bins::Bitmap::TOTAL_BINS; bin++) + { + bin_trees[bin].for_each([&](uintptr_t node) { + auto [a, s] = range_from_addr(node); + auto chunk_range = typename Bins::range_t{addr_to_chunk(a), s}; + size_t expected_bin = Bins::bin_index(chunk_range); + SNMALLOC_ASSERT(expected_bin == bin); + }); + } + + // 4. Bitmap consistency. + for (size_t bin = 0; bin < Bins::Bitmap::TOTAL_BINS; bin++) + { + bool has_entries = !bin_trees[bin].is_empty(); + bool bit_set = bitmap.test(bin); + SNMALLOC_ASSERT(has_entries == bit_set); + } + + // 5. Variant-tag consistency. + for (size_t bin = 0; bin < Bins::Bitmap::TOTAL_BINS; bin++) + { + bin_trees[bin].for_each([&](uintptr_t node) { + auto v = Rep::get_variant(node); + auto [a, s] = range_from_addr(node); + SNMALLOC_ASSERT(v == variant_of(s, addr_to_chunk(a))); + if (v == ArenaVariant::Large) + SNMALLOC_ASSERT(Rep::get_large_size_chunks(node) == s); + }); + } + } + }; +} // namespace snmalloc diff --git a/src/snmalloc/backend_helpers/arenabins.h b/src/snmalloc/backend_helpers/arenabins.h index a213650ed..e9a76253b 100644 --- a/src/snmalloc/backend_helpers/arenabins.h +++ b/src/snmalloc/backend_helpers/arenabins.h @@ -159,6 +159,7 @@ namespace snmalloc return table_.carve_info[raw]; } + public: /** * Bin id of `block`. Operates on arbitrary chunk counts, not just * exact size classes. `block.size` must be >= 1. @@ -193,7 +194,6 @@ namespace snmalloc return table_.exp_bin_base[e] + offset; } - public: /// Largest `n_chunks` legal for `carve` / `Bitmap::find_for_request`. static constexpr size_t max_supported_chunks() { @@ -291,6 +291,15 @@ namespace snmalloc return bin_id; } + /// Read-only test: is the bit for `bin_id` set? + /// Used by `Arena::check_invariant()`. + bool test(size_t bin_id) const + { + SNMALLOC_ASSERT(bin_id < TOTAL_BINS); + return (words_[bin_id / bits::BITS] & + (size_t(1) << (bin_id & (bits::BITS - 1)))) != 0; + } + /// Mark bin `bin_id` empty.
Caller must ensure the bin's tree /// is actually empty; the bitmap does not consult the trees. SNMALLOC_FAST_PATH void clear(size_t bin_id) diff --git a/src/snmalloc/ds_core/redblacktree.h b/src/snmalloc/ds_core/redblacktree.h index 0eaee84e1..3fda3b0c9 100644 --- a/src/snmalloc/ds_core/redblacktree.h +++ b/src/snmalloc/ds_core/redblacktree.h @@ -838,5 +838,26 @@ namespace snmalloc { return RBPath(H{&root}); } + + /** + * Call `fn(key)` for every key in ascending order. + */ + template + void for_each(Fn&& fn) + { + for_each_impl(get_root(), fn); + } + + private: + template + static void for_each_impl(ChildRef node, Fn& fn) + { + if (node.is_null()) + return; + K k = node; + for_each_impl(get_dir(true, k), fn); + fn(k); + for_each_impl(get_dir(false, k), fn); + } }; } // namespace snmalloc diff --git a/src/test/func/arena/arena.cc b/src/test/func/arena/arena.cc new file mode 100644 index 000000000..2c746b4b0 --- /dev/null +++ b/src/test/func/arena/arena.cc @@ -0,0 +1,1283 @@ +/** + * Unit tests for Arena. + * + * Exercises the Rep adapters (BinRep, RangeRep), RBTree integration, + * add_block with consolidation, remove_block with carving, the + * five-clause invariant, and a randomised stress test with oracle. + */ + +#include "test/setup.h" +#include "test/xoroshiro.h" + +#include +#include +#include +#include +#include +#include + +#ifndef SNMALLOC_TRACING +# define SNMALLOC_TRACING +#endif +#include "test/snmalloc_testlib.h" + +#include + +namespace snmalloc +{ + // ---- MockRep: array-backed storage for testing ---- + + // Each chunk-aligned address maps to a mock_entry via its chunk index. + // word1/word2 hold bin-tree children; range_word1/range_word2 hold + // range-tree children. variant and large_size_chunks hold metadata. 
+ struct mock_entry + { + uintptr_t word1{0}; + uintptr_t word2{0}; + uintptr_t range_word1{0}; + uintptr_t range_word2{0}; + ArenaVariant variant{ArenaVariant::Min}; + size_t large_size_chunks{0}; + }; + + // Size the array for the largest test arena + trailing room. + static constexpr size_t MOCK_ARENA_CHUNKS = 1024; + static mock_entry mock_store[MOCK_ARENA_CHUNKS]; + + static void reset_mock_store() + { + for (size_t i = 0; i < MOCK_ARENA_CHUNKS; i++) + mock_store[i] = mock_entry{}; + } + + static size_t mock_index(uintptr_t addr) + { + size_t idx = addr >> MIN_CHUNK_BITS; + SNMALLOC_ASSERT(idx < MOCK_ARENA_CHUNKS); + SNMALLOC_ASSUME(idx < MOCK_ARENA_CHUNKS); + return idx; + } + + struct MockRep + { + static ArenaVariant get_variant(uintptr_t addr) + { + return mock_store[mock_index(addr)].variant; + } + + static void set_variant(uintptr_t addr, ArenaVariant v) + { + mock_store[mock_index(addr)].variant = v; + } + + static uintptr_t* ref_word(bool direction, uintptr_t addr) + { + auto& e = mock_store[mock_index(addr)]; + return direction ? &e.word1 : &e.word2; + } + + static uintptr_t* ref_range_word(bool direction, uintptr_t addr) + { + auto& e = mock_store[mock_index(addr)]; + return direction ? &e.range_word1 : &e.range_word2; + } + + static size_t get_large_size_chunks(uintptr_t addr) + { + return mock_store[mock_index(addr)].large_size_chunks; + } + + static void set_large_size_chunks(uintptr_t addr, size_t s) + { + mock_store[mock_index(addr)].large_size_chunks = s; + } + }; + + // ---- Test access ---- + struct ArenaTestAccess + { + template + static auto& get_bin_trees(Arena& a) + { + return a.bin_trees; + } + + template + static auto& get_range_tree(Arena& a) + { + return a.range_tree; + } + + template + static auto& get_bitmap(Arena& a) + { + return a.bitmap; + } + }; + + // Convenience: chunk-aligned address from chunk index. 
+ static uintptr_t chunk_addr(size_t chunk_idx) + { + return static_cast(chunk_idx) << MIN_CHUNK_BITS; + } + + // ---- Test types ---- + // K=6 → arena of 64 chunks, K=8 → 256 chunks, K=10 → 1024 chunks. + template + using TestArena = Arena; + + using Bins = ArenaBins<2>; + + // ================================================================== + // (A) Accessor round-trips + // ================================================================== + static void test_variant_roundtrip() + { + reset_mock_store(); + uintptr_t a = chunk_addr(10); + + MockRep::set_variant(a, ArenaVariant::Min); + SNMALLOC_ASSERT(MockRep::get_variant(a) == ArenaVariant::Min); + + MockRep::set_variant(a, ArenaVariant::EvenTwo); + SNMALLOC_ASSERT(MockRep::get_variant(a) == ArenaVariant::EvenTwo); + + MockRep::set_variant(a, ArenaVariant::Large); + SNMALLOC_ASSERT(MockRep::get_variant(a) == ArenaVariant::Large); + + printf(" Variant round-trip: OK\n"); + } + + static void test_large_size_roundtrip() + { + reset_mock_store(); + uintptr_t a = chunk_addr(20); + + for (size_t s : + {size_t{3}, + size_t{7}, + size_t{15}, + size_t{63}, + size_t{255}, + size_t{1000}}) + { + MockRep::set_large_size_chunks(a, s); + SNMALLOC_ASSERT(MockRep::get_large_size_chunks(a) == s); + } + + printf(" Large-size round-trip: OK\n"); + } + + static void test_word_roundtrip() + { + reset_mock_store(); + uintptr_t a = chunk_addr(5); + + uintptr_t v1 = chunk_addr(10); + uintptr_t v2 = chunk_addr(20); + + *MockRep::ref_word(true, a) = v1; + *MockRep::ref_word(false, a) = v2; + SNMALLOC_ASSERT(*MockRep::ref_word(true, a) == v1); + SNMALLOC_ASSERT(*MockRep::ref_word(false, a) == v2); + + *MockRep::ref_range_word(true, a) = v2; + *MockRep::ref_range_word(false, a) = v1; + SNMALLOC_ASSERT(*MockRep::ref_range_word(true, a) == v2); + SNMALLOC_ASSERT(*MockRep::ref_range_word(false, a) == v1); + + printf(" Word round-trip: OK\n"); + } + + // ================================================================== + // (B) RBTree / 
RBTree smoke + // ================================================================== + + // We can't directly instantiate BinRep/RangeRep outside Arena + // since they are private nested types. Instead, test them through + // Arena's add_block/remove_block which exercise both trees. + // For smoke testing of tree operations directly, we test through + // the Arena's own invariant and operation correctness. + + static void test_rbtree_smoke_via_arena() + { + reset_mock_store(); + TestArena<8> arena; + arena.check_invariant(true); + + // Insert a few non-adjacent blocks. + uintptr_t a1 = chunk_addr(10); + uintptr_t a2 = chunk_addr(20); + uintptr_t a3 = chunk_addr(30); + + arena.add_block(a1, 3); + arena.check_invariant(true); + + arena.add_block(a2, 5); + arena.check_invariant(true); + + arena.add_block(a3, 1); + arena.check_invariant(true); + + // Remove them. + auto r1 = arena.remove_block(1); + SNMALLOC_ASSERT(r1.first != 0); + UNUSED(r1); + arena.check_invariant(true); + + auto r2 = arena.remove_block(3); + SNMALLOC_ASSERT(r2.first != 0); + UNUSED(r2); + arena.check_invariant(true); + + auto r3 = arena.remove_block(5); + SNMALLOC_ASSERT(r3.first != 0); + UNUSED(r3); + arena.check_invariant(true); + + printf(" RBTree smoke via arena: OK\n"); + } + + // ================================================================== + // (C) Empty-state invariant + // ================================================================== + template + static void test_empty_invariant() + { + reset_mock_store(); + TestArena arena; + arena.check_invariant(true); + printf(" Empty invariant (K=%zu): OK\n", K); + } + + // ================================================================== + // (D) add_block without consolidation + // ================================================================== + static void test_add_no_consolidation() + { + reset_mock_store(); + TestArena<8> arena; + + // Insert several non-adjacent blocks of various sizes. 
+ struct + { + size_t chunk_idx; + size_t size; + } blocks[] = { + {10, 1}, + {20, 2}, + {30, 3}, + {40, 5}, + {50, 9}, + }; + + for (auto& b : blocks) + { + auto result = arena.add_block(chunk_addr(b.chunk_idx), b.size); + SNMALLOC_ASSERT(result.first == 0 && result.second == 0); + UNUSED(result); + arena.check_invariant(true); + } + + printf(" add_block without consolidation: OK\n"); + } + + // ================================================================== + // (E) remove_block exact-class + carving + // ================================================================== + static void test_remove_exact() + { + reset_mock_store(); + TestArena<8> arena; + + // Insert 3 blocks of size 5 at non-adjacent locations. + arena.add_block(chunk_addr(10), 5); + arena.add_block(chunk_addr(20), 5); + arena.add_block(chunk_addr(30), 5); + arena.check_invariant(true); + + // Remove 3 exact-size blocks. + for (int i = 0; i < 3; i++) + { + auto r = arena.remove_block(5); + SNMALLOC_ASSERT(r.first != 0); + SNMALLOC_ASSERT(r.second == 5); + UNUSED(r); + arena.check_invariant(true); + } + + // Arena should be empty now. + auto r = arena.remove_block(1); + SNMALLOC_ASSERT(r.first == 0); + UNUSED(r); + + printf(" remove_block exact: OK\n"); + } + + static void test_remove_carving() + { + reset_mock_store(); + TestArena<8> arena; + + // Insert one block of size 10. + arena.add_block(chunk_addr(10), 10); + arena.check_invariant(true); + + // Request size 3 — should carve from the 10-chunk block. + auto r = arena.remove_block(3); + SNMALLOC_ASSERT(r.first != 0); + // The carved piece should be exactly what Bins::carve produces. + auto carved = Bins::carve({10, 10}, 3); + SNMALLOC_ASSERT(r.second == carved.req.size); + UNUSED(r); + arena.check_invariant(true); + + // The remainders should still be in the arena. + // We can try to remove everything that's left. 
+ size_t remaining = 10 - carved.req.size; + while (remaining > 0) + { + auto r2 = arena.remove_block(1); + SNMALLOC_ASSERT(r2.first != 0); + arena.check_invariant(true); + remaining -= r2.second; + } + + // Should be empty. + auto r3 = arena.remove_block(1); + SNMALLOC_ASSERT(r3.first == 0); + UNUSED(r3); + + printf(" remove_block carving: OK\n"); + } + + // ================================================================== + // (F) Consolidation case matrix + // ================================================================== + + // Helper: insert a block, verify invariant, return nothing. + template + static void + add_and_check(TestArena& arena, size_t chunk_idx, size_t size_chunks) + { + auto result = arena.add_block(chunk_addr(chunk_idx), size_chunks); + SNMALLOC_ASSERT(result.first == 0 && result.second == 0); + UNUSED(result); + arena.check_invariant(true); + } + + // Drain the arena by removing 1-chunk blocks until empty. + // Returns the total chunks removed. + template + static size_t drain_arena(TestArena& arena) + { + size_t total = 0; + while (true) + { + auto r = arena.remove_block(1); + if (r.first == 0) + break; + total += r.second; + arena.check_invariant(true); + } + return total; + } + + // Case 12: P-only, P min (size 1). + static void test_consolidation_p_min() + { + reset_mock_store(); + TestArena<8> arena; + add_and_check(arena, 10, 1); + add_and_check(arena, 11, 3); + + // Should have consolidated into a single 4-chunk block. + size_t total = drain_arena(arena); + SNMALLOC_ASSERT(total == 4); + UNUSED(total); + + printf(" Consolidation P-only, P min: OK\n"); + } + + // Case 13: P-only, P non-min. + static void test_consolidation_p_nonmin() + { + reset_mock_store(); + TestArena<8> arena; + add_and_check(arena, 10, 3); + add_and_check(arena, 13, 2); + + size_t total = drain_arena(arena); + SNMALLOC_ASSERT(total == 5); + UNUSED(total); + + printf(" Consolidation P-only, P non-min: OK\n"); + } + + // Case 14: S-only, S min. 
+ static void test_consolidation_s_min() + { + reset_mock_store(); + TestArena<8> arena; + add_and_check(arena, 14, 1); + add_and_check(arena, 11, 3); + + size_t total = drain_arena(arena); + SNMALLOC_ASSERT(total == 4); + UNUSED(total); + + printf(" Consolidation S-only, S min: OK\n"); + } + + // Case 15: S-only, S non-min. + static void test_consolidation_s_nonmin() + { + reset_mock_store(); + TestArena<8> arena; + add_and_check(arena, 14, 4); + add_and_check(arena, 11, 3); + + size_t total = drain_arena(arena); + SNMALLOC_ASSERT(total == 7); + UNUSED(total); + + printf(" Consolidation S-only, S non-min: OK\n"); + } + + // Case 16: P+S, both min. + static void test_consolidation_ps_both_min() + { + reset_mock_store(); + TestArena<8> arena; + add_and_check(arena, 10, 1); + add_and_check(arena, 12, 1); + add_and_check(arena, 11, 1); + + size_t total = drain_arena(arena); + SNMALLOC_ASSERT(total == 3); + UNUSED(total); + + printf(" Consolidation P+S, both min: OK\n"); + } + + // Case 17: P+S, P min, S non-min. + static void test_consolidation_ps_p_min_s_nonmin() + { + reset_mock_store(); + TestArena<8> arena; + add_and_check(arena, 10, 1); + add_and_check(arena, 14, 3); + add_and_check(arena, 11, 3); + + size_t total = drain_arena(arena); + SNMALLOC_ASSERT(total == 7); + UNUSED(total); + + printf(" Consolidation P+S, P min, S non-min: OK\n"); + } + + // Case 18: P+S, P non-min, S min. + static void test_consolidation_ps_p_nonmin_s_min() + { + reset_mock_store(); + TestArena<8> arena; + add_and_check(arena, 10, 3); + add_and_check(arena, 16, 1); + add_and_check(arena, 13, 3); + + size_t total = drain_arena(arena); + SNMALLOC_ASSERT(total == 7); + UNUSED(total); + + printf(" Consolidation P+S, P non-min, S min: OK\n"); + } + + // Case 19: P+S, both non-min. 
+ static void test_consolidation_ps_both_nonmin() + { + reset_mock_store(); + TestArena<8> arena; + add_and_check(arena, 10, 4); + add_and_check(arena, 19, 5); + add_and_check(arena, 14, 5); + + size_t total = drain_arena(arena); + SNMALLOC_ASSERT(total == 14); + UNUSED(total); + + printf(" Consolidation P+S, both non-min: OK\n"); + } + + // ================================================================== + // (F2) OddTwo — unaligned size-2 blocks + // ================================================================== + + static void test_oddtwo_variant() + { + // Odd chunk index → OddTwo, even → EvenTwo. + reset_mock_store(); + TestArena<8> arena; + + // Odd address: chunk 11, size 2 + arena.add_block(chunk_addr(11), 2); + SNMALLOC_ASSERT( + MockRep::get_variant(chunk_addr(11)) == ArenaVariant::OddTwo); + arena.check_invariant(true); + + // Even address: chunk 20, size 2 + arena.add_block(chunk_addr(20), 2); + SNMALLOC_ASSERT( + MockRep::get_variant(chunk_addr(20)) == ArenaVariant::EvenTwo); + arena.check_invariant(true); + + // Both should be in the range tree. + auto& rt = ArenaTestAccess::get_range_tree(arena); + auto p1 = rt.get_root_path(); + SNMALLOC_ASSERT(rt.find(p1, chunk_addr(11))); + auto p2 = rt.get_root_path(); + SNMALLOC_ASSERT(rt.find(p2, chunk_addr(20))); + + // OddTwo (chunk 11) should be in bin 0 (size-1 servable set). + auto& bt0 = ArenaTestAccess::get_bin_trees(arena)[0]; + auto p3 = bt0.get_root_path(); + SNMALLOC_ASSERT(bt0.find(p3, chunk_addr(11))); + UNUSED(p1, p2, p3); + + size_t total = drain_arena(arena); + SNMALLOC_ASSERT(total == 4); + UNUSED(total); + + printf(" OddTwo variant tagging: OK\n"); + } + + static void test_oddtwo_contains_min_filter() + { + // contains_min must not match OddTwo entries. + reset_mock_store(); + TestArena<8> arena; + + // Add OddTwo block at chunk 11 (odd, size 2). + arena.add_block(chunk_addr(11), 2); + arena.check_invariant(true); + + // Add a size-1 block at chunk 14, non-adjacent. 
+ arena.add_block(chunk_addr(14), 1); + arena.check_invariant(true); + + // Now add chunk 13 (size 1). The OddTwo block at chunk 11 ends at + // chunk 13, so it is merged as the range-tree predecessor, not + // through the contains_min fallback. + arena.add_block(chunk_addr(13), 1); + arena.check_invariant(true); + + // Chunk 13 also consolidates with chunk 14 (min successor, found + // via contains_min), giving one block covering chunks 11-14. + // Drain to verify total. + size_t total = drain_arena(arena); + SNMALLOC_ASSERT(total == 4); + UNUSED(total); + + printf(" OddTwo contains_min filter: OK\n"); + } + + static void test_oddtwo_consolidation() + { + // OddTwo block should consolidate via the range tree. + reset_mock_store(); + TestArena<8> arena; + + // Add OddTwo at chunk 11 (odd, size 2 → chunks 11-12). + arena.add_block(chunk_addr(11), 2); + arena.check_invariant(true); + + // Add adjacent block at chunk 13 (size 1). + // Chunk 13's predecessor in the range tree is the OddTwo block at + // chunk 11 (size 2, ends at 13). + // So they should consolidate into size 3 at chunk 11. + arena.add_block(chunk_addr(13), 1); + arena.check_invariant(true); + + auto r = arena.remove_block(3); + SNMALLOC_ASSERT(r.first == chunk_addr(11)); + SNMALLOC_ASSERT(r.second == 3); + UNUSED(r); + + printf(" OddTwo consolidation (successor): OK\n"); + } + + static void test_oddtwo_consolidation_pred() + { + // Consolidation where the new block is a predecessor of OddTwo. + reset_mock_store(); + TestArena<8> arena; + + // Add OddTwo at chunk 11 (odd, size 2 → chunks 11-12). + arena.add_block(chunk_addr(11), 2); + arena.check_invariant(true); + + // Add block at chunk 10 (size 1). OddTwo at 11 is the successor + // in the range tree → consolidate into size 3 at chunk 10.
+ arena.add_block(chunk_addr(10), 1); + arena.check_invariant(true); + + auto r = arena.remove_block(3); + SNMALLOC_ASSERT(r.first == chunk_addr(10)); + SNMALLOC_ASSERT(r.second == 3); + UNUSED(r); + + printf(" OddTwo consolidation (predecessor): OK\n"); + } + + static void test_oddtwo_remove_carve() + { + // remove_block(1) from an OddTwo block should carve correctly. + reset_mock_store(); + TestArena<8> arena; + + // Add OddTwo at chunk 11 (odd, size 2). + arena.add_block(chunk_addr(11), 2); + arena.check_invariant(true); + + // Remove 1 chunk. Should carve from the OddTwo block. + auto r = arena.remove_block(1); + SNMALLOC_ASSERT(r.first != 0); + SNMALLOC_ASSERT(r.second == 1); + arena.check_invariant(true); + + // The remainder (1 chunk) should be Min variant. + auto r2 = arena.remove_block(1); + SNMALLOC_ASSERT(r2.first != 0); + SNMALLOC_ASSERT(r2.second == 1); + UNUSED(r, r2); + + // Arena should be empty now. + auto r3 = arena.remove_block(1); + SNMALLOC_ASSERT(r3.first == 0); + UNUSED(r3); + + printf(" OddTwo remove + carve: OK\n"); + } + + // ================================================================== + // (G) Overflow — arena-scale consolidation + // ================================================================== + static void test_overflow() + { + // K=4 → 16-chunk arena. Use base offset 16 to avoid address 0. + reset_mock_store(); + TestArena<4> arena; + + constexpr size_t BASE = 16; + + // Step 1: add even-indexed chunks as individual blocks (8 blocks). + for (size_t i = 0; i < 16; i += 2) + { + arena.add_block(chunk_addr(BASE + i), 1); + arena.check_invariant(true); + } + + // Step 2: fill odd-indexed gaps. Each add consolidates with its + // even-indexed neighbours. The last add completes the arena. + for (size_t i = 1; i < 16; i += 2) + { + arena.add_block(chunk_addr(BASE + i), 1); + // Don't check invariant on the last add — it returns overflow. 
+ if (i < 15) + { + arena.check_invariant(true); + } + } + + // The last add should have triggered overflow (16 chunks = 2^4). + auto r = arena.remove_block(1); + SNMALLOC_ASSERT(r.first == 0); + UNUSED(r); + + printf(" Overflow (arena-scale consolidation): OK\n"); + } + + static void test_overflow_precise() + { + // K=4 → 16-chunk arena. Use base offset 16 to avoid address 0. + reset_mock_store(); + TestArena<4> arena; + + constexpr size_t BASE = 16; + + arena.add_block(chunk_addr(BASE), 8); + arena.check_invariant(true); + + // Adding [BASE+8, BASE+16) consolidates to 16 chunks = 2^4 → overflow. + auto r = arena.add_block(chunk_addr(BASE + 8), 8); + SNMALLOC_ASSERT(r.first == chunk_addr(BASE)); + SNMALLOC_ASSERT(r.second == 16); + UNUSED(r); + + auto r2 = arena.remove_block(1); + SNMALLOC_ASSERT(r2.first == 0); + UNUSED(r2); + + printf(" Overflow precise: OK\n"); + } + + // ================================================================== + // (H) Randomised stress with oracle + // ================================================================== + + // Oracle: std::set of (addr_chunks, size_chunks) representing + // maximally-consolidated free set. 
+ struct OracleRange + { + size_t addr; // in chunk units + size_t size; // in chunk units + + bool operator<(const OracleRange& o) const + { + return addr < o.addr; + } + + bool operator==(const OracleRange& o) const + { + return addr == o.addr && size == o.size; + } + }; + + class Oracle + { + std::set ranges; + size_t base_offset; // chunk offset to match arena addresses + + public: + Oracle() : base_offset(0) {} + + Oracle(size_t base) : base_offset(base) {} + + void add(size_t addr_chunks, size_t size_chunks) + { + OracleRange key{addr_chunks, size_chunks}; + auto it = ranges.lower_bound(key); + + size_t new_addr = addr_chunks; + size_t new_size = size_chunks; + + if (it != ranges.end() && it->addr == new_addr + new_size) + { + new_size += it->size; + it = ranges.erase(it); + } + + if (it != ranges.begin()) + { + auto prev = std::prev(it); + if (prev->addr + prev->size == new_addr) + { + new_addr = prev->addr; + new_size += prev->size; + ranges.erase(prev); + } + } + + ranges.insert({new_addr, new_size}); + } + + // Returns {addr_chunks, size_chunks} or {0, 0} if nothing fits. + // addr_chunks is oracle-relative (without base offset). + std::pair remove(size_t n_chunks) + { + if (n_chunks == 0 || n_chunks > Bins::max_supported_chunks()) + return {0, 0}; + + // Mirror the arena exactly: build a bitmap using arena-offset + // addresses (so bin classification matches), then find_for_request. + typename Bins::Bitmap bm{}; + std::map::iterator>> by_bin; + + for (auto it = ranges.begin(); it != ranges.end(); ++it) + { + // Use base-offset address for bin classification. 
+ Bins::range_t r{base_offset + it->addr, it->size}; + size_t bin = bm.add(r); + by_bin[bin].push_back(it); + } + + size_t bin_id = bm.find_for_request(n_chunks); + if (bin_id == SIZE_MAX) + return {0, 0}; + + auto& entries = by_bin[bin_id]; + auto best_it = entries[0]; + for (size_t i = 1; i < entries.size(); i++) + { + if (entries[i]->addr < best_it->addr) + best_it = entries[i]; + } + + OracleRange block = *best_it; + ranges.erase(best_it); + + // Carve using base-offset address. + auto carved = + Bins::carve({base_offset + block.addr, block.size}, n_chunks); + if (carved.pre.size != 0) + ranges.insert({carved.pre.base - base_offset, carved.pre.size}); + if (carved.post.size != 0) + ranges.insert({carved.post.base - base_offset, carved.post.size}); + + return {carved.req.base - base_offset, carved.req.size}; + } + + bool empty() const + { + return ranges.empty(); + } + + size_t count() const + { + return ranges.size(); + } + }; + + template + static void test_stress_seed(size_t seed, size_t num_ops) + { + reset_mock_store(); + TestArena arena; + + constexpr size_t ARENA_CHUNKS = bits::one_at_bit(K); + // Offset all chunk addresses to avoid address 0 (tree null). + constexpr size_t BASE = ARENA_CHUNKS; + Oracle oracle(BASE); + // Track which chunks are allocated (not free). + std::vector allocated(ARENA_CHUNKS, true); + + xoroshiro::p128r64 rng(seed); + + for (size_t op = 0; op < num_ops; op++) + { + bool do_add = (rng.next() % 3) != 0; // Bias towards adding. + + if (do_add) + { + // Find a free address range of random size within the arena. + size_t max_size = ARENA_CHUNKS / 4; + if (max_size < 1) + max_size = 1; + size_t size = (rng.next() % max_size) + 1; + size_t start = rng.next() % ARENA_CHUNKS; + + // Adjust: find a contiguous allocated (not free) region. + // We need a region that's currently allocated (not in the + // free set) to add back. 
+ bool found = false; + for (size_t try_start = start; try_start < ARENA_CHUNKS; try_start++) + { + // Check if [try_start, try_start + size) is all allocated. + size_t actual_size = 0; + for (size_t j = try_start; j < ARENA_CHUNKS && j < try_start + size; + j++) + { + if (!allocated[j]) + break; + actual_size++; + } + + if (actual_size >= 1) + { + size = actual_size; + start = try_start; + found = true; + break; + } + } + + if (!found) + continue; + + // Clamp to arena size limit. + if (size >= ARENA_CHUNKS) + size = ARENA_CHUNKS - 1; + if (start + size > ARENA_CHUNKS) + size = ARENA_CHUNKS - start; + if (size == 0) + continue; + + // Mark as free. + SNMALLOC_ASSERT(start + size <= ARENA_CHUNKS); + for (size_t j = start; j < start + size; j++) + allocated[j] = false; + + auto result = arena.add_block(chunk_addr(BASE + start), size); + oracle.add(start, size); + + if (result.first != 0) + { + // Overflow — all chunks are now free and returned to caller. + // Oracle should be empty after we remove the overflow range. + // Reset: mark everything as allocated again, clear oracle. + for (size_t j = 0; j < ARENA_CHUNKS; j++) + allocated[j] = true; + oracle = Oracle(BASE); + // The overflow range isn't tracked by the arena anymore. + } + + arena.check_invariant(true); + } + else + { + // Remove. + size_t max_req = ARENA_CHUNKS / 4; + if (max_req < 1) + max_req = 1; + size_t n = (rng.next() % max_req) + 1; + + auto arena_result = arena.remove_block(n); + auto oracle_result = oracle.remove(n); + UNUSED(arena_result); + + // Both should agree on success/failure. + // Use size == 0 to detect failure, since oracle address 0 is valid. + if (oracle_result.second == 0) + { + SNMALLOC_ASSERT(arena_result.second == 0); + } + else + { + SNMALLOC_ASSERT(arena_result.second != 0); + // Both should return the same address and size. 
+ SNMALLOC_ASSERT( + arena_result.first == chunk_addr(BASE + oracle_result.first)); + SNMALLOC_ASSERT(arena_result.second == oracle_result.second); + + // Mark as allocated. + size_t start = oracle_result.first; + SNMALLOC_ASSERT(start + oracle_result.second <= ARENA_CHUNKS); + for (size_t j = start; j < start + oracle_result.second; j++) + allocated[j] = true; + } + + arena.check_invariant(true); + } + } + } + + static void test_stress() + { + constexpr size_t K = 6; // 64-chunk arena + constexpr size_t NUM_OPS = 500; + constexpr size_t NUM_SEEDS = 50; + + for (size_t seed = 1; seed <= NUM_SEEDS; seed++) + { + test_stress_seed(seed, NUM_OPS); + } + printf( + " Randomised stress (%zu seeds x %zu ops): OK\n", NUM_SEEDS, NUM_OPS); + } + + // ================================================================== + // (I) Multi-instance: shared pagemap, blocks migrating between arenas + // ================================================================== + + static void test_multi_instance_basic() + { + reset_mock_store(); + TestArena<8> arena_a; + TestArena<8> arena_b; + constexpr size_t BASE = 256; // avoid address 0 + + // Add distinct blocks to each arena. + arena_a.add_block(chunk_addr(BASE + 10), 5); + arena_b.add_block(chunk_addr(BASE + 30), 5); + arena_a.check_invariant(true); + arena_b.check_invariant(true); + + // Migrate a block from A to B. + auto [a_addr, a_size] = arena_a.remove_block(3); + SNMALLOC_ASSERT(a_addr != 0 && a_size != 0); + arena_a.check_invariant(true); + + arena_b.add_block(a_addr, a_size); + arena_a.check_invariant(true); + arena_b.check_invariant(true); + + // Migrate from B back to A. 
+ auto [b_addr, b_size] = arena_b.remove_block(2); + SNMALLOC_ASSERT(b_addr != 0 && b_size != 0); + arena_b.check_invariant(true); + + arena_a.add_block(b_addr, b_size); + arena_a.check_invariant(true); + arena_b.check_invariant(true); + + printf(" Basic migration: OK\n"); + } + + static void test_multi_instance_consolidation() + { + reset_mock_store(); + TestArena<8> arena_a; + TestArena<8> arena_b; + constexpr size_t BASE = 256; + + // Arena B holds two blocks with a gap: [20..24) and [28..32). + arena_b.add_block(chunk_addr(BASE + 20), 4); + arena_b.add_block(chunk_addr(BASE + 28), 4); + arena_b.check_invariant(true); + + // Arena A holds the gap: [24..28). + arena_a.add_block(chunk_addr(BASE + 24), 4); + arena_a.check_invariant(true); + + // Migrate the gap from A to B → should consolidate into [20..32). + auto [addr, size] = arena_a.remove_block(4); + SNMALLOC_ASSERT(addr == chunk_addr(BASE + 24)); + SNMALLOC_ASSERT(size == 4); + arena_a.check_invariant(true); + + arena_b.add_block(addr, size); + arena_b.check_invariant(true); + + // B should now serve a size-12 request from the consolidated block. + auto [r_addr, r_size] = arena_b.remove_block(12); + SNMALLOC_ASSERT(r_addr == chunk_addr(BASE + 20)); + SNMALLOC_ASSERT(r_size == 12); + arena_b.check_invariant(true); + + printf(" Consolidation after migration: OK\n"); + } + + template + static void test_multi_stress_seed(size_t seed, size_t num_ops) + { + reset_mock_store(); + TestArena arena_a; + TestArena arena_b; + + constexpr size_t ARENA_CHUNKS = bits::one_at_bit(K); + constexpr size_t BASE = ARENA_CHUNKS; + Oracle oracle_a(BASE); + Oracle oracle_b(BASE); + + // 0 = not in any arena, 1 = in arena A, 2 = in arena B. + std::vector owner(ARENA_CHUNKS, 0); + + xoroshiro::p128r64 rng(seed); + + for (size_t op = 0; op < num_ops; op++) + { + // 0,1 = add to A or B; 2,3 = remove from A or B; 4 = migrate. + size_t action = rng.next() % 5; + + bool target_a = (action & 1) == 0; + auto& arena = target_a ? 
arena_a : arena_b; + auto& oracle = target_a ? oracle_a : oracle_b; + uint8_t my_id = target_a ? 1 : 2; + + if (action <= 1) + { + // Add: find a contiguous unowned region to free into this arena. + size_t max_size = ARENA_CHUNKS / 4; + if (max_size < 1) + max_size = 1; + size_t size = (rng.next() % max_size) + 1; + size_t start = rng.next() % ARENA_CHUNKS; + + bool found = false; + for (size_t s = start; s < ARENA_CHUNKS; s++) + { + size_t actual = 0; + for (size_t j = s; j < ARENA_CHUNKS && j < s + size; j++) + { + if (owner[j] != 0) + break; + actual++; + } + if (actual >= 1) + { + size = actual; + start = s; + found = true; + break; + } + } + if (!found) + continue; + + if (size >= ARENA_CHUNKS) + size = ARENA_CHUNKS - 1; + if (start + size > ARENA_CHUNKS) + size = ARENA_CHUNKS - start; + if (size == 0) + continue; + + for (size_t j = start; j < start + size; j++) + owner[j] = my_id; + + auto result = arena.add_block(chunk_addr(BASE + start), size); + oracle.add(start, size); + + if (result.first != 0) + { + for (size_t j = 0; j < ARENA_CHUNKS; j++) + if (owner[j] == my_id) + owner[j] = 0; + oracle = Oracle(BASE); + } + + arena.check_invariant(true); + } + else if (action <= 3) + { + // Remove from this arena. + size_t max_req = ARENA_CHUNKS / 4; + if (max_req < 1) + max_req = 1; + size_t n = (rng.next() % max_req) + 1; + + auto arena_r = arena.remove_block(n); + auto oracle_r = oracle.remove(n); + UNUSED(arena_r); + + if (oracle_r.second == 0) + { + SNMALLOC_ASSERT(arena_r.second == 0); + } + else + { + SNMALLOC_ASSERT(arena_r.second != 0); + SNMALLOC_ASSERT(arena_r.first == chunk_addr(BASE + oracle_r.first)); + SNMALLOC_ASSERT(arena_r.second == oracle_r.second); + + for (size_t j = oracle_r.first; j < oracle_r.first + oracle_r.second; + j++) + { + SNMALLOC_ASSERT(owner[j] == my_id); + owner[j] = 0; + } + } + + arena.check_invariant(true); + } + else + { + // Migrate: remove from one arena, add to the other. 
+ bool from_a = (rng.next() & 1) == 0; + auto& src = from_a ? arena_a : arena_b; + auto& src_oracle = from_a ? oracle_a : oracle_b; + auto& dst = from_a ? arena_b : arena_a; + auto& dst_oracle = from_a ? oracle_b : oracle_a; + uint8_t src_id = from_a ? 1 : 2; + uint8_t dst_id = from_a ? 2 : 1; + UNUSED(src_id); + + size_t n = (rng.next() % 3) + 1; + auto src_r = src.remove_block(n); + auto src_or = src_oracle.remove(n); + + if (src_or.second == 0) + { + SNMALLOC_ASSERT(src_r.second == 0); + } + else + { + SNMALLOC_ASSERT(src_r.second != 0); + SNMALLOC_ASSERT(src_r.first == chunk_addr(BASE + src_or.first)); + SNMALLOC_ASSERT(src_r.second == src_or.second); + + for (size_t j = src_or.first; j < src_or.first + src_or.second; j++) + { + SNMALLOC_ASSERT(owner[j] == src_id); + owner[j] = dst_id; + } + + auto dst_r = dst.add_block(src_r.first, src_r.second); + dst_oracle.add(src_or.first, src_or.second); + + if (dst_r.first != 0) + { + for (size_t j = 0; j < ARENA_CHUNKS; j++) + if (owner[j] == dst_id) + owner[j] = 0; + dst_oracle = Oracle(BASE); + } + } + + src.check_invariant(true); + dst.check_invariant(true); + } + } + } + + static void test_multi_stress() + { + constexpr size_t K = 6; // 64-chunk arena + constexpr size_t NUM_OPS = 500; + constexpr size_t NUM_SEEDS = 50; + + for (size_t seed = 1; seed <= NUM_SEEDS; seed++) + test_multi_stress_seed(seed, NUM_OPS); + + printf( + " Multi-instance stress (%zu seeds x %zu ops): OK\n", + NUM_SEEDS, + NUM_OPS); + } + +} // namespace snmalloc + +int main() +{ + printf("--- Arena tests ---\n"); + + printf("(A) Accessor round-trips:\n"); + snmalloc::test_variant_roundtrip(); + snmalloc::test_large_size_roundtrip(); + snmalloc::test_word_roundtrip(); + + printf("(B) RBTree smoke via arena:\n"); + snmalloc::test_rbtree_smoke_via_arena(); + + printf("(C) Empty-state invariant:\n"); + snmalloc::test_empty_invariant<4>(); + snmalloc::test_empty_invariant<5>(); + snmalloc::test_empty_invariant<6>(); + + printf("(D) add_block without 
consolidation:\n"); + snmalloc::test_add_no_consolidation(); + + printf("(E) remove_block:\n"); + snmalloc::test_remove_exact(); + snmalloc::test_remove_carving(); + + printf("(F) Consolidation case matrix:\n"); + snmalloc::test_consolidation_p_min(); + snmalloc::test_consolidation_p_nonmin(); + snmalloc::test_consolidation_s_min(); + snmalloc::test_consolidation_s_nonmin(); + snmalloc::test_consolidation_ps_both_min(); + snmalloc::test_consolidation_ps_p_min_s_nonmin(); + snmalloc::test_consolidation_ps_p_nonmin_s_min(); + snmalloc::test_consolidation_ps_both_nonmin(); + + printf("(F2) OddTwo (unaligned size-2):\n"); + snmalloc::test_oddtwo_variant(); + snmalloc::test_oddtwo_contains_min_filter(); + snmalloc::test_oddtwo_consolidation(); + snmalloc::test_oddtwo_consolidation_pred(); + snmalloc::test_oddtwo_remove_carve(); + + printf("(G) Overflow:\n"); + snmalloc::test_overflow(); + snmalloc::test_overflow_precise(); + + printf("(H) Randomised stress:\n"); + snmalloc::test_stress(); + + printf("(I) Multi-instance:\n"); + snmalloc::test_multi_instance_basic(); + snmalloc::test_multi_instance_consolidation(); + snmalloc::test_multi_stress(); + + printf("All Arena tests passed.\n"); + return 0; +} From b6c50a04574627bbdd9b347e93efeaaff3a167f0 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Fri, 22 May 2026 09:11:48 +0100 Subject: [PATCH 06/15] Arena: representation-agnostic Rep concept + boundary support - Make Arena fully generic over its Rep, mirroring the Buddy/Rep layering. The class no longer holds any bit-layout constants; Rep supplies the full RBTree Rep for both the bin trees and the range tree, owning red-bit (and any tag-bit) packing privately. 
- Rep concept now requires: using BinRep -- full RBTree Rep for the bin trees using RangeRep -- full RBTree Rep for the range tree get_variant / set_variant get_large_size_chunks / set_large_size_chunks can_consolidate(higher_addr) -> bool - Add can_consolidate checks in add_block before each (predecessor and successor) merge, and update the invariants to tolerate boundary-blocked adjacency. - MockRep grows inner BinRep / RangeRep structs that each provide the full RBTree Rep interface over the mock-entry array, with a private red-bit at bit 8. - New tests verify that can_consolidate returning false at a specific address prevents predecessor- and successor-side merges independently, including at min-block boundaries. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/snmalloc/backend_helpers/arena.h | 181 ++++------------ src/snmalloc/mem/metadata.h | 7 +- src/test/func/arena/arena.cc | 306 +++++++++++++++++++++++++-- 3 files changed, 337 insertions(+), 157 deletions(-) diff --git a/src/snmalloc/backend_helpers/arena.h b/src/snmalloc/backend_helpers/arena.h index 0047b8d10..37df49e87 100644 --- a/src/snmalloc/backend_helpers/arena.h +++ b/src/snmalloc/backend_helpers/arena.h @@ -31,18 +31,30 @@ namespace snmalloc /** * Manages free ranges within a single bounded arena using a dual-tree - * scheme (bin trees for allocation, range tree for consolidation). + * scheme: a set of bin trees indexed by the floor-log2 size class + * (used for allocation lookup) and one range tree keyed by address + * (used for consolidation of adjacent free ranges). * - * `Rep` provides word-level pagemap access: - * - `ref_word(direction, addr) -> uintptr_t*`: bin-tree child slot - * (left/right pointer in the first pagemap entry). - * - `ref_range_word(direction, addr) -> uintptr_t*`: range-tree - * child slot (left/right pointer in the second pagemap entry). - * - `get_variant(addr)` / `set_variant(addr, v)` + * `Rep` is the representation. 
It owns *all* storage and bit-layout + * decisions for tree nodes and per-block metadata. `Rep` must provide: + * + * - `using BinRep` — full RBTree Rep for the bin trees, supplying + * `Handle`, `Contents`, `null`, `root`, `ref`, `get`, `set`, + * `is_red`, `set_red`, `compare`, `equal`, `printable`, `name`. + * Owns its own red-bit packing privately. + * - `using RangeRep` — full RBTree Rep for the range tree, same + * shape as `BinRep`. + * - `get_variant(addr)` / `set_variant(addr, v)` — the + * `ArenaVariant` tag for the block starting at `addr`. * - `get_large_size_chunks(addr)` / `set_large_size_chunks(addr, n)` + * — exact chunk count for `Large` blocks (3+ chunks). + * - `can_consolidate(higher_addr) -> bool` — whether the block at + * `higher_addr` may be merged with the block immediately below + * it. Returns false at allocation boundaries that must be + * preserved. * - * `MIN_CHUNKS_BITS`: log2 of minimum allocation unit in chunks (0 for - * this phase — 1-chunk minimum). + * `MIN_CHUNKS_BITS`: log2 of the minimum allocation unit in chunks + * (currently only 0 is supported — 1-chunk minimum). * * `MAX_CHUNKS_BITS`: log2 of the arena size in chunks. Blocks that * reach this size overflow and are returned to the caller. @@ -60,123 +72,8 @@ namespace snmalloc static_assert( bits::one_at_bit(MAX_CHUNKS_BITS) - 1 <= Bins::max_supported_chunks()); - // Bit layout constants. - static constexpr uintptr_t RED_BIT = uintptr_t(1) << 8; - static constexpr uintptr_t VARIANT_MASK = uintptr_t(0x3) << 9; - static constexpr uintptr_t META_MASK = RED_BIT | VARIANT_MASK; - static constexpr uintptr_t BACKEND_RESERVED_MASK = 0xFF; - - static_assert((META_MASK & BACKEND_RESERVED_MASK) == 0); - static_assert(META_MASK < MIN_CHUNK_SIZE); - - // ---- Handle: thin proxy around uintptr_t* ---- - // - // Matches BackendStateWordRef's interface: wraps a pointer to a - // word slot (tree root field or pagemap word). 
Constructed from - // &root or from Rep::ref_word / Rep::ref_range_word. - struct WordRef - { - uintptr_t* val{nullptr}; - - constexpr WordRef() = default; - - constexpr WordRef(uintptr_t* p) : val(p) {} - - uintptr_t get() const - { - return *val; - } - - WordRef& operator=(uintptr_t v) - { - *val = v; - return *this; - } - - bool operator!=(const WordRef& other) const - { - return val != other.val; - } - - uintptr_t printable_address() const - { - return reinterpret_cast(val); - } - }; - - // ---- TreeRep: RBTree Rep parameterised on which word accessor to use ---- - // - // `RefFn` selects the pagemap entry: ref_word for the bin tree, - // ref_range_word for the range tree. - template - struct TreeRep - { - using Handle = WordRef; - using Contents = uintptr_t; - - static constexpr Contents null = 0; - static constexpr Contents root = 0; - - static Contents get(Handle h) - { - return h.get() & ~META_MASK; - } - - static void set(Handle h, Contents v) - { - h = v | (h.get() & META_MASK); - } - - static Handle ref(bool direction, Contents k) - { - static const Contents null_entry = 0; - if (SNMALLOC_UNLIKELY(k == 0)) - return Handle{const_cast(&null_entry)}; - return Handle{RefFn(direction, k)}; - } - - static bool is_red(Contents k) - { - return (ref(true, k).get() & RED_BIT) == RED_BIT; - } - - static void set_red(Contents k, bool new_is_red) - { - if (new_is_red != is_red(k)) - { - auto h = ref(true, k); - h = h.get() ^ RED_BIT; - } - } - - static bool compare(Contents k1, Contents k2) - { - return k1 > k2; - } - - static bool equal(Contents k1, Contents k2) - { - return k1 == k2; - } - - static uintptr_t printable(Contents k) - { - return k; - } - - static uintptr_t printable(Handle h) - { - return h.printable_address(); - } - - static const char* name() - { - return "TreeRep"; - } - }; - - using BinRep = TreeRep; - using RangeRep = TreeRep; + using BinRep = typename Rep::BinRep; + using RangeRep = typename Rep::RangeRep; using BinTree = RBTree; using RangeTree 
= RBTree; @@ -299,17 +196,21 @@ namespace snmalloc // Predecessor: check range tree, then fall back to min-size bin. auto [pa, ps] = range_from_addr(p_key); - if (pa + ps * MIN_CHUNK_SIZE == addr) + if (pa + ps * MIN_CHUNK_SIZE == addr && Rep::can_consolidate(addr)) merge(pa, ps); - else if (addr >= MIN_CHUNK_SIZE && contains_min(addr - MIN_CHUNK_SIZE)) + else if ( + addr >= MIN_CHUNK_SIZE && Rep::can_consolidate(addr) && + contains_min(addr - MIN_CHUNK_SIZE)) merge(addr - MIN_CHUNK_SIZE, 1); // Successor: check range tree, then fall back to min-size bin. auto [sa, ss] = range_from_addr(s_key); uintptr_t succ_addr = addr + size_chunks * MIN_CHUNK_SIZE; - if (sa == succ_addr) + if (sa == succ_addr && Rep::can_consolidate(succ_addr)) merge(sa, ss); - else if (succ_addr > addr && contains_min(succ_addr)) + else if ( + succ_addr > addr && Rep::can_consolidate(succ_addr) && + contains_min(succ_addr)) merge(succ_addr, 1); // Arena-scale overflow: consolidated block spans the full arena. @@ -399,7 +300,7 @@ namespace snmalloc if (!enabled) return; - // 1a. No two adjacent non-min blocks. + // 1a. No two adjacent non-min blocks (unless boundary prevents merge). { uintptr_t prev_addr = 0; size_t prev_size = 0; @@ -407,22 +308,27 @@ namespace snmalloc range_tree.for_each([&](uintptr_t node) { auto [a, s] = range_from_addr(node); if (prev_valid) - SNMALLOC_ASSERT(prev_addr + prev_size * MIN_CHUNK_SIZE != a); + { + uintptr_t prev_end = prev_addr + prev_size * MIN_CHUNK_SIZE; + SNMALLOC_ASSERT(prev_end != a || !Rep::can_consolidate(a)); + } prev_addr = a; prev_size = s; prev_valid = true; }); } - // 1b. No non-min block adjacent to a min block. + // 1b. No non-min block adjacent to a min block (unless boundary). 
range_tree.for_each([&](uintptr_t node) { auto [a, s] = range_from_addr(node); if (a >= MIN_CHUNK_SIZE) - SNMALLOC_ASSERT(!contains_min(a - MIN_CHUNK_SIZE)); - SNMALLOC_ASSERT(!contains_min(a + s * MIN_CHUNK_SIZE)); + SNMALLOC_ASSERT( + !contains_min(a - MIN_CHUNK_SIZE) || !Rep::can_consolidate(a)); + uintptr_t end = a + s * MIN_CHUNK_SIZE; + SNMALLOC_ASSERT(!contains_min(end) || !Rep::can_consolidate(end)); }); - // 1c. No two adjacent min blocks. + // 1c. No two adjacent min blocks (unless boundary). { uintptr_t prev = 0; bool prev_valid = false; @@ -430,7 +336,8 @@ namespace snmalloc if (Rep::get_variant(node) != ArenaVariant::Min) return; if (prev_valid) - SNMALLOC_ASSERT(prev + MIN_CHUNK_SIZE != node); + SNMALLOC_ASSERT( + prev + MIN_CHUNK_SIZE != node || !Rep::can_consolidate(node)); prev = node; prev_valid = true; }); diff --git a/src/snmalloc/mem/metadata.h b/src/snmalloc/mem/metadata.h index e753f125c..c6d29793e 100644 --- a/src/snmalloc/mem/metadata.h +++ b/src/snmalloc/mem/metadata.h @@ -158,10 +158,15 @@ namespace snmalloc /** * Explicit assignment operator, copies the data preserving the boundary bit * in the target if it is set. + * + * Load-bearing: the pagemap writes back through this operator (its + * `set(p, t)` is `body[p >> SHIFT] = t`), so the boundary bit set + * once at OS-range registration survives every subsequent metadata + * mutation — including chunk reuse via `dealloc_chunk` — without + * any consolidation path having to touch it explicitly. 
*/ MetaEntryBase& operator=(const MetaEntryBase& other) { - // Don't overwrite the boundary bit with the other's meta = (other.meta & ~META_BOUNDARY_BIT) | address_cast(meta & META_BOUNDARY_BIT); remote_and_sizeclass = other.remote_and_sizeclass; diff --git a/src/test/func/arena/arena.cc b/src/test/func/arena/arena.cc index 2c746b4b0..1ae4cd738 100644 --- a/src/test/func/arena/arena.cc +++ b/src/test/func/arena/arena.cc @@ -27,6 +27,41 @@ namespace snmalloc { // ---- MockRep: array-backed storage for testing ---- + /** + * Thin proxy around uintptr_t* with the same interface as + * BackendStateWordRef (get, operator=, operator!=). Used by MockRep + * to avoid requiring a real pagemap in unit tests. + */ + struct ArenaWordRef + { + uintptr_t* val{nullptr}; + + constexpr ArenaWordRef() = default; + + constexpr ArenaWordRef(uintptr_t* p) : val(p) {} + + uintptr_t get() const + { + return *val; + } + + ArenaWordRef& operator=(uintptr_t v) + { + *val = v; + return *this; + } + + bool operator!=(const ArenaWordRef& other) const + { + return val != other.val; + } + + uintptr_t printable_address() const + { + return reinterpret_cast(val); + } + }; + // Each chunk-aligned address maps to a mock_entry via its chunk index. // word1/word2 hold bin-tree children; range_word1/range_word2 hold // range-tree children. variant and large_size_chunks hold metadata. @@ -58,28 +93,97 @@ namespace snmalloc return idx; } - struct MockRep + // Inner RBTree Rep used by both MockRep::BinRep and MockRep::RangeRep. + // Tag selects which pair of fields in mock_entry holds the tree pointers. + // The red bit is packed into bit 8 of the stored word (matching the + // production PagemapRep layout, but defined privately here). 
+ template + struct MockTreeRep { - static ArenaVariant get_variant(uintptr_t addr) + using Handle = ArenaWordRef; + using Contents = uintptr_t; + + static constexpr Contents null = 0; + static constexpr Contents root = 0; + + static constexpr uintptr_t RED_BIT = uintptr_t(1) << 8; + static_assert(RED_BIT < MIN_CHUNK_SIZE); + + static Handle ref(bool direction, Contents k) { - return mock_store[mock_index(addr)].variant; + static const Contents null_entry = 0; + if (SNMALLOC_UNLIKELY(k == 0)) + return Handle{const_cast(&null_entry)}; + auto& e = mock_store[mock_index(k)]; + if constexpr (IsRange) + return Handle{direction ? &e.range_word1 : &e.range_word2}; + else + return Handle{direction ? &e.word1 : &e.word2}; } - static void set_variant(uintptr_t addr, ArenaVariant v) + static Contents get(Handle h) { - mock_store[mock_index(addr)].variant = v; + return h.get() & ~RED_BIT; + } + + static void set(Handle h, Contents v) + { + h = v | (h.get() & RED_BIT); + } + + static bool is_red(Contents k) + { + return (ref(true, k).get() & RED_BIT) == RED_BIT; + } + + static void set_red(Contents k, bool new_is_red) + { + if (new_is_red != is_red(k)) + { + auto h = ref(true, k); + h = h.get() ^ RED_BIT; + } + } + + static bool compare(Contents k1, Contents k2) + { + return k1 > k2; + } + + static bool equal(Contents k1, Contents k2) + { + return k1 == k2; } - static uintptr_t* ref_word(bool direction, uintptr_t addr) + static uintptr_t printable(Contents k) { - auto& e = mock_store[mock_index(addr)]; - return direction ? &e.word1 : &e.word2; + return k; } - static uintptr_t* ref_range_word(bool direction, uintptr_t addr) + static uintptr_t printable(Handle h) { - auto& e = mock_store[mock_index(addr)]; - return direction ? &e.range_word1 : &e.range_word2; + return h.printable_address(); + } + + static const char* name() + { + return IsRange ? 
"MockRangeRep" : "MockBinRep"; + } + }; + + struct MockRep + { + using BinRep = MockTreeRep; + using RangeRep = MockTreeRep; + + static ArenaVariant get_variant(uintptr_t addr) + { + return mock_store[mock_index(addr)].variant; + } + + static void set_variant(uintptr_t addr, ArenaVariant v) + { + mock_store[mock_index(addr)].variant = v; } static size_t get_large_size_chunks(uintptr_t addr) @@ -91,6 +195,11 @@ namespace snmalloc { mock_store[mock_index(addr)].large_size_chunks = s; } + + static bool can_consolidate(uintptr_t) + { + return true; + } }; // ---- Test access ---- @@ -176,15 +285,19 @@ namespace snmalloc uintptr_t v1 = chunk_addr(10); uintptr_t v2 = chunk_addr(20); - *MockRep::ref_word(true, a) = v1; - *MockRep::ref_word(false, a) = v2; - SNMALLOC_ASSERT(*MockRep::ref_word(true, a) == v1); - SNMALLOC_ASSERT(*MockRep::ref_word(false, a) == v2); + auto w1 = MockRep::BinRep::ref(true, a); + auto w2 = MockRep::BinRep::ref(false, a); + w1 = v1; + w2 = v2; + SNMALLOC_ASSERT(MockRep::BinRep::ref(true, a).get() == v1); + SNMALLOC_ASSERT(MockRep::BinRep::ref(false, a).get() == v2); - *MockRep::ref_range_word(true, a) = v2; - *MockRep::ref_range_word(false, a) = v1; - SNMALLOC_ASSERT(*MockRep::ref_range_word(true, a) == v2); - SNMALLOC_ASSERT(*MockRep::ref_range_word(false, a) == v1); + auto rw1 = MockRep::RangeRep::ref(true, a); + auto rw2 = MockRep::RangeRep::ref(false, a); + rw1 = v2; + rw2 = v1; + SNMALLOC_ASSERT(MockRep::RangeRep::ref(true, a).get() == v2); + SNMALLOC_ASSERT(MockRep::RangeRep::ref(false, a).get() == v1); printf(" Word round-trip: OK\n"); } @@ -1223,6 +1336,155 @@ namespace snmalloc NUM_OPS); } + // ================================================================== + // (J) Boundary consolidation prevention + // ================================================================== + + // A Rep variant that blocks consolidation at specific addresses. 
+ static std::set boundary_addrs; + + struct BoundaryMockRep + { + using BinRep = MockRep::BinRep; + using RangeRep = MockRep::RangeRep; + + static ArenaVariant get_variant(uintptr_t addr) + { + return MockRep::get_variant(addr); + } + + static void set_variant(uintptr_t addr, ArenaVariant v) + { + MockRep::set_variant(addr, v); + } + + static size_t get_large_size_chunks(uintptr_t addr) + { + return MockRep::get_large_size_chunks(addr); + } + + static void set_large_size_chunks(uintptr_t addr, size_t s) + { + MockRep::set_large_size_chunks(addr, s); + } + + static bool can_consolidate(uintptr_t higher_addr) + { + return boundary_addrs.find(higher_addr) == boundary_addrs.end(); + } + }; + + template + using BoundaryArena = Arena; + + // Test: predecessor merge blocked by boundary. + static void test_boundary_blocks_predecessor() + { + reset_mock_store(); + boundary_addrs.clear(); + constexpr size_t K = 6; + BoundaryArena arena; + + uintptr_t p_addr = chunk_addr(2); + uintptr_t a_addr = chunk_addr(4); + + // Place a boundary at a_addr — blocks should not consolidate leftward. + boundary_addrs.insert(a_addr); + + arena.add_block(p_addr, 2); + arena.add_block(a_addr, 2); + + // P (chunks 2-3) and A (chunks 4-5) are adjacent but the boundary + // at a_addr prevents merging. Both should remain separate. + auto [r1_addr, r1_size] = arena.remove_block(2); + SNMALLOC_ASSERT(r1_addr == p_addr && r1_size == 2); + auto [r2_addr, r2_size] = arena.remove_block(2); + SNMALLOC_ASSERT(r2_addr == a_addr && r2_size == 2); + + printf(" Boundary blocks predecessor merge: OK\n"); + } + + // Test: successor merge blocked by boundary. + static void test_boundary_blocks_successor() + { + reset_mock_store(); + boundary_addrs.clear(); + constexpr size_t K = 6; + BoundaryArena arena; + + uintptr_t a_addr = chunk_addr(2); + uintptr_t s_addr = chunk_addr(4); + + // Place a boundary at s_addr — blocks should not consolidate rightward. 
+ boundary_addrs.insert(s_addr); + + arena.add_block(s_addr, 4); + arena.add_block(a_addr, 2); + + // A (chunks 2-3) and S (chunks 4-7) are adjacent but the boundary + // at s_addr prevents merging. Both should remain separate. + auto [r1_addr, r1_size] = arena.remove_block(2); + SNMALLOC_ASSERT(r1_addr == a_addr && r1_size == 2); + auto [r2_addr, r2_size] = arena.remove_block(4); + SNMALLOC_ASSERT(r2_addr == s_addr && r2_size == 4); + + printf(" Boundary blocks successor merge: OK\n"); + } + + // Test: boundary only blocks the specific merge; other merges proceed. + static void test_boundary_partial() + { + reset_mock_store(); + boundary_addrs.clear(); + constexpr size_t K = 6; + BoundaryArena arena; + + // Three adjacent blocks: chunks [4,6), [6,8), [8,10). + // Boundary at chunk 8 blocks [6,8) ↔ [8,10) merge but allows + // [4,6) ↔ [6,8) merge into a 4-aligned block at chunk 4. + boundary_addrs.insert(chunk_addr(8)); + + arena.add_block(chunk_addr(4), 2); + arena.add_block(chunk_addr(8), 2); + arena.add_block(chunk_addr(6), 2); + + // [4,6) and [6,8) should consolidate to [4,8). + // [8,10) should remain separate due to boundary. + auto [r1_addr, r1_size] = arena.remove_block(4); + SNMALLOC_ASSERT(r1_addr == chunk_addr(4) && r1_size == 4); + auto [r2_addr, r2_size] = arena.remove_block(2); + SNMALLOC_ASSERT(r2_addr == chunk_addr(8) && r2_size == 2); + + printf(" Boundary partial (P merges, S blocked): OK\n"); + } + + // Test: min-size predecessor blocked by boundary. 
+ static void test_boundary_blocks_min_predecessor() + { + reset_mock_store(); + boundary_addrs.clear(); + constexpr size_t K = 6; + BoundaryArena arena; + + uintptr_t p_addr = chunk_addr(4); + uintptr_t a_addr = chunk_addr(5); + + boundary_addrs.insert(a_addr); + + arena.add_block(p_addr, 1); // min-size block + arena.add_block(a_addr, 1); // adjacent, but boundary prevents merge + + auto [r1_addr, r1_size] = arena.remove_block(1); + auto [r2_addr, r2_size] = arena.remove_block(1); + // Both should be separate min-size blocks. + SNMALLOC_ASSERT(r1_size == 1 && r2_size == 1); + SNMALLOC_ASSERT( + (r1_addr == p_addr && r2_addr == a_addr) || + (r1_addr == a_addr && r2_addr == p_addr)); + + printf(" Boundary blocks min predecessor merge: OK\n"); + } + } // namespace snmalloc int main() @@ -1278,6 +1540,12 @@ int main() snmalloc::test_multi_instance_consolidation(); snmalloc::test_multi_stress(); + printf("(J) Boundary consolidation:\n"); + snmalloc::test_boundary_blocks_predecessor(); + snmalloc::test_boundary_blocks_successor(); + snmalloc::test_boundary_partial(); + snmalloc::test_boundary_blocks_min_predecessor(); + printf("All Arena tests passed.\n"); return 0; } From 45802042e10974f7b1a63c53e891020c1fb6ff9f Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Fri, 22 May 2026 15:56:17 +0100 Subject: [PATCH 07/15] Add LargeArenaRange and generalise Arena to byte units Adds the LargeArenaRange wrapper that drops into the LargeBuddyRange slot, generalises Arena and ArenaBins on MIN_SIZE_BITS, and converts the arena/range API boundary to bytes throughout. * LargeArenaRange with a PagemapRep that packs variant tag, RB red bit and the consolidated large-block size into the first pagemap word, and uses the second word for in-tree links. Provides alloc_range / dealloc_range / add_range over the bin-tree arena. * parent_dealloc unifies the old parent_dealloc_range and dealloc_overflow paths; add_range uses bits::align_up / bits::align_down for parent-input trimming. 
* ArenaBins generalises the bin scheme so its range_t, carve and find_for_request all speak bytes (multiples of UNIT_SIZE = 1 << MIN_SIZE_BITS). Tests cover MIN_SIZE_BITS in {0, 4, 14}. * Arena: add_block / remove_block / variant_of / insert_block / range_from_addr / invariants all work in bytes. remove_block returns a scalar address (0 = failure); the size half of the old pair was tautological. CHUNKS_BITS / addr_to_chunk / chunk_to_addr removed. * PagemapRep::get_large_size / set_large_size are bytes-in / bytes-out; storage still scales by MIN_SIZE_BITS so the shifted field fits a pagemap word. * Tests: func-largearenarange exercises alloc/dealloc/refill/large paths against a mock parent; func-arena and func-arenabins updated for the bytes-throughout convention (chunk_size(N) helper at the test boundary). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- CMakeLists.txt | 2 +- src/snmalloc/backend_helpers/arena.h | 190 ++++---- src/snmalloc/backend_helpers/arenabins.h | 222 +++++---- .../backend_helpers/backend_helpers.h | 1 + .../backend_helpers/largearenarange.h | 380 +++++++++++++++ src/test/func/arena/arena.cc | 331 +++++++------ src/test/func/arenabins/arenabins.cc | 460 ++++++++++++------ .../func/largearenarange/largearenarange.cc | 316 ++++++++++++ 8 files changed, 1413 insertions(+), 489 deletions(-) create mode 100644 src/snmalloc/backend_helpers/largearenarange.h create mode 100644 src/test/func/largearenarange/largearenarange.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index a11c6182c..bbe6eeabc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -548,7 +548,7 @@ if(NOT SNMALLOC_HEADER_ONLY_LIBRARY) # These are mitigation-independent and can be compiled once, then linked # against both fast and check testlib variants. 
set(TESTLIB_ONLY_TESTS - aligned_dealloc arena arenabins + aligned_dealloc arena arenabins largearenarange bits first_operation memory memory_usage multi_atexit multi_threadatexit redblack statistics teardown contention external_pointer large_alloc lotsofthreads post_teardown diff --git a/src/snmalloc/backend_helpers/arena.h b/src/snmalloc/backend_helpers/arena.h index 37df49e87..700c1d900 100644 --- a/src/snmalloc/backend_helpers/arena.h +++ b/src/snmalloc/backend_helpers/arena.h @@ -46,31 +46,36 @@ namespace snmalloc * shape as `BinRep`. * - `get_variant(addr)` / `set_variant(addr, v)` — the * `ArenaVariant` tag for the block starting at `addr`. - * - `get_large_size_chunks(addr)` / `set_large_size_chunks(addr, n)` - * — exact chunk count for `Large` blocks (3+ chunks). + * - `get_large_size(addr)` / `set_large_size(addr, size)` — + * exact byte size for `Large` blocks (3+ units). * - `can_consolidate(higher_addr) -> bool` — whether the block at * `higher_addr` may be merged with the block immediately below * it. Returns false at allocation boundaries that must be * preserved. * - * `MIN_CHUNKS_BITS`: log2 of the minimum allocation unit in chunks - * (currently only 0 is supported — 1-chunk minimum). + * `MIN_SIZE_BITS`: log2 of the unit of allocation (= the minimum + * block size in bytes). All addresses and sizes managed by this + * arena are multiples of `1 << MIN_SIZE_BITS`. * - * `MAX_CHUNKS_BITS`: log2 of the arena size in chunks. Blocks that - * reach this size overflow and are returned to the caller. + * `MAX_SIZE_BITS`: log2 of the (exclusive) upper bound on managed + * block sizes. Blocks that reach this size overflow and are + * returned to the caller. 
*/ - template + template class Arena { - static_assert(MIN_CHUNKS_BITS == 0, "Only MIN_CHUNKS_BITS == 0 supported"); - static_assert(MAX_CHUNKS_BITS > MIN_CHUNKS_BITS); - static_assert(MAX_CHUNKS_BITS < bits::BITS); + static_assert(MAX_SIZE_BITS > MIN_SIZE_BITS); + static_assert(MAX_SIZE_BITS < bits::BITS); + static_assert(MIN_SIZE_BITS < bits::BITS); + + static constexpr size_t UNIT_SIZE = size_t(1) << MIN_SIZE_BITS; + static constexpr size_t TWO_UNITS = size_t(2) << MIN_SIZE_BITS; static constexpr size_t B = 2; - using Bins = ArenaBins; + using Bins = ArenaBins; static_assert( - bits::one_at_bit(MAX_CHUNKS_BITS) - 1 <= Bins::max_supported_chunks()); + bits::one_at_bit(MAX_SIZE_BITS) - 1 <= Bins::max_supported_size()); using BinRep = typename Rep::BinRep; using RangeRep = typename Rep::RangeRep; @@ -82,28 +87,15 @@ namespace snmalloc RangeTree range_tree{}; typename Bins::Bitmap bitmap{}; - // ---- Address-unit helpers ---- - - static size_t addr_to_chunk(uintptr_t a) - { - return a >> MIN_CHUNK_BITS; - } - - static uintptr_t chunk_to_addr(size_t c) - { - return static_cast(c) << MIN_CHUNK_BITS; - } - // ---- Metadata helpers ---- - static ArenaVariant - variant_of(size_t size_chunks, size_t chunk_index) + static ArenaVariant variant_of(size_t size, uintptr_t addr) { - if (size_chunks == 1) + if (size == UNIT_SIZE) return ArenaVariant::Min; - if (size_chunks == 2) - return (chunk_index & 1) == 0 ? ArenaVariant::EvenTwo : - ArenaVariant::OddTwo; + if (size == TWO_UNITS) + return ((addr >> MIN_SIZE_BITS) & 1) == 0 ? 
ArenaVariant::EvenTwo : + ArenaVariant::OddTwo; return ArenaVariant::Large; } @@ -115,12 +107,18 @@ namespace snmalloc switch (v) { case ArenaVariant::Min: - return {a, 1}; + return {a, UNIT_SIZE}; case ArenaVariant::EvenTwo: case ArenaVariant::OddTwo: - return {a, 2}; + return {a, TWO_UNITS}; case ArenaVariant::Large: - return {a, Rep::get_large_size_chunks(a)}; + { + size_t s = Rep::get_large_size(a); + SNMALLOC_ASSERT( + s > TWO_UNITS && s < bits::one_at_bit(MAX_SIZE_BITS) && + bits::align_down(s, UNIT_SIZE) == s); + return {a, s}; + } } SNMALLOC_ASSERT(false); return {0, 0}; @@ -133,27 +131,25 @@ namespace snmalloc Rep::get_variant(a) == ArenaVariant::Min; } - void insert_block(uintptr_t addr, size_t size_chunks) + void insert_block(uintptr_t addr, size_t size) { - Rep::set_variant(addr, variant_of(size_chunks, addr_to_chunk(addr))); - if (size_chunks >= 3) - Rep::set_large_size_chunks(addr, size_chunks); + Rep::set_variant(addr, variant_of(size, addr)); + if (size > TWO_UNITS) + Rep::set_large_size(addr, size); - auto chunk_range = - typename Bins::range_t{addr_to_chunk(addr), size_chunks}; - size_t bin = bitmap.add(chunk_range); + auto range = typename Bins::range_t{addr, size}; + size_t bin = bitmap.add(range); bin_trees[bin].insert_elem(addr); - if (size_chunks >= 2) + if (size >= TWO_UNITS) range_tree.insert_elem(addr); } - void unlink_block(uintptr_t addr, size_t size_chunks) + void unlink_block(uintptr_t addr, size_t size) { - auto chunk_range = - typename Bins::range_t{addr_to_chunk(addr), size_chunks}; - size_t bin = bitmap.add(chunk_range); + auto range = typename Bins::range_t{addr, size}; + size_t bin = Bins::bin_index(range); bin_trees[bin].remove_elem(addr); - if (size_chunks >= 2) + if (size >= TWO_UNITS) range_tree.remove_elem(addr); if (bin_trees[bin].is_empty()) bitmap.clear(bin); @@ -167,22 +163,29 @@ namespace snmalloc constexpr Arena() = default; /** - * Add a free block at `addr` with `size_chunks` chunks. 
The block - * is consolidated with any adjacent free neighbours. Returns - * `{0, 0}` on success. If consolidation produces a block spanning - * the entire arena (`>= 2^MAX_CHUNKS_BITS` chunks), returns - * `{consolidated_addr, consolidated_size}` and the arena is empty. + * Add a free block at `addr` with `size` bytes. The block is + * consolidated with any adjacent free neighbours. Returns + * `{0, 0}` on success. If consolidation produces a block whose + * size reaches `2^MAX_SIZE_BITS` bytes (the exclusive upper bound + * on representable block sizes), the block is not inserted; + * returns `{consolidated_addr, consolidated_size}` so the caller + * can return it to a parent range. */ - stl::Pair add_block(addr_t addr, size_t size_chunks) + stl::Pair add_block(addr_t addr, size_t size) { check_invariant(); SNMALLOC_ASSERT(addr != 0); - SNMALLOC_ASSERT((addr & (MIN_CHUNK_SIZE - 1)) == 0); - SNMALLOC_ASSERT(size_chunks > 0); - SNMALLOC_ASSERT(size_chunks < bits::one_at_bit(MAX_CHUNKS_BITS)); + // Unit alignment is required: callers feeding parent ranges (e.g. + // mmap-backed PalRange returns page-aligned but not chunk-aligned + // memory) must trim their input to UNIT_SIZE before reaching here. + // LargeArenaRange::add_range does this trim. + SNMALLOC_ASSERT((addr & (UNIT_SIZE - 1)) == 0); + SNMALLOC_ASSERT(size > 0); + SNMALLOC_ASSERT((size & (UNIT_SIZE - 1)) == 0); + SNMALLOC_ASSERT(size < bits::one_at_bit(MAX_SIZE_BITS)); uintptr_t c_addr = addr; - size_t c_size = size_chunks; + size_t c_size = size; auto merge = [&](uintptr_t n_addr, size_t n_size) { unlink_block(n_addr, n_size); @@ -196,25 +199,25 @@ namespace snmalloc // Predecessor: check range tree, then fall back to min-size bin. 
auto [pa, ps] = range_from_addr(p_key); - if (pa + ps * MIN_CHUNK_SIZE == addr && Rep::can_consolidate(addr)) + if (pa + ps == addr && Rep::can_consolidate(addr)) merge(pa, ps); else if ( - addr >= MIN_CHUNK_SIZE && Rep::can_consolidate(addr) && - contains_min(addr - MIN_CHUNK_SIZE)) - merge(addr - MIN_CHUNK_SIZE, 1); + addr >= UNIT_SIZE && Rep::can_consolidate(addr) && + contains_min(addr - UNIT_SIZE)) + merge(addr - UNIT_SIZE, UNIT_SIZE); // Successor: check range tree, then fall back to min-size bin. auto [sa, ss] = range_from_addr(s_key); - uintptr_t succ_addr = addr + size_chunks * MIN_CHUNK_SIZE; + uintptr_t succ_addr = addr + size; if (sa == succ_addr && Rep::can_consolidate(succ_addr)) merge(sa, ss); else if ( succ_addr > addr && Rep::can_consolidate(succ_addr) && contains_min(succ_addr)) - merge(succ_addr, 1); + merge(succ_addr, UNIT_SIZE); // Arena-scale overflow: consolidated block spans the full arena. - if (c_size >= bits::one_at_bit(MAX_CHUNKS_BITS)) + if (c_size >= bits::one_at_bit(MAX_SIZE_BITS)) return {c_addr, c_size}; // Insert consolidated block. @@ -225,22 +228,26 @@ namespace snmalloc } /** - * Remove a block of at least `n_chunks` chunks. Returns - * `{addr, actual_size}` on success, `{0, 0}` if nothing fits. - * Any leftover from carving is re-inserted via `add_block`. + * Remove exactly `size` bytes. Returns the address on success or + * 0 if nothing fits. SC rounding is internal: the arena may + * locate a larger free region but only the requested `size` is + * handed out — the remainder rolls into the carve remainders + * which are re-inserted via `insert_block`.
*/ - stl::Pair remove_block(size_t n_chunks) + addr_t remove_block(size_t size) { check_invariant(); - if (n_chunks == 0) - return {0, 0}; + if (size == 0) + return 0; - if (n_chunks > Bins::max_supported_chunks()) - return {0, 0}; + if (size > Bins::max_supported_size()) + return 0; + + SNMALLOC_ASSERT((size & (UNIT_SIZE - 1)) == 0); - size_t bin_id = bitmap.find_for_request(n_chunks); + size_t bin_id = bitmap.find_for_request(size); if (bin_id == SIZE_MAX) - return {0, 0}; + return 0; // remove_min returns the lowest-address entry (since compare // is k1 > k2). Read metadata after removal — remove_elem @@ -249,30 +256,29 @@ namespace snmalloc auto [_, block_size] = range_from_addr(block_addr); (void)_; - if (block_size >= 2) + if (block_size >= TWO_UNITS) range_tree.remove_elem(block_addr); if (bin_trees[bin_id].is_empty()) bitmap.clear(bin_id); - // Carve the requested chunk count from the block. - auto carved = - Bins::carve({addr_to_chunk(block_addr), block_size}, n_chunks); + // Carve the requested size from the block. + auto carved = Bins::carve({block_addr, block_size}, size); // Re-insert non-empty remainders. By the maximally-consolidated // invariant, these remainders have no adjacent free neighbours. if (carved.pre.size != 0) { - insert_block(chunk_to_addr(carved.pre.base), carved.pre.size); + insert_block(carved.pre.base, carved.pre.size); } if (carved.post.size != 0) { - insert_block(chunk_to_addr(carved.post.base), carved.post.size); + insert_block(carved.post.base, carved.post.size); } check_invariant(); - return {chunk_to_addr(carved.req.base), carved.req.size}; + return carved.req.base; } /** @@ -309,7 +315,7 @@ namespace snmalloc auto [a, s] = range_from_addr(node); if (prev_valid) { - uintptr_t prev_end = prev_addr + prev_size * MIN_CHUNK_SIZE; + uintptr_t prev_end = prev_addr + prev_size; SNMALLOC_ASSERT(prev_end != a || !Rep::can_consolidate(a)); } prev_addr = a; @@ -321,10 +327,10 @@ namespace snmalloc // 1b. 
No non-min block adjacent to a min block (unless boundary). range_tree.for_each([&](uintptr_t node) { auto [a, s] = range_from_addr(node); - if (a >= MIN_CHUNK_SIZE) + if (a >= UNIT_SIZE) SNMALLOC_ASSERT( - !contains_min(a - MIN_CHUNK_SIZE) || !Rep::can_consolidate(a)); - uintptr_t end = a + s * MIN_CHUNK_SIZE; + !contains_min(a - UNIT_SIZE) || !Rep::can_consolidate(a)); + uintptr_t end = a + s; SNMALLOC_ASSERT(!contains_min(end) || !Rep::can_consolidate(end)); }); @@ -337,7 +343,7 @@ namespace snmalloc return; if (prev_valid) SNMALLOC_ASSERT( - prev + MIN_CHUNK_SIZE != node || !Rep::can_consolidate(node)); + prev + UNIT_SIZE != node || !Rep::can_consolidate(node)); prev = node; prev_valid = true; }); @@ -352,7 +358,7 @@ namespace snmalloc { bin_trees[bin].for_each([&](uintptr_t node) { auto [a, s] = range_from_addr(node); - if (s >= 2) + if (s >= TWO_UNITS) { auto path = range_tree.get_root_path(); SNMALLOC_ASSERT(range_tree.find(path, node)); @@ -364,8 +370,8 @@ namespace snmalloc range_tree.for_each([&](uintptr_t node) { range_tree_count++; auto [a, s] = range_from_addr(node); - auto chunk_range = typename Bins::range_t{addr_to_chunk(a), s}; - size_t expected_bin = Bins::bin_index(chunk_range); + auto range = typename Bins::range_t{a, s}; + size_t expected_bin = Bins::bin_index(range); auto path = bin_trees[expected_bin].get_root_path(); SNMALLOC_ASSERT(bin_trees[expected_bin].find(path, node)); }); @@ -378,8 +384,8 @@ namespace snmalloc { bin_trees[bin].for_each([&](uintptr_t node) { auto [a, s] = range_from_addr(node); - auto chunk_range = typename Bins::range_t{addr_to_chunk(a), s}; - size_t expected_bin = Bins::bin_index(chunk_range); + auto range = typename Bins::range_t{a, s}; + size_t expected_bin = Bins::bin_index(range); SNMALLOC_ASSERT(expected_bin == bin); }); } @@ -398,9 +404,9 @@ namespace snmalloc bin_trees[bin].for_each([&](uintptr_t node) { auto v = Rep::get_variant(node); auto [a, s] = range_from_addr(node); - SNMALLOC_ASSERT(v == variant_of(s, 
addr_to_chunk(a))); + SNMALLOC_ASSERT(v == variant_of(s, a)); if (v == ArenaVariant::Large) - SNMALLOC_ASSERT(Rep::get_large_size_chunks(node) == s); + SNMALLOC_ASSERT(Rep::get_large_size(node) == s); }); } } diff --git a/src/snmalloc/backend_helpers/arenabins.h b/src/snmalloc/backend_helpers/arenabins.h index e9a76253b..fdce2b143 100644 --- a/src/snmalloc/backend_helpers/arenabins.h +++ b/src/snmalloc/backend_helpers/arenabins.h @@ -7,14 +7,14 @@ namespace snmalloc { - template + template struct ArenaBinsTestAccess; /** - * Chunk size class enumeration and bin classification used by the + * Size class enumeration and bin classification used by the * Arena. * - * Template parameter B (mantissa-bit width of snmalloc's + * Template parameter `B` (mantissa-bit width of snmalloc's * non-power-of-two size class scheme) determines the number of * RB-trees per exponent — the count of distinct servable subsets a * free block can occupy at that exponent: B=1 -> 2; B=2 -> 5; @@ -22,27 +22,41 @@ namespace snmalloc * `prototype/skip_analysis.py`. All bin-scheme metadata derives * constexpr from a single per-bin subsets table, `bin_subsets`. * + * Template parameter `MIN_SIZE_BITS` is the log2 of the allocation + * unit: every byte size handled here is a multiple of + * `UNIT_SIZE = 1 << MIN_SIZE_BITS`, and the smallest representable + * size is `UNIT_SIZE`. With `MIN_SIZE_BITS == 0` the unit is a single + * byte and the classifier degenerates to the bare bin scheme; + * larger values scale the entire size axis (and the bin tables) + * by `UNIT_SIZE`. + * * Public surface: - * - `range_t`, `carve_t`: chunk-count ranges and carve output. - * - `carve(block, n_chunks)`: split a block into pre-pad / aligned - * request / post-pad. - * - `max_supported_chunks()`: upper bound on legal request sizes. + * - `range_t`, `carve_t`: byte ranges and carve output. + * - `carve(block, n)`: split a block into pre-pad / aligned + * request / post-pad, where `n` is in bytes. 
+ * - `max_supported_size()`: upper bound on legal request sizes + * (in bytes). * - nested `Bitmap`: per-arena non-empty-bins bitmap with * `add` / `find_for_request` / `clear`. * * Everything else is private; tests reach it via - * `ArenaBinsTestAccess`. + * `ArenaBinsTestAccess`. */ - template + template class ArenaBins { static_assert( INTERMEDIATE_BITS >= 1 && INTERMEDIATE_BITS <= 3, "ArenaBins currently supports B in {1, 2, 3}"); + static_assert( + MIN_SIZE_BITS + INTERMEDIATE_BITS < bits::BITS, + "MIN_SIZE_BITS + INTERMEDIATE_BITS must leave room for at least one " + "exponent above the low regime so MAX_SC is non-trivial"); public: - /// (base, size) chunk-count range. `size == 0` means empty (base - /// is unspecified). + /// (base, size) byte range. Both fields are multiples of + /// `UNIT_SIZE = 1 << MIN_SIZE_BITS`. `size == 0` means empty + /// (base is unspecified). struct range_t { size_t base; @@ -59,10 +73,15 @@ namespace snmalloc }; private: - friend struct ArenaBinsTestAccess; + friend struct ArenaBinsTestAccess; static constexpr size_t B = INTERMEDIATE_BITS; + /// Size of the allocation unit. Every byte size handled by the + /// classifier is a multiple of this value, and the smallest + /// representable size is `UNIT_SIZE`. + static constexpr size_t UNIT_SIZE = size_t(1) << MIN_SIZE_BITS; + /// Number of mantissa positions per regular exponent (= 2^B). static constexpr size_t MANTISSAS_PER_EXP = size_t(1) << B; @@ -74,11 +93,11 @@ namespace snmalloc 0; /// Size of the per-sc info tables. One past the largest raw id from - /// `bits::to_exp_mant_const` whose decoded size fits in - /// `size_t` (the architectural max raw id decodes to `2^bits::BITS`, - /// which overflows). + /// `bits::to_exp_mant_const` whose decoded size + /// fits in `size_t` (the architectural max raw id would decode to + /// `2^bits::BITS`, which overflows). 
static constexpr size_t MAX_SC = - ((bits::BITS - B) << B) + ((size_t(1) << B) - 1); + ((bits::BITS - B - MIN_SIZE_BITS) << B) + ((size_t(1) << B) - 1); /** * Per-SC bitmap-scan record, read by `Bitmap::find_for_request`. @@ -115,14 +134,15 @@ namespace snmalloc * Per-SC carve record, read by `carve` and by `bin_offset_at`'s * `fits` predicate (free-side cascade walk via `bin_index`). * - * - `size_chunks`: size this SC promises on allocation. - * - `align_chunks`: natural alignment (a power of two, derived - * from `size_chunks`). + * - `size`: byte size this SC promises on allocation (multiple + * of `UNIT_SIZE`). + * - `align`: natural byte alignment (a power of two, derived + * from `size`). */ struct carve_info_t { - size_t size_chunks; - size_t align_chunks; + size_t size; + size_t align; }; static_assert( @@ -133,16 +153,18 @@ namespace snmalloc /** * Map a request size to its bitmap-scan record. * - * `n_chunks` must be in `[1, max_supported_chunks()]`. - * Not `constexpr`: uses `bits::clz` intrinsic via `bits::to_exp_mant` - * to stay single-cycle on the fast path. + * `n` must be in `[UNIT_SIZE, max_supported_size()]` and a + * multiple of `UNIT_SIZE`. Not `constexpr`: uses `bits::clz` + * intrinsic via `bits::to_exp_mant` to stay single-cycle on the + * fast path. */ SNMALLOC_FAST_PATH static const bitmap_info_t& - bitmap_info_for_request(size_t n_chunks) + bitmap_info_for_request(size_t n) { - SNMALLOC_ASSERT(n_chunks >= 1); - SNMALLOC_ASSERT(n_chunks <= max_supported_chunks()); - size_t raw = bits::to_exp_mant(n_chunks); + SNMALLOC_ASSERT(n >= UNIT_SIZE); + SNMALLOC_ASSERT((n & (UNIT_SIZE - 1)) == 0); + SNMALLOC_ASSERT(n <= max_supported_size()); + size_t raw = bits::to_exp_mant(n); SNMALLOC_ASSERT(raw < MAX_SC); return table_.bitmap_info[raw]; } @@ -150,37 +172,47 @@ namespace snmalloc /// Map a request size to its carve record. Preconditions and /// properties as `bitmap_info_for_request`. 
SNMALLOC_FAST_PATH static const carve_info_t& - carve_info_for_request(size_t n_chunks) + carve_info_for_request(size_t n) { - SNMALLOC_ASSERT(n_chunks >= 1); - SNMALLOC_ASSERT(n_chunks <= max_supported_chunks()); - size_t raw = bits::to_exp_mant(n_chunks); + SNMALLOC_ASSERT(n >= UNIT_SIZE); + SNMALLOC_ASSERT((n & (UNIT_SIZE - 1)) == 0); + SNMALLOC_ASSERT(n <= max_supported_size()); + size_t raw = bits::to_exp_mant(n); SNMALLOC_ASSERT(raw < MAX_SC); return table_.carve_info[raw]; } public: /** - * Bin id of `block`. Operates on arbitrary chunk counts, not just - * exact size classes. `block.size` must be >= 1. + * Bin id of `block`. Operates on arbitrary byte sizes that are + * multiples of `UNIT_SIZE`, not just exact size classes. + * `block.size` must be at least `UNIT_SIZE`. * * A bin id at exponent `e` identifies the *servable set*: the * subset of SCs at `e` that `block` could serve. Two blocks with * the same servable set at the same exponent share a bin id. * - * The natural exponent is `e = prev_pow2_bits(block.size)`. If - * alignment padding eats every SC there, we drop to `e - 1`, - * which is guaranteed to fit: its smallest SC has size and - * alignment `2^(e-1)`, so worst-case `size + pad < 2^e <= - * block.size`. One drop is always enough. + * The natural byte exponent is `prev_pow2_bits(block.size)`, + * which ranges over `[MIN_SIZE_BITS, bits::BITS)` once the + * size is a multiple of `UNIT_SIZE`. The internal exponent + * `e` is normalised by subtracting `MIN_SIZE_BITS`, so bin + * 0 always corresponds to the `UNIT_SIZE` block. + * + * If alignment padding eats every SC at the natural exponent we + * drop to `e - 1`, which is guaranteed to fit: its smallest SC + * has size and alignment `UNIT_SIZE << (e - 1)`, so worst-case + * `size + pad < UNIT_SIZE << e <= block.size`. One drop is + * always enough. * * Not `constexpr`: uses `bits::clz` via `bits::prev_pow2_bits`. 
*/ SNMALLOC_FAST_PATH static size_t bin_index(range_t block) { - SNMALLOC_ASSERT(block.size >= 1); + SNMALLOC_ASSERT(block.size >= UNIT_SIZE); + SNMALLOC_ASSERT((block.size & (UNIT_SIZE - 1)) == 0); + SNMALLOC_ASSERT((block.base & (UNIT_SIZE - 1)) == 0); - size_t e = bits::prev_pow2_bits(block.size); + size_t e = bits::prev_pow2_bits(block.size) - MIN_SIZE_BITS; size_t offset = bin_offset_at(block.base, block.size, e); if (SNMALLOC_UNLIKELY(offset == BINS_PER_EXP)) { @@ -194,19 +226,30 @@ namespace snmalloc return table_.exp_bin_base[e] + offset; } - /// Largest `n_chunks` legal for `carve` / `Bitmap::find_for_request`. - static constexpr size_t max_supported_chunks() + /// Largest byte size legal for `carve` / `Bitmap::find_for_request`. + static constexpr size_t max_supported_size() { - return bits::from_exp_mant(MAX_SC - 1); + return bits::from_exp_mant(MAX_SC - 1); } /** - * Carve a free block into pre-pad / aligned request / post-pad. + * Carve a free block into pre-pad / aligned request / post-pad, + * delivering exactly `n` bytes to the caller. + * + * The carve_info for `n` is used only to find a valid alignment + * and to verify that the block has room: `req.base` is aligned + * to `info.align` (the natural alignment of the SC that covers + * `n`), and the block must contain `info.size` bytes from that + * point. Only `n` bytes are handed out, and the leftover + * `info.size - n` bytes roll into `post`. This keeps SC rounding + * as an arena-internal detail: callers always receive exactly + * what they asked for. * * Preconditions (caller must have used `Bitmap::find_for_request` * to locate a servable bin): - * - `block.size > 0`, `n_chunks` in `[1, max_supported_chunks()]`, - * `block` large enough to fit the SC after aligning up. + * - `block.size > 0`, `n` in `[UNIT_SIZE, max_supported_size()]` + * and a multiple of `UNIT_SIZE`, `block` large enough to fit + * the SC after aligning up. * - `block.base + block.size` does not wrap. 
* * Pure: does not touch the bitmap or any tree. Either or both @@ -215,30 +258,36 @@ namespace snmalloc * `req.base + req.size == post.base` (keeps caller adjacency * checks simple). */ - SNMALLOC_FAST_PATH static carve_t carve(range_t block, size_t n_chunks) + SNMALLOC_FAST_PATH static carve_t carve(range_t block, size_t n) { - SNMALLOC_ASSERT(n_chunks >= 1); - SNMALLOC_ASSERT(n_chunks <= max_supported_chunks()); + SNMALLOC_ASSERT(n >= UNIT_SIZE); + SNMALLOC_ASSERT((n & (UNIT_SIZE - 1)) == 0); + SNMALLOC_ASSERT(n <= max_supported_size()); SNMALLOC_ASSERT(block.size > 0); + SNMALLOC_ASSERT((block.size & (UNIT_SIZE - 1)) == 0); + SNMALLOC_ASSERT((block.base & (UNIT_SIZE - 1)) == 0); // Combined with the servability precondition, non-wrapping end // ensures the alignment-up below does not wrap either. SNMALLOC_ASSERT(block.base + block.size >= block.base); - const carve_info_t& info = carve_info_for_request(n_chunks); + const carve_info_t& info = carve_info_for_request(n); size_t req_base = - (block.base + (info.align_chunks - 1)) & ~(info.align_chunks - 1); + (block.base + (info.align - 1)) & ~(info.align - 1); size_t pre_size = req_base - block.base; + // Servability precondition: `info.size >= n` bytes fit after + // `pre`. We only hand out `n`; the remainder (`info.size - n`) + // joins `post`. SNMALLOC_ASSERT(pre_size <= block.size); - SNMALLOC_ASSERT(block.size - pre_size >= info.size_chunks); + SNMALLOC_ASSERT(block.size - pre_size >= info.size); - size_t post_base = req_base + info.size_chunks; + size_t post_base = req_base + n; size_t post_size = (block.base + block.size) - post_base; carve_t result; result.pre = {block.base, pre_size}; - result.req = {req_base, info.size_chunks}; + result.req = {req_base, n}; result.post = {post_base, post_size}; return result; } @@ -251,8 +300,8 @@ namespace snmalloc * Three-method API: * - `add(range_t)`: classify a block and set its bin's bit * (idempotent on the bit; returns the bin id). 
- * - `find_for_request(n_chunks)`: smallest set bin whose blocks - * all serve `n_chunks`, or `SIZE_MAX` if none. + * - `find_for_request(n)`: smallest set bin whose blocks + * all serve `n`, or `SIZE_MAX` if none. * - `clear(bin_id)`: mark empty. Caller must ensure the bin's * tree is actually empty; the bitmap does not track contents. * @@ -261,7 +310,7 @@ namespace snmalloc */ class Bitmap { - friend struct ArenaBinsTestAccess; + friend struct ArenaBinsTestAccess; public: /// Strict upper bound on bin ids `bin_index` produces. Exposed @@ -282,8 +331,8 @@ namespace snmalloc */ SNMALLOC_FAST_PATH size_t add(range_t block) { - SNMALLOC_ASSERT(block.size >= 1); - SNMALLOC_ASSERT(block.size <= max_supported_chunks()); + SNMALLOC_ASSERT(block.size >= UNIT_SIZE); + SNMALLOC_ASSERT(block.size <= max_supported_size()); size_t bin_id = bin_index(block); SNMALLOC_ASSERT(bin_id < TOTAL_BINS); words_[bin_id / bits::BITS] |= @@ -310,17 +359,18 @@ namespace snmalloc } /** - * Smallest bin id whose set blocks all serve `n_chunks`, or - * `SIZE_MAX` if none. `n_chunks` in `[1, max_supported_chunks()]`. + * Smallest bin id whose set blocks all serve `n`, or `SIZE_MAX` + * if none. `n` in `[UNIT_SIZE, max_supported_size()]` and a + * multiple of `UNIT_SIZE`. * * Invariant (static_assert below): `BINS_PER_EXP <= bits::BITS`, * so the within-exponent range fits inside one word and the * search straddles at most one word boundary. After the second * word, every remaining word is purely higher-exponent. 
*/ - SNMALLOC_FAST_PATH size_t find_for_request(size_t n_chunks) const + SNMALLOC_FAST_PATH size_t find_for_request(size_t n) const { - const bitmap_info_t& info = bitmap_info_for_request(n_chunks); + const bitmap_info_t& info = bitmap_info_for_request(n); SNMALLOC_ASSERT(info.start_word < NUM_BITMAP_WORDS); SNMALLOC_ASSUME(info.start_word < NUM_BITMAP_WORDS); @@ -561,9 +611,10 @@ namespace snmalloc }(); /** - * Within-exponent bin offset for a block at `addr_chunks` of length - * `n_chunks` at exponent `e`. Returns `BINS_PER_EXP` (sentinel) if - * no mantissa at this exponent fits. + * Within-exponent bin offset for a block at byte address `addr` + * of byte length `n` at internal exponent `e`. Returns + * `BINS_PER_EXP` (sentinel) if no mantissa at this exponent + * fits. * * Walks `m_top` from `MANTISSAS_PER_EXP - 1` down. The first * fitting `m_top` is the largest mantissa this block can serve; @@ -580,7 +631,7 @@ namespace snmalloc * exponent and 1 at the fallback exponent. */ SNMALLOC_FAST_PATH static size_t - bin_offset_at(size_t addr_chunks, size_t n_chunks, size_t e) + bin_offset_at(size_t addr, size_t n, size_t e) { size_t first = table_.exp_first_sc[e]; size_t past = table_.exp_first_sc[e + 1]; @@ -593,13 +644,13 @@ namespace snmalloc if (first + m >= past) return false; const carve_info_t& ci = table_.carve_info[first + m]; - // Optimisation: near the bottom of n_chunks's exponent range - // the higher-mantissa sizes already exceed n_chunks and cannot - // fit regardless of alignment. Skips the align_up below. - if (n_chunks < ci.size_chunks) + // Optimisation: near the bottom of n's exponent range the + // higher-mantissa sizes already exceed n and cannot fit + // regardless of alignment. Skips the align_up below. 
+ if (n < ci.size) return false; - size_t pad = bits::align_up(addr_chunks, ci.align_chunks) - addr_chunks; - return n_chunks - ci.size_chunks >= pad; + size_t pad = bits::align_up(addr, ci.align) - addr; + return n - ci.size >= pad; }; for (size_t m_top = MANTISSAS_PER_EXP; m_top-- > 0;) @@ -656,18 +707,23 @@ namespace snmalloc // the only place that knows the size class encoding; once we've // pinned down the raw boundaries, everything else is table lookup. // + // `e` here is the internal (normalised) exponent: an SC's + // `e == 0` corresponds to byte size `UNIT_SIZE = 1 << MIN_SIZE_BITS`. + // // Note: `exp_first_sc` does NOT have a uniform stride. At the // bottom of the encoding the low regime (no leading-1 bit; the // `b = (e == 0) ? 0 : 1` branch in `to_exp_mant_const`) squashes - // multiple ArenaBins exponents into encoded-exponent 0. + // multiple internal exponents into encoded-exponent 0. // For `B = 2` the counts are 1, 2, 4, 4, 4, ... - for (size_t e = 0; e < bits::BITS; e++) + constexpr size_t MAX_E = bits::BITS - MIN_SIZE_BITS; + for (size_t e = 0; e < MAX_E; e++) { - exp_first_sc[e] = bits::to_exp_mant_const(size_t(1) << e); + exp_first_sc[e] = + bits::to_exp_mant_const(size_t(1) << (e + MIN_SIZE_BITS)); exp_bin_base[e] = e * BINS_PER_EXP; } - exp_first_sc[bits::BITS] = MAX_SC; - exp_bin_base[bits::BITS] = bits::BITS * BINS_PER_EXP; + exp_first_sc[MAX_E] = MAX_SC; + exp_bin_base[MAX_E] = MAX_E * BINS_PER_EXP; // Per-sc records. Size and alignment come straight from the // size-class scheme (via from_exp_mant); start_word, first_mask, @@ -676,14 +732,14 @@ namespace snmalloc // the search hot path is two ANDs. 
for (size_t sc = 0; sc < MAX_SC; sc++) { - size_t size = bits::from_exp_mant(sc); - size_t e = bits::prev_pow2_bits_const(size); + size_t size = bits::from_exp_mant(sc); + size_t e = bits::prev_pow2_bits_const(size) - MIN_SIZE_BITS; size_t m = sc - exp_first_sc[e]; size_t start_bit = exp_bin_base[e] + start_bin_offset_for_m(m); size_t mask = serve_mask_for_m(m); size_t shift = start_bit & (bits::BITS - 1); - carve_info[sc].size_chunks = size; - carve_info[sc].align_chunks = size & (~size + 1); + carve_info[sc].size = size; + carve_info[sc].align = size & (~size + 1); bitmap_info[sc].start_word = start_bit / bits::BITS; bitmap_info[sc].first_mask = mask << shift; // shift == 0: no within-exponent carry; the second word is diff --git a/src/snmalloc/backend_helpers/backend_helpers.h b/src/snmalloc/backend_helpers/backend_helpers.h index ee339337b..5311499df 100644 --- a/src/snmalloc/backend_helpers/backend_helpers.h +++ b/src/snmalloc/backend_helpers/backend_helpers.h @@ -9,6 +9,7 @@ #include "empty_range.h" #include "globalrange.h" #include "indirectrange.h" +#include "largearenarange.h" #include "largebuddyrange.h" #include "logrange.h" #include "noprange.h" diff --git a/src/snmalloc/backend_helpers/largearenarange.h b/src/snmalloc/backend_helpers/largearenarange.h new file mode 100644 index 000000000..c5eae63c7 --- /dev/null +++ b/src/snmalloc/backend_helpers/largearenarange.h @@ -0,0 +1,380 @@ +#pragma once + +#include "arena.h" +#include "empty_range.h" +#include "range_helpers.h" + +namespace snmalloc +{ + /** + * PagemapRep — Rep for `Arena` over a Pagemap. + * + * Each free block uses three pagemap entries at unit-aligned offsets: + * + * Unit 0 (addr): bin-tree node + variant tag. + * Unit 1 (addr + UNIT_SIZE): range-tree node (size ≥ 2 units). + * Unit 2 (addr + 2*UNIT_SIZE): large chunk count (size ≥ 3 units). + * + * Bit-layout decisions for tree nodes are private to this class: + * - Bits 0–7 of each pagemap word are reserved by the pagemap. 
+ * - Bit 8 is the red bit (both trees). + * - Bits 9–10 of Word::One at unit 0 hold the variant tag. + * - Large chunk count is stored shifted left by 8 in Word::One of + * unit 2. + * + * `MIN_SIZE_BITS` is the log2 size of the allocation unit (= pagemap + * stride); the caller passes whatever unit it uses (snmalloc's global + * `MIN_CHUNK_BITS` in the in-tree pipeline). + * `MAX_SIZE_BITS` is the log2 of the (exclusive) upper bound on block + * size in bytes; used here only to verify that the largest chunk + * count fits in a shifted pagemap word. + */ + template< + SNMALLOC_CONCEPT(IsWritablePagemap) Pagemap, + size_t MIN_SIZE_BITS, + size_t MAX_SIZE_BITS> + class PagemapRep + { + using Entry = typename Pagemap::Entry; + + static constexpr uintptr_t UNIT_SIZE = uintptr_t(1) << MIN_SIZE_BITS; + + // Bit positions inside a pagemap word. Bits 0–7 are reserved by the + // pagemap; tree-node and large-size encodings start at bit 8. + static constexpr unsigned RED_BIT_POS = 8; + static constexpr unsigned VARIANT_SHIFT = 9; + static constexpr unsigned VARIANT_BITS = 2; + + // Shift used to encode the large-size chunk count in Word::One of + // unit 2. 
+ static constexpr size_t LARGE_SIZE_SHIFT = 8; + + static constexpr uintptr_t RED_BIT = uintptr_t(1) << RED_BIT_POS; + static constexpr uintptr_t VARIANT_MASK = + ((uintptr_t(1) << VARIANT_BITS) - 1) << VARIANT_SHIFT; + static constexpr uintptr_t BIN_META_MASK = RED_BIT | VARIANT_MASK; + static constexpr uintptr_t RANGE_META_MASK = RED_BIT; + + static_assert(MAX_SIZE_BITS > MIN_SIZE_BITS); + static_assert( + (MAX_SIZE_BITS - MIN_SIZE_BITS) + LARGE_SIZE_SHIFT <= bits::BITS, + "Shifted large-size field must fit in a pagemap word."); + static_assert((RED_BIT & VARIANT_MASK) == 0); + static_assert(BIN_META_MASK < UNIT_SIZE); + static_assert( + Entry::is_backend_allowed_value(Entry::Word::One, BIN_META_MASK)); + static_assert(Entry::is_backend_allowed_value(Entry::Word::Two, RED_BIT)); + + using Word = typename Entry::Word; + using Handle = typename Entry::BackendStateWordRef; + + /** + * Pagemap word for the `UnitIdx`-th unit of the block at `addr`. + * Centralises the layout decision "which pagemap entry encodes + * data for unit i". Used by `TreeRep::ref` and by the variant / + * large-size accessors below. + */ + template + static Handle word_at(uintptr_t addr, Word w) + { + auto& entry = Pagemap::template get_metaentry_mut( + address_cast(addr + UnitIdx * UNIT_SIZE)); + return entry.get_backend_word(w); + } + + /** + * RBTree Rep shared by `BinRep` and `RangeRep`. `UnitIdx` selects + * which unit (0 or 1) of the block holds this Rep's tree node; the + * Rep's pagemap words live at `addr + UnitIdx * UNIT_SIZE`. + * `MetaMask` covers the bits in that node's words that are owned by + * this Rep (red + any tag bits) and must be preserved by get/set. 
+ */ + template + struct TreeRep + { + using Handle = PagemapRep::Handle; + using Contents = uintptr_t; + + static constexpr Contents null = 0; + static constexpr Contents root = 0; + + static Handle ref(bool direction, Contents k) + { + static const Contents null_entry = 0; + if (SNMALLOC_UNLIKELY(k == 0)) + return Handle{const_cast(&null_entry)}; + return word_at(k, direction ? Word::One : Word::Two); + } + + static Contents get(Handle h) + { + return h.get() & ~MetaMask; + } + + static void set(Handle h, Contents v) + { + h = v | (h.get() & MetaMask); + } + + static bool is_red(Contents k) + { + return (ref(true, k).get() & RED_BIT) == RED_BIT; + } + + static void set_red(Contents k, bool new_is_red) + { + if (new_is_red != is_red(k)) + { + auto h = ref(true, k); + h = h.get() ^ RED_BIT; + } + SNMALLOC_ASSERT(is_red(k) == new_is_red); + } + + static bool compare(Contents k1, Contents k2) + { + return k1 > k2; + } + + static bool equal(Contents k1, Contents k2) + { + return k1 == k2; + } + + static uintptr_t printable(Contents k) + { + return k; + } + + static uintptr_t printable(Handle h) + { + return h.printable_address(); + } + + static const char* name() + { + return Name; + } + }; + + static constexpr char BIN_REP_NAME[] = "PagemapBinRep"; + static constexpr char RANGE_REP_NAME[] = "PagemapRangeRep"; + + public: + using BinRep = TreeRep<0, BIN_META_MASK, BIN_REP_NAME>; + using RangeRep = TreeRep<1, RANGE_META_MASK, RANGE_REP_NAME>; + + static ArenaVariant get_variant(uintptr_t addr) + { + auto w = word_at<0>(addr, Word::One); + return static_cast( + (w.get() & VARIANT_MASK) >> VARIANT_SHIFT); + } + + static void set_variant(uintptr_t addr, ArenaVariant v) + { + auto w = word_at<0>(addr, Word::One); + w = (w.get() & ~VARIANT_MASK) | + (static_cast(v) << VARIANT_SHIFT); + } + + static size_t get_large_size(uintptr_t addr) + { + // Stored as chunk count to keep the shifted field within a + // pagemap word (see LARGE_SIZE_SHIFT static_assert). 
Returns + // the byte size. + return (word_at<2>(addr, Word::One).get() >> LARGE_SIZE_SHIFT) + << MIN_SIZE_BITS; + } + + static void set_large_size(uintptr_t addr, size_t size) + { + SNMALLOC_ASSERT((size & (UNIT_SIZE - 1)) == 0); + word_at<2>(addr, Word::One) = (size >> MIN_SIZE_BITS) << LARGE_SIZE_SHIFT; + } + + static bool can_consolidate(uintptr_t higher_addr) + { + auto& entry = + Pagemap::template get_metaentry_mut(address_cast(higher_addr)); + return !entry.is_boundary(); + } + }; + + /** + * Range wrapper around Arena, presenting the standard + * Range interface for use in Pipe<...> compositions. + */ + template< + size_t REFILL_SIZE_BITS, + size_t MAX_SIZE_BITS, + SNMALLOC_CONCEPT(IsWritablePagemap) Pagemap, + size_t MIN_REFILL_SIZE_BITS = 0> + class LargeArenaRange + { + static_assert( + REFILL_SIZE_BITS <= MAX_SIZE_BITS, "REFILL_SIZE_BITS > MAX_SIZE_BITS"); + static_assert( + MIN_REFILL_SIZE_BITS <= REFILL_SIZE_BITS, + "MIN_REFILL_SIZE_BITS > REFILL_SIZE_BITS"); + + static constexpr size_t REFILL_SIZE = bits::one_at_bit(REFILL_SIZE_BITS); + static constexpr size_t MIN_REFILL_SIZE = + bits::one_at_bit(MIN_REFILL_SIZE_BITS); + + public: + template> + class Type : public ContainsParent + { + using ContainsParent::parent; + + using PagemapRepT = PagemapRep; + + Arena arena; + size_t requested_total = 0; + + void parent_dealloc(uintptr_t addr, size_t size) + { + if constexpr (MAX_SIZE_BITS != (bits::BITS - 1)) + { + auto base = + capptr::Arena::unsafe_from(reinterpret_cast(addr)); + parent.dealloc_range(base, size); + } + else + { + SNMALLOC_CHECK_MSG(false, "Global range overflow should not happen"); + } + } + + void add_range(capptr::Arena base, size_t length) + { + // Parent ranges (e.g. mmap-backed PalRange) may return regions + // that are page-aligned but not chunk-aligned; trim to chunk + // boundaries on both ends before handing to the arena. 
+ uintptr_t lo = bits::align_up(base.unsafe_uintptr(), MIN_CHUNK_SIZE); + uintptr_t hi = + bits::align_down(base.unsafe_uintptr() + length, MIN_CHUNK_SIZE); + if (lo >= hi) + return; + auto [ov_addr, ov_size] = arena.add_block(lo, hi - lo); + if (ov_addr != 0) + parent_dealloc(ov_addr, ov_size); + } + + capptr::Arena refill(size_t size) + { + if (ParentRange::Aligned) + { + size_t refill_size = bits::min(REFILL_SIZE, requested_total); + refill_size = bits::max(refill_size, MIN_REFILL_SIZE); + refill_size = bits::max(refill_size, size); + refill_size = bits::next_pow2(refill_size); + + auto refill_range = parent.alloc_range(refill_size); + if (refill_range != nullptr) + { + requested_total += refill_size; + add_range(pointer_offset(refill_range, size), refill_size - size); + } + return refill_range; + } + + bool overflow = false; + size_t needed_size = bits::umul(size, 2, overflow); + if (overflow) + { + return nullptr; + } + + auto refill_size = bits::max(needed_size, REFILL_SIZE); + while (needed_size <= refill_size) + { + auto refill_range = parent.alloc_range(refill_size); + + if (refill_range != nullptr) + { + requested_total += refill_size; + add_range(refill_range, refill_size); + + SNMALLOC_ASSERT(refill_size < bits::one_at_bit(MAX_SIZE_BITS)); + static_assert( + (REFILL_SIZE < bits::one_at_bit(MAX_SIZE_BITS)) || + ParentRange::Aligned, + "Required to prevent overflow."); + + return alloc_range(size); + } + + refill_size >>= 1; + } + + return nullptr; + } + + public: + static constexpr bool Aligned = true; + static constexpr bool ConcurrencySafe = false; + using ChunkBounds = capptr::bounds::Arena; + static_assert( + stl::is_same_v); + + constexpr Type() = default; + + /** + * `size` exceeds the arena's representable range and must be + * routed to the parent (or refused if no parent exists). Matches + * `Arena::add_block`'s `size < bits::one_at_bit(MAX_SIZE_BITS)` + * precondition exactly, so alloc and dealloc bypass on the same + * boundary. 
+ */ + static constexpr bool is_too_large(size_t size) + { + return size >= bits::one_at_bit(MAX_SIZE_BITS); + } + + capptr::Arena alloc_range(size_t size) + { + SNMALLOC_ASSERT(size >= MIN_CHUNK_SIZE); + SNMALLOC_ASSERT((size & (MIN_CHUNK_SIZE - 1)) == 0); + + if (is_too_large(size)) + { + if (ParentRange::Aligned) + return parent.alloc_range(size); + + return nullptr; + } + + uintptr_t addr = arena.remove_block(size); + if (addr != 0) + { + return capptr::Arena::unsafe_from( + reinterpret_cast(addr)); + } + + return refill(size); + } + + void dealloc_range(capptr::Arena base, size_t size) + { + SNMALLOC_ASSERT(size >= MIN_CHUNK_SIZE); + SNMALLOC_ASSERT((size & (MIN_CHUNK_SIZE - 1)) == 0); + + if constexpr (MAX_SIZE_BITS != (bits::BITS - 1)) + { + if (is_too_large(size)) + { + parent_dealloc(base.unsafe_uintptr(), size); + return; + } + } + + auto [ov_addr, ov_size] = + arena.add_block(base.unsafe_uintptr(), size); + if (ov_addr != 0) + parent_dealloc(ov_addr, ov_size); + } + }; + }; +} // namespace snmalloc diff --git a/src/test/func/arena/arena.cc b/src/test/func/arena/arena.cc index 1ae4cd738..291c35377 100644 --- a/src/test/func/arena/arena.cc +++ b/src/test/func/arena/arena.cc @@ -64,7 +64,7 @@ namespace snmalloc // Each chunk-aligned address maps to a mock_entry via its chunk index. // word1/word2 hold bin-tree children; range_word1/range_word2 hold - // range-tree children. variant and large_size_chunks hold metadata. + // range-tree children. variant and large_size hold metadata. struct mock_entry { uintptr_t word1{0}; @@ -72,7 +72,7 @@ namespace snmalloc uintptr_t range_word1{0}; uintptr_t range_word2{0}; ArenaVariant variant{ArenaVariant::Min}; - size_t large_size_chunks{0}; + size_t large_size{0}; }; // Size the array for the largest test arena + trailing room. @@ -96,7 +96,7 @@ namespace snmalloc // Inner RBTree Rep used by both MockRep::BinRep and MockRep::RangeRep. // Tag selects which pair of fields in mock_entry holds the tree pointers. 
// The red bit is packed into bit 8 of the stored word (matching the - // production PagemapRep layout, but defined privately here). + // PagemapRep layout, but defined privately here). template struct MockTreeRep { @@ -106,7 +106,8 @@ namespace snmalloc static constexpr Contents null = 0; static constexpr Contents root = 0; - static constexpr uintptr_t RED_BIT = uintptr_t(1) << 8; + static constexpr unsigned RED_BIT_POS = 8; + static constexpr uintptr_t RED_BIT = uintptr_t(1) << RED_BIT_POS; static_assert(RED_BIT < MIN_CHUNK_SIZE); static Handle ref(bool direction, Contents k) @@ -186,14 +187,14 @@ namespace snmalloc mock_store[mock_index(addr)].variant = v; } - static size_t get_large_size_chunks(uintptr_t addr) + static size_t get_large_size(uintptr_t addr) { - return mock_store[mock_index(addr)].large_size_chunks; + return mock_store[mock_index(addr)].large_size; } - static void set_large_size_chunks(uintptr_t addr, size_t s) + static void set_large_size(uintptr_t addr, size_t s) { - mock_store[mock_index(addr)].large_size_chunks = s; + mock_store[mock_index(addr)].large_size = s; } static bool can_consolidate(uintptr_t) @@ -230,12 +231,19 @@ namespace snmalloc return static_cast(chunk_idx) << MIN_CHUNK_BITS; } + // Convenience: byte size from chunk count. + static constexpr size_t chunk_size(size_t n_chunks) + { + return n_chunks << MIN_CHUNK_BITS; + } + // ---- Test types ---- + // K = number of address bits the arena covers above MIN_CHUNK_BITS. // K=6 → arena of 64 chunks, K=8 → 256 chunks, K=10 → 1024 chunks. 
template - using TestArena = Arena; + using TestArena = Arena; - using Bins = ArenaBins<2>; + using Bins = ArenaBins<2, MIN_CHUNK_BITS>; // ================================================================== // (A) Accessor round-trips @@ -270,8 +278,8 @@ namespace snmalloc size_t{255}, size_t{1000}}) { - MockRep::set_large_size_chunks(a, s); - SNMALLOC_ASSERT(MockRep::get_large_size_chunks(a) == s); + MockRep::set_large_size(a, s); + SNMALLOC_ASSERT(MockRep::get_large_size(a) == s); } printf(" Large-size round-trip: OK\n"); @@ -323,28 +331,28 @@ namespace snmalloc uintptr_t a2 = chunk_addr(20); uintptr_t a3 = chunk_addr(30); - arena.add_block(a1, 3); + arena.add_block(a1, chunk_size(3)); arena.check_invariant(true); - arena.add_block(a2, 5); + arena.add_block(a2, chunk_size(5)); arena.check_invariant(true); - arena.add_block(a3, 1); + arena.add_block(a3, chunk_size(1)); arena.check_invariant(true); // Remove them. - auto r1 = arena.remove_block(1); - SNMALLOC_ASSERT(r1.first != 0); + auto r1 = arena.remove_block(chunk_size(1)); + SNMALLOC_ASSERT(r1 != 0); UNUSED(r1); arena.check_invariant(true); - auto r2 = arena.remove_block(3); - SNMALLOC_ASSERT(r2.first != 0); + auto r2 = arena.remove_block(chunk_size(3)); + SNMALLOC_ASSERT(r2 != 0); UNUSED(r2); arena.check_invariant(true); - auto r3 = arena.remove_block(5); - SNMALLOC_ASSERT(r3.first != 0); + auto r3 = arena.remove_block(chunk_size(5)); + SNMALLOC_ASSERT(r3 != 0); UNUSED(r3); arena.check_invariant(true); @@ -386,7 +394,7 @@ namespace snmalloc for (auto& b : blocks) { - auto result = arena.add_block(chunk_addr(b.chunk_idx), b.size); + auto result = arena.add_block(chunk_addr(b.chunk_idx), chunk_size(b.size)); SNMALLOC_ASSERT(result.first == 0 && result.second == 0); UNUSED(result); arena.check_invariant(true); @@ -404,24 +412,23 @@ namespace snmalloc TestArena<8> arena; // Insert 3 blocks of size 5 at non-adjacent locations. 
- arena.add_block(chunk_addr(10), 5); - arena.add_block(chunk_addr(20), 5); - arena.add_block(chunk_addr(30), 5); + arena.add_block(chunk_addr(10), chunk_size(5)); + arena.add_block(chunk_addr(20), chunk_size(5)); + arena.add_block(chunk_addr(30), chunk_size(5)); arena.check_invariant(true); // Remove 3 exact-size blocks. for (int i = 0; i < 3; i++) { - auto r = arena.remove_block(5); - SNMALLOC_ASSERT(r.first != 0); - SNMALLOC_ASSERT(r.second == 5); + auto r = arena.remove_block(chunk_size(5)); + SNMALLOC_ASSERT(r != 0); UNUSED(r); arena.check_invariant(true); } // Arena should be empty now. - auto r = arena.remove_block(1); - SNMALLOC_ASSERT(r.first == 0); + auto r = arena.remove_block(chunk_size(1)); + SNMALLOC_ASSERT(r == 0); UNUSED(r); printf(" remove_block exact: OK\n"); @@ -433,32 +440,32 @@ namespace snmalloc TestArena<8> arena; // Insert one block of size 10. - arena.add_block(chunk_addr(10), 10); + arena.add_block(chunk_addr(10), chunk_size(10)); arena.check_invariant(true); - // Request size 3 — should carve from the 10-chunk block. - auto r = arena.remove_block(3); - SNMALLOC_ASSERT(r.first != 0); - // The carved piece should be exactly what Bins::carve produces. - auto carved = Bins::carve({10, 10}, 3); - SNMALLOC_ASSERT(r.second == carved.req.size); + // Request size 3 chunks — should carve from the 10-chunk block. + auto r = arena.remove_block(chunk_size(3)); + SNMALLOC_ASSERT(r != 0); + // The carved piece's address should match what Bins::carve produces. + auto carved = Bins::carve({chunk_addr(10), chunk_size(10)}, chunk_size(3)); UNUSED(r); arena.check_invariant(true); // The remainders should still be in the arena. // We can try to remove everything that's left. 
- size_t remaining = 10 - carved.req.size; + size_t remaining = chunk_size(10) - carved.req.size; while (remaining > 0) { - auto r2 = arena.remove_block(1); - SNMALLOC_ASSERT(r2.first != 0); + auto r2 = arena.remove_block(chunk_size(1)); + SNMALLOC_ASSERT(r2 != 0); + UNUSED(r2); arena.check_invariant(true); - remaining -= r2.second; + remaining -= chunk_size(1); } // Should be empty. - auto r3 = arena.remove_block(1); - SNMALLOC_ASSERT(r3.first == 0); + auto r3 = arena.remove_block(chunk_size(1)); + SNMALLOC_ASSERT(r3 == 0); UNUSED(r3); printf(" remove_block carving: OK\n"); @@ -469,11 +476,12 @@ namespace snmalloc // ================================================================== // Helper: insert a block, verify invariant, return nothing. - template + // `size_in_chunks` is a chunk count; converted to bytes internally. + template static void - add_and_check(TestArena& arena, size_t chunk_idx, size_t size_chunks) + add_and_check(ArenaT& arena, size_t chunk_idx, size_t size_in_chunks) { - auto result = arena.add_block(chunk_addr(chunk_idx), size_chunks); + auto result = arena.add_block(chunk_addr(chunk_idx), chunk_size(size_in_chunks)); SNMALLOC_ASSERT(result.first == 0 && result.second == 0); UNUSED(result); arena.check_invariant(true); @@ -481,16 +489,16 @@ namespace snmalloc // Drain the arena by removing 1-chunk blocks until empty. // Returns the total chunks removed. 
- template - static size_t drain_arena(TestArena& arena) + template + static size_t drain_arena(ArenaT& arena) { size_t total = 0; while (true) { - auto r = arena.remove_block(1); - if (r.first == 0) + auto r = arena.remove_block(chunk_size(1)); + if (r == 0) break; - total += r.second; + total += 1; arena.check_invariant(true); } return total; @@ -632,13 +640,13 @@ namespace snmalloc TestArena<8> arena; // Odd address: chunk 11, size 2 - arena.add_block(chunk_addr(11), 2); + arena.add_block(chunk_addr(11), chunk_size(2)); SNMALLOC_ASSERT( MockRep::get_variant(chunk_addr(11)) == ArenaVariant::OddTwo); arena.check_invariant(true); // Even address: chunk 20, size 2 - arena.add_block(chunk_addr(20), 2); + arena.add_block(chunk_addr(20), chunk_size(2)); SNMALLOC_ASSERT( MockRep::get_variant(chunk_addr(20)) == ArenaVariant::EvenTwo); arena.check_invariant(true); @@ -670,17 +678,17 @@ namespace snmalloc TestArena<8> arena; // Add OddTwo block at chunk 11 (odd, size 2). - arena.add_block(chunk_addr(11), 2); + arena.add_block(chunk_addr(11), chunk_size(2)); arena.check_invariant(true); // Add a size-1 block at chunk 14, non-adjacent. - arena.add_block(chunk_addr(14), 1); + arena.add_block(chunk_addr(14), chunk_size(1)); arena.check_invariant(true); // Now add chunk 13 (size 1). Its successor check should NOT // pick up chunk 11's OddTwo entry via contains_min. It should // just insert as size 1. - arena.add_block(chunk_addr(13), 1); + arena.add_block(chunk_addr(13), chunk_size(1)); arena.check_invariant(true); // Chunk 13 should consolidate with chunk 14 (min successor), @@ -700,19 +708,18 @@ namespace snmalloc TestArena<8> arena; // Add OddTwo at chunk 11 (odd, size 2 → chunks 11-12). - arena.add_block(chunk_addr(11), 2); + arena.add_block(chunk_addr(11), chunk_size(2)); arena.check_invariant(true); // Add adjacent block at chunk 13 (size 1). // Range tree finds OddTwo at 11 as predecessor? No — chunk 13's // predecessor in range tree is chunk 11 (size 2, ends at 13). 
// So they should consolidate into size 3 at chunk 11. - arena.add_block(chunk_addr(13), 1); + arena.add_block(chunk_addr(13), chunk_size(1)); arena.check_invariant(true); - auto r = arena.remove_block(3); - SNMALLOC_ASSERT(r.first == chunk_addr(11)); - SNMALLOC_ASSERT(r.second == 3); + auto r = arena.remove_block(chunk_size(3)); + SNMALLOC_ASSERT(r == chunk_addr(11)); UNUSED(r); printf(" OddTwo consolidation (successor): OK\n"); @@ -725,17 +732,16 @@ namespace snmalloc TestArena<8> arena; // Add OddTwo at chunk 11 (odd, size 2 → chunks 11-12). - arena.add_block(chunk_addr(11), 2); + arena.add_block(chunk_addr(11), chunk_size(2)); arena.check_invariant(true); // Add block at chunk 10 (size 1). OddTwo at 11 is the successor // in the range tree → consolidate into size 3 at chunk 10. - arena.add_block(chunk_addr(10), 1); + arena.add_block(chunk_addr(10), chunk_size(1)); arena.check_invariant(true); - auto r = arena.remove_block(3); - SNMALLOC_ASSERT(r.first == chunk_addr(10)); - SNMALLOC_ASSERT(r.second == 3); + auto r = arena.remove_block(chunk_size(3)); + SNMALLOC_ASSERT(r == chunk_addr(10)); UNUSED(r); printf(" OddTwo consolidation (predecessor): OK\n"); @@ -748,24 +754,22 @@ namespace snmalloc TestArena<8> arena; // Add OddTwo at chunk 11 (odd, size 2). - arena.add_block(chunk_addr(11), 2); + arena.add_block(chunk_addr(11), chunk_size(2)); arena.check_invariant(true); // Remove 1 chunk. Should carve from the OddTwo block. - auto r = arena.remove_block(1); - SNMALLOC_ASSERT(r.first != 0); - SNMALLOC_ASSERT(r.second == 1); + auto r = arena.remove_block(chunk_size(1)); + SNMALLOC_ASSERT(r != 0); arena.check_invariant(true); // The remainder (1 chunk) should be Min variant. - auto r2 = arena.remove_block(1); - SNMALLOC_ASSERT(r2.first != 0); - SNMALLOC_ASSERT(r2.second == 1); + auto r2 = arena.remove_block(chunk_size(1)); + SNMALLOC_ASSERT(r2 != 0); UNUSED(r, r2); // Arena should be empty now. 
- auto r3 = arena.remove_block(1); - SNMALLOC_ASSERT(r3.first == 0); + auto r3 = arena.remove_block(chunk_size(1)); + SNMALLOC_ASSERT(r3 == 0); UNUSED(r3); printf(" OddTwo remove + carve: OK\n"); @@ -785,7 +789,7 @@ namespace snmalloc // Step 1: add even-indexed chunks as individual blocks (8 blocks). for (size_t i = 0; i < 16; i += 2) { - arena.add_block(chunk_addr(BASE + i), 1); + arena.add_block(chunk_addr(BASE + i), chunk_size(1)); arena.check_invariant(true); } @@ -793,7 +797,7 @@ namespace snmalloc // even-indexed neighbours. The last add completes the arena. for (size_t i = 1; i < 16; i += 2) { - arena.add_block(chunk_addr(BASE + i), 1); + arena.add_block(chunk_addr(BASE + i), chunk_size(1)); // Don't check invariant on the last add — it returns overflow. if (i < 15) { @@ -802,8 +806,8 @@ namespace snmalloc } // The last add should have triggered overflow (16 chunks = 2^4). - auto r = arena.remove_block(1); - SNMALLOC_ASSERT(r.first == 0); + auto r = arena.remove_block(chunk_size(1)); + SNMALLOC_ASSERT(r == 0); UNUSED(r); printf(" Overflow (arena-scale consolidation): OK\n"); @@ -817,17 +821,17 @@ namespace snmalloc constexpr size_t BASE = 16; - arena.add_block(chunk_addr(BASE), 8); + arena.add_block(chunk_addr(BASE), chunk_size(8)); arena.check_invariant(true); // Adding [BASE+8, BASE+16) consolidates to 16 chunks = 2^4 → overflow. - auto r = arena.add_block(chunk_addr(BASE + 8), 8); + auto r = arena.add_block(chunk_addr(BASE + 8), chunk_size(8)); SNMALLOC_ASSERT(r.first == chunk_addr(BASE)); - SNMALLOC_ASSERT(r.second == 16); + SNMALLOC_ASSERT(r.second == chunk_size(16)); UNUSED(r); - auto r2 = arena.remove_block(1); - SNMALLOC_ASSERT(r2.first == 0); + auto r2 = arena.remove_block(chunk_size(1)); + SNMALLOC_ASSERT(r2 == 0); UNUSED(r2); printf(" Overflow precise: OK\n"); @@ -897,23 +901,25 @@ namespace snmalloc // addr_chunks is oracle-relative (without base offset). 
std::pair remove(size_t n_chunks) { - if (n_chunks == 0 || n_chunks > Bins::max_supported_chunks()) + size_t n_bytes = n_chunks << MIN_CHUNK_BITS; + if (n_bytes == 0 || n_bytes > Bins::max_supported_size()) return {0, 0}; // Mirror the arena exactly: build a bitmap using arena-offset - // addresses (so bin classification matches), then find_for_request. + // byte addresses (so bin classification matches), then find_for_request. typename Bins::Bitmap bm{}; std::map::iterator>> by_bin; for (auto it = ranges.begin(); it != ranges.end(); ++it) { - // Use base-offset address for bin classification. - Bins::range_t r{base_offset + it->addr, it->size}; + typename Bins::range_t r{ + (base_offset + it->addr) << MIN_CHUNK_BITS, + it->size << MIN_CHUNK_BITS}; size_t bin = bm.add(r); by_bin[bin].push_back(it); } - size_t bin_id = bm.find_for_request(n_chunks); + size_t bin_id = bm.find_for_request(n_bytes); if (bin_id == SIZE_MAX) return {0, 0}; @@ -928,15 +934,22 @@ namespace snmalloc OracleRange block = *best_it; ranges.erase(best_it); - // Carve using base-offset address. 
- auto carved = - Bins::carve({base_offset + block.addr, block.size}, n_chunks); + auto carved = Bins::carve( + {(base_offset + block.addr) << MIN_CHUNK_BITS, + block.size << MIN_CHUNK_BITS}, + n_bytes); if (carved.pre.size != 0) - ranges.insert({carved.pre.base - base_offset, carved.pre.size}); + ranges.insert( + {(carved.pre.base >> MIN_CHUNK_BITS) - base_offset, + carved.pre.size >> MIN_CHUNK_BITS}); if (carved.post.size != 0) - ranges.insert({carved.post.base - base_offset, carved.post.size}); + ranges.insert( + {(carved.post.base >> MIN_CHUNK_BITS) - base_offset, + carved.post.size >> MIN_CHUNK_BITS}); - return {carved.req.base - base_offset, carved.req.size}; + return { + (carved.req.base >> MIN_CHUNK_BITS) - base_offset, + carved.req.size >> MIN_CHUNK_BITS}; } bool empty() const @@ -1019,7 +1032,7 @@ namespace snmalloc for (size_t j = start; j < start + size; j++) allocated[j] = false; - auto result = arena.add_block(chunk_addr(BASE + start), size); + auto result = arena.add_block(chunk_addr(BASE + start), chunk_size(size)); oracle.add(start, size); if (result.first != 0) @@ -1043,23 +1056,20 @@ namespace snmalloc max_req = 1; size_t n = (rng.next() % max_req) + 1; - auto arena_result = arena.remove_block(n); + auto arena_result = arena.remove_block(chunk_size(n)); auto oracle_result = oracle.remove(n); UNUSED(arena_result); // Both should agree on success/failure. - // Use size == 0 to detect failure, since oracle address 0 is valid. if (oracle_result.second == 0) { - SNMALLOC_ASSERT(arena_result.second == 0); + SNMALLOC_ASSERT(arena_result == 0); } else { - SNMALLOC_ASSERT(arena_result.second != 0); - // Both should return the same address and size. + SNMALLOC_ASSERT(arena_result != 0); SNMALLOC_ASSERT( - arena_result.first == chunk_addr(BASE + oracle_result.first)); - SNMALLOC_ASSERT(arena_result.second == oracle_result.second); + arena_result == chunk_addr(BASE + oracle_result.first)); // Mark as allocated. 
size_t start = oracle_result.first; @@ -1099,26 +1109,26 @@ namespace snmalloc constexpr size_t BASE = 256; // avoid address 0 // Add distinct blocks to each arena. - arena_a.add_block(chunk_addr(BASE + 10), 5); - arena_b.add_block(chunk_addr(BASE + 30), 5); + arena_a.add_block(chunk_addr(BASE + 10), chunk_size(5)); + arena_b.add_block(chunk_addr(BASE + 30), chunk_size(5)); arena_a.check_invariant(true); arena_b.check_invariant(true); // Migrate a block from A to B. - auto [a_addr, a_size] = arena_a.remove_block(3); - SNMALLOC_ASSERT(a_addr != 0 && a_size != 0); + uintptr_t a_addr = arena_a.remove_block(chunk_size(3)); + SNMALLOC_ASSERT(a_addr != 0); arena_a.check_invariant(true); - arena_b.add_block(a_addr, a_size); + arena_b.add_block(a_addr, chunk_size(3)); arena_a.check_invariant(true); arena_b.check_invariant(true); // Migrate from B back to A. - auto [b_addr, b_size] = arena_b.remove_block(2); - SNMALLOC_ASSERT(b_addr != 0 && b_size != 0); + uintptr_t b_addr = arena_b.remove_block(chunk_size(2)); + SNMALLOC_ASSERT(b_addr != 0); arena_b.check_invariant(true); - arena_a.add_block(b_addr, b_size); + arena_a.add_block(b_addr, chunk_size(2)); arena_a.check_invariant(true); arena_b.check_invariant(true); @@ -1133,27 +1143,26 @@ namespace snmalloc constexpr size_t BASE = 256; // Arena B holds two blocks with a gap: [20..24) and [28..32). - arena_b.add_block(chunk_addr(BASE + 20), 4); - arena_b.add_block(chunk_addr(BASE + 28), 4); + arena_b.add_block(chunk_addr(BASE + 20), chunk_size(4)); + arena_b.add_block(chunk_addr(BASE + 28), chunk_size(4)); arena_b.check_invariant(true); // Arena A holds the gap: [24..28). - arena_a.add_block(chunk_addr(BASE + 24), 4); + arena_a.add_block(chunk_addr(BASE + 24), chunk_size(4)); arena_a.check_invariant(true); // Migrate the gap from A to B → should consolidate into [20..32). 
- auto [addr, size] = arena_a.remove_block(4); + uintptr_t addr = arena_a.remove_block(chunk_size(4)); SNMALLOC_ASSERT(addr == chunk_addr(BASE + 24)); - SNMALLOC_ASSERT(size == 4); arena_a.check_invariant(true); - arena_b.add_block(addr, size); + arena_b.add_block(addr, chunk_size(4)); arena_b.check_invariant(true); // B should now serve a size-12 request from the consolidated block. - auto [r_addr, r_size] = arena_b.remove_block(12); + uintptr_t r_addr = arena_b.remove_block(chunk_size(12)); SNMALLOC_ASSERT(r_addr == chunk_addr(BASE + 20)); - SNMALLOC_ASSERT(r_size == 12); + UNUSED(r_addr); arena_b.check_invariant(true); printf(" Consolidation after migration: OK\n"); @@ -1226,7 +1235,7 @@ namespace snmalloc for (size_t j = start; j < start + size; j++) owner[j] = my_id; - auto result = arena.add_block(chunk_addr(BASE + start), size); + auto result = arena.add_block(chunk_addr(BASE + start), chunk_size(size)); oracle.add(start, size); if (result.first != 0) @@ -1247,19 +1256,18 @@ namespace snmalloc max_req = 1; size_t n = (rng.next() % max_req) + 1; - auto arena_r = arena.remove_block(n); + auto arena_r = arena.remove_block(chunk_size(n)); auto oracle_r = oracle.remove(n); UNUSED(arena_r); if (oracle_r.second == 0) { - SNMALLOC_ASSERT(arena_r.second == 0); + SNMALLOC_ASSERT(arena_r == 0); } else { - SNMALLOC_ASSERT(arena_r.second != 0); - SNMALLOC_ASSERT(arena_r.first == chunk_addr(BASE + oracle_r.first)); - SNMALLOC_ASSERT(arena_r.second == oracle_r.second); + SNMALLOC_ASSERT(arena_r != 0); + SNMALLOC_ASSERT(arena_r == chunk_addr(BASE + oracle_r.first)); for (size_t j = oracle_r.first; j < oracle_r.first + oracle_r.second; j++) @@ -1284,18 +1292,17 @@ namespace snmalloc UNUSED(src_id); size_t n = (rng.next() % 3) + 1; - auto src_r = src.remove_block(n); + uintptr_t src_r = src.remove_block(chunk_size(n)); auto src_or = src_oracle.remove(n); if (src_or.second == 0) { - SNMALLOC_ASSERT(src_r.second == 0); + SNMALLOC_ASSERT(src_r == 0); } else { - 
SNMALLOC_ASSERT(src_r.second != 0); - SNMALLOC_ASSERT(src_r.first == chunk_addr(BASE + src_or.first)); - SNMALLOC_ASSERT(src_r.second == src_or.second); + SNMALLOC_ASSERT(src_r != 0); + SNMALLOC_ASSERT(src_r == chunk_addr(BASE + src_or.first)); for (size_t j = src_or.first; j < src_or.first + src_or.second; j++) { @@ -1303,7 +1310,7 @@ namespace snmalloc owner[j] = dst_id; } - auto dst_r = dst.add_block(src_r.first, src_r.second); + auto dst_r = dst.add_block(src_r, chunk_size(src_or.second)); dst_oracle.add(src_or.first, src_or.second); if (dst_r.first != 0) @@ -1358,14 +1365,14 @@ namespace snmalloc MockRep::set_variant(addr, v); } - static size_t get_large_size_chunks(uintptr_t addr) + static size_t get_large_size(uintptr_t addr) { - return MockRep::get_large_size_chunks(addr); + return MockRep::get_large_size(addr); } - static void set_large_size_chunks(uintptr_t addr, size_t s) + static void set_large_size(uintptr_t addr, size_t s) { - MockRep::set_large_size_chunks(addr, s); + MockRep::set_large_size(addr, s); } static bool can_consolidate(uintptr_t higher_addr) @@ -1375,7 +1382,8 @@ namespace snmalloc }; template - using BoundaryArena = Arena; + using BoundaryArena = + Arena; // Test: predecessor merge blocked by boundary. static void test_boundary_blocks_predecessor() @@ -1391,15 +1399,16 @@ namespace snmalloc // Place a boundary at a_addr — blocks should not consolidate leftward. boundary_addrs.insert(a_addr); - arena.add_block(p_addr, 2); - arena.add_block(a_addr, 2); + arena.add_block(p_addr, chunk_size(2)); + arena.add_block(a_addr, chunk_size(2)); // P (chunks 2-3) and A (chunks 4-5) are adjacent but the boundary // at a_addr prevents merging. Both should remain separate. 
- auto [r1_addr, r1_size] = arena.remove_block(2); - SNMALLOC_ASSERT(r1_addr == p_addr && r1_size == 2); - auto [r2_addr, r2_size] = arena.remove_block(2); - SNMALLOC_ASSERT(r2_addr == a_addr && r2_size == 2); + auto r1_addr = arena.remove_block(chunk_size(2)); + SNMALLOC_ASSERT(r1_addr == p_addr); + auto r2_addr = arena.remove_block(chunk_size(2)); + SNMALLOC_ASSERT(r2_addr == a_addr); + UNUSED(r1_addr, r2_addr); printf(" Boundary blocks predecessor merge: OK\n"); } @@ -1418,15 +1427,16 @@ namespace snmalloc // Place a boundary at s_addr — blocks should not consolidate rightward. boundary_addrs.insert(s_addr); - arena.add_block(s_addr, 4); - arena.add_block(a_addr, 2); + arena.add_block(s_addr, chunk_size(4)); + arena.add_block(a_addr, chunk_size(2)); // A (chunks 2-3) and S (chunks 4-7) are adjacent but the boundary // at s_addr prevents merging. Both should remain separate. - auto [r1_addr, r1_size] = arena.remove_block(2); - SNMALLOC_ASSERT(r1_addr == a_addr && r1_size == 2); - auto [r2_addr, r2_size] = arena.remove_block(4); - SNMALLOC_ASSERT(r2_addr == s_addr && r2_size == 4); + auto r1_addr = arena.remove_block(chunk_size(2)); + SNMALLOC_ASSERT(r1_addr == a_addr); + auto r2_addr = arena.remove_block(chunk_size(4)); + SNMALLOC_ASSERT(r2_addr == s_addr); + UNUSED(r1_addr, r2_addr); printf(" Boundary blocks successor merge: OK\n"); } @@ -1444,16 +1454,17 @@ namespace snmalloc // [4,6) ↔ [6,8) merge into a 4-aligned block at chunk 4. boundary_addrs.insert(chunk_addr(8)); - arena.add_block(chunk_addr(4), 2); - arena.add_block(chunk_addr(8), 2); - arena.add_block(chunk_addr(6), 2); + arena.add_block(chunk_addr(4), chunk_size(2)); + arena.add_block(chunk_addr(8), chunk_size(2)); + arena.add_block(chunk_addr(6), chunk_size(2)); // [4,6) and [6,8) should consolidate to [4,8). // [8,10) should remain separate due to boundary. 
- auto [r1_addr, r1_size] = arena.remove_block(4); - SNMALLOC_ASSERT(r1_addr == chunk_addr(4) && r1_size == 4); - auto [r2_addr, r2_size] = arena.remove_block(2); - SNMALLOC_ASSERT(r2_addr == chunk_addr(8) && r2_size == 2); + auto r1_addr = arena.remove_block(chunk_size(4)); + SNMALLOC_ASSERT(r1_addr == chunk_addr(4)); + auto r2_addr = arena.remove_block(chunk_size(2)); + SNMALLOC_ASSERT(r2_addr == chunk_addr(8)); + UNUSED(r1_addr, r2_addr); printf(" Boundary partial (P merges, S blocked): OK\n"); } @@ -1471,16 +1482,16 @@ namespace snmalloc boundary_addrs.insert(a_addr); - arena.add_block(p_addr, 1); // min-size block - arena.add_block(a_addr, 1); // adjacent, but boundary prevents merge + arena.add_block(p_addr, chunk_size(1)); // min-size block + arena.add_block(a_addr, chunk_size(1)); // adjacent, but boundary prevents merge - auto [r1_addr, r1_size] = arena.remove_block(1); - auto [r2_addr, r2_size] = arena.remove_block(1); + auto r1_addr = arena.remove_block(chunk_size(1)); + auto r2_addr = arena.remove_block(chunk_size(1)); // Both should be separate min-size blocks. - SNMALLOC_ASSERT(r1_size == 1 && r2_size == 1); SNMALLOC_ASSERT( (r1_addr == p_addr && r2_addr == a_addr) || (r1_addr == a_addr && r2_addr == p_addr)); + UNUSED(r1_addr, r2_addr); printf(" Boundary blocks min predecessor merge: OK\n"); } diff --git a/src/test/func/arenabins/arenabins.cc b/src/test/func/arenabins/arenabins.cc index c432048b9..612eb3cd2 100644 --- a/src/test/func/arenabins/arenabins.cc +++ b/src/test/func/arenabins/arenabins.cc @@ -30,15 +30,16 @@ namespace snmalloc { /** - * Friend struct exposing private internals of `ArenaBins` - * (and its nested `Bitmap`) for unit tests. Forward-declared in - * `arenabins.h`; defined here so the production header - * carries no test-only surface. + * Friend struct exposing private internals of + * `ArenaBins` (and its nested `Bitmap`) + * for unit tests. 
Forward-declared in `arenabins.h`; + * defined here so the production header carries no test-only + * surface. */ - template + template struct ArenaBinsTestAccess { - using Bins = ArenaBins; + using Bins = ArenaBins; using Bitmap = typename Bins::Bitmap; using range_t = typename Bins::range_t; @@ -73,9 +74,9 @@ namespace snmalloc return Bins::bin_index(block); } - static constexpr size_t max_supported_chunks() + static constexpr size_t max_supported_size() { - return Bins::max_supported_chunks(); + return Bins::max_supported_size(); } // --- Raw size-class id access --- @@ -84,36 +85,37 @@ namespace snmalloc // size class. Production code never names these (the fast path // goes straight from request size to the bitmap-scan / carve // record). Tests cross-check the encoding via the helpers below; - // the alias `chunk_sc_t = size_t` preserves the existing test + // the alias `sc_t = size_t` preserves the existing test // naming. - using chunk_sc_t = size_t; + using sc_t = size_t; - /// Raw id of the smallest size class >= n_chunks. - SNMALLOC_FAST_PATH static chunk_sc_t request(size_t n) + /// Raw id of the smallest size class >= n (n in bytes, + /// multiple of UNIT_SIZE). 
+ SNMALLOC_FAST_PATH static sc_t request(size_t n) { - SNMALLOC_ASSERT(n >= 1); - SNMALLOC_ASSERT(n <= Bins::max_supported_chunks()); - return bits::to_exp_mant(n); + SNMALLOC_ASSERT(n >= (size_t(1) << MIN_SIZE_BITS)); + SNMALLOC_ASSERT(n <= Bins::max_supported_size()); + return bits::to_exp_mant(n); } - static constexpr size_t size_chunks(chunk_sc_t sc) + static constexpr size_t sc_size(sc_t sc) { - return Bins::table_.carve_info[sc].size_chunks; + return Bins::table_.carve_info[sc].size; } - static constexpr size_t align_chunks(chunk_sc_t sc) + static constexpr size_t sc_align(sc_t sc) { - return Bins::table_.carve_info[sc].align_chunks; + return Bins::table_.carve_info[sc].align; } - SNMALLOC_FAST_PATH static const bitmap_info_t& bitmap_info(chunk_sc_t sc) + SNMALLOC_FAST_PATH static const bitmap_info_t& bitmap_info(sc_t sc) { SNMALLOC_ASSERT(sc < Bins::MAX_SC); return Bins::table_.bitmap_info[sc]; } - SNMALLOC_FAST_PATH static const carve_info_t& carve_info(chunk_sc_t sc) + SNMALLOC_FAST_PATH static const carve_info_t& carve_info(sc_t sc) { SNMALLOC_ASSERT(sc < Bins::MAX_SC); return Bins::table_.carve_info[sc]; @@ -125,7 +127,7 @@ namespace snmalloc bitmap_info_for_request_const(size_t n) { return Bins::table_ - .bitmap_info[bits::to_exp_mant_const(n)]; + .bitmap_info[bits::to_exp_mant_const(n)]; } /// `carve_info_for_request`, constexpr (uses `to_exp_mant_const`). @@ -133,7 +135,7 @@ namespace snmalloc static constexpr const carve_info_t& carve_info_for_request_const(size_t n) { return Bins::table_ - .carve_info[bits::to_exp_mant_const(n)]; + .carve_info[bits::to_exp_mant_const(n)]; } // The canonical source of truth for what each within-exponent bin @@ -197,9 +199,9 @@ using snmalloc::ArenaBinsTestAccess; // to fail the build (not the runtime) if regressed. 
namespace static_checks { - using B1 = ArenaBinsTestAccess<1>; - using B2 = ArenaBinsTestAccess<2>; - using B3 = ArenaBinsTestAccess<3>; + using B1 = ArenaBinsTestAccess<1, 0>; + using B2 = ArenaBinsTestAccess<2, 0>; + using B3 = ArenaBinsTestAccess<3, 0>; static_assert(B1::BINS_PER_EXP == 2, "B=1 BINS_PER_EXP"); static_assert(B2::BINS_PER_EXP == 5, "B=2 BINS_PER_EXP"); @@ -217,15 +219,15 @@ namespace static_checks // Sizes that are powers of two have align == size. static_assert( - B2::carve_info_for_request_const(4).align_chunks == 4, "size 4 align"); + B2::carve_info_for_request_const(4).align == 4, "size 4 align"); static_assert( - B3::carve_info_for_request_const(8).align_chunks == 8, "size 8 align"); + B3::carve_info_for_request_const(8).align == 8, "size 8 align"); - // size_chunks at request(s) must be >= s. + // sc_size at request(s) must be >= s. static_assert( - B2::carve_info_for_request_const(9).size_chunks == 10, "B=2 round-up"); + B2::carve_info_for_request_const(9).size == 10, "B=2 round-up"); static_assert( - B3::carve_info_for_request_const(17).size_chunks == 18, "B=3 round-up"); + B3::carve_info_for_request_const(17).size == 18, "B=3 round-up"); } // namespace static_checks namespace @@ -242,7 +244,7 @@ namespace template constexpr bool serves(size_t bin, size_t n) { - using Bins = ArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; size_t e_b = bin / Bins::BINS_PER_EXP; size_t o_b = bin % Bins::BINS_PER_EXP; size_t raw = snmalloc::bits::to_exp_mant_const(n); @@ -274,18 +276,18 @@ namespace template void check_chunk_sc_roundtrip() { - using Bins = ArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; // Properties (together these imply request is the smallest size class // with size >= s): - // 1. size_chunks(request(s)) >= s for all s >= 1. - // 2. Idempotence: request(size_chunks(sc)) == sc. + // 1. sc_size(request(s)) >= s for all s >= 1. + // 2. Idempotence: request(sc_size(sc)) == sc. // 3. 
Monotonicity: s1 <= s2 implies request(s1) <= request(s2). auto prev_sc = Bins::request(1); for (size_t s = 1; s <= 4096; s++) { auto sc = Bins::request(s); - size_t cs = Bins::size_chunks(sc); + size_t cs = Bins::sc_size(sc); if (cs < s) { std::printf( @@ -294,7 +296,7 @@ namespace } if (Bins::request(cs) != sc) { - std::printf("B=%zu request(size_chunks(sc))!=sc for cs=%zu\n", B, cs); + std::printf("B=%zu request(sc_size(sc))!=sc for cs=%zu\n", B, cs); std::abort(); } if (sc < prev_sc) @@ -307,33 +309,33 @@ namespace } template - void check_align_chunks() + void check_sc_align() { - using Bins = ArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; for (size_t s = 1; s <= 4096; s++) { auto sc = Bins::request(s); - size_t cs = Bins::size_chunks(sc); - size_t a = Bins::align_chunks(sc); + size_t cs = Bins::sc_size(sc); + size_t a = Bins::sc_align(sc); // a must be a power of two. if (a == 0 || (a & (a - 1)) != 0) { - std::printf("B=%zu size %zu: align_chunks %zu not pow2\n", B, cs, a); + std::printf("B=%zu size %zu: sc_align %zu not pow2\n", B, cs, a); std::abort(); } // a must divide cs. if (cs % a != 0) { std::printf( - "B=%zu size %zu: align_chunks %zu does not divide size\n", B, cs, a); + "B=%zu size %zu: sc_align %zu does not divide size\n", B, cs, a); std::abort(); } // a should be the LARGEST power of two dividing cs. if ((a << 1) != 0 && cs % (a << 1) == 0) { std::printf( - "B=%zu size %zu: align_chunks %zu not the largest pow2 divisor\n", + "B=%zu size %zu: sc_align %zu not the largest pow2 divisor\n", B, cs, a); @@ -342,13 +344,13 @@ namespace } } - /// Collect all chunk_sc_t classes whose size fits in the test grid. + /// Collect all sc_t classes whose size fits in the test grid. 
template - std::vector::chunk_sc_t> + std::vector::sc_t> collect_classes(size_t max_size) { - using Bins = ArenaBinsTestAccess; - using sc_t = typename Bins::chunk_sc_t; + using Bins = ArenaBinsTestAccess; + using sc_t = typename Bins::sc_t; std::vector v; sc_t prev{}; @@ -356,7 +358,7 @@ namespace for (size_t s = 1; s <= max_size; s++) { sc_t sc = Bins::request(s); - if (Bins::size_chunks(sc) != s) + if (Bins::sc_size(sc) != s) continue; // s is not a class size if (!have_prev || sc != prev) { @@ -371,7 +373,7 @@ namespace template void check_bin_classification(size_t max_addr, size_t max_n) { - using Bins = ArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; auto classes = collect_classes(max_n); for (size_t addr = 0; addr < max_addr; addr++) @@ -382,8 +384,8 @@ namespace for (auto sc : classes) { - size_t s = Bins::size_chunks(sc); - size_t a = Bins::align_chunks(sc); + size_t s = Bins::sc_size(sc); + size_t a = Bins::sc_align(sc); bool actually = can_serve(addr, n, s, a); bool predicted = serves(bin, s); @@ -410,7 +412,7 @@ namespace template void check_bin_id_range() { - using Bins = ArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; // bin_index always returns a value in [0, BINS_PER_EXP * (e+1)) for the // block's natural exponent e. @@ -442,7 +444,7 @@ namespace template void check_info_consistency() { - using Bins = ArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; for (size_t s = 1; s <= 4096; s++) { @@ -452,16 +454,16 @@ namespace // must alias the carve_info(request(s)) record (single table // indirection, no copy). 
const auto& ci = Bins::carve_info_for_request(s); - if (ci.size_chunks != Bins::size_chunks(sc)) + if (ci.size != Bins::sc_size(sc)) { std::printf( - "B=%zu carve_info_for_request(%zu).size_chunks mismatch\n", B, s); + "B=%zu carve_info_for_request(%zu).size mismatch\n", B, s); std::abort(); } - if (ci.align_chunks != Bins::align_chunks(sc)) + if (ci.align != Bins::sc_align(sc)) { std::printf( - "B=%zu carve_info_for_request(%zu).align_chunks mismatch\n", B, s); + "B=%zu carve_info_for_request(%zu).align mismatch\n", B, s); std::abort(); } if (&ci != &Bins::carve_info(sc)) @@ -489,13 +491,13 @@ namespace } /// to_exp_mant runtime / _const equivalence across a representative - /// range of values, including edges near max_supported_chunks. The + /// range of values, including edges near max_supported_size. The /// runtime variant uses the intrinsic; we cross-check against the /// constexpr reference that's already exercised at compile time. template void check_to_exp_mant_equivalence() { - using Bins = ArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; auto check_one = [&](size_t n) { size_t r = snmalloc::bits::to_exp_mant(n); @@ -517,24 +519,24 @@ namespace size_t pow = size_t(1) << e; if (pow == 0) continue; - if (pow >= 1 && pow <= Bins::max_supported_chunks()) + if (pow >= 1 && pow <= Bins::max_supported_size()) check_one(pow); - if (pow + 1 <= Bins::max_supported_chunks()) + if (pow + 1 <= Bins::max_supported_size()) check_one(pow + 1); if (pow >= 2) check_one(pow - 1); } // The upper boundary itself. - check_one(Bins::max_supported_chunks()); - if (Bins::max_supported_chunks() > 1) - check_one(Bins::max_supported_chunks() - 1); + check_one(Bins::max_supported_size()); + if (Bins::max_supported_size() > 1) + check_one(Bins::max_supported_size() - 1); // A handful of stride values across the full range. 
- size_t step = Bins::max_supported_chunks() / 257; + size_t step = Bins::max_supported_size() / 257; if (step == 0) step = 1; - for (size_t n = 1; n <= Bins::max_supported_chunks() && n > 0; + for (size_t n = 1; n <= Bins::max_supported_size() && n > 0; n += step + 1) check_one(n); } @@ -544,9 +546,9 @@ namespace /// (defined directly in terms of `bin_subsets`). template size_t reference_find( - size_t n_chunks, const typename ArenaBinsTestAccess::Bitmap& bm) + size_t n_chunks, const typename ArenaBinsTestAccess::Bitmap& bm) { - using Bins = ArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; using Bitmap = typename Bins::Bitmap; for (size_t b = 0; b < Bitmap::TOTAL_BINS; b++) { @@ -561,7 +563,7 @@ namespace template void check_bitmap_smoke() { - using Bins = ArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; using Bitmap = typename Bins::Bitmap; Bitmap bm; if (!Bins::raw_empty(bm)) @@ -584,7 +586,7 @@ namespace std::abort(); } - /// Iterate over every `chunk_sc_t` raw id in `[0, MAX_SC)`. For each + /// Iterate over every `sc_t` raw id in `[0, MAX_SC)`. For each /// one, decode its request size, look up its `bitmap_info_t`, and /// run `body(n_chunks, bitmap_info)`. 
Multiple raw ids can share the /// same `(start_word, first_mask, second_mask)` triple; callers that @@ -592,7 +594,7 @@ namespace template void for_each_class_info(F body) { - using Bins = ArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; for (size_t raw = 0; raw < Bins::MAX_SC; raw++) { size_t s = snmalloc::bits::from_exp_mant(raw); @@ -604,7 +606,7 @@ namespace template void check_bitmap_find_empty() { - using Bins = ArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; using Bitmap = typename Bins::Bitmap; Bitmap bm; for_each_class_info([&](size_t n, const auto& /*info*/) { @@ -619,7 +621,7 @@ namespace template void check_bitmap_exhaustive_single_bit() { - using Bins = ArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; using Bitmap = typename Bins::Bitmap; // Gather a representative set of entries (one per distinct bitmap @@ -672,7 +674,7 @@ namespace template void check_bitmap_multi_bit_random() { - using Bins = ArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; using Bitmap = typename Bins::Bitmap; struct Entry @@ -742,7 +744,7 @@ namespace template void check_bitmap_word_boundary() { - using Bins = ArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; using Bitmap = typename Bins::Bitmap; auto check_predicted = @@ -858,7 +860,7 @@ namespace template void check_bitmap_bin_index_integration() { - using Bins = ArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; using Bitmap = typename Bins::Bitmap; auto classes = collect_classes(64); @@ -871,8 +873,8 @@ namespace Bins::raw_set(bm, bin); for (auto sc : classes) { - size_t s = Bins::size_chunks(sc); - size_t a = Bins::align_chunks(sc); + size_t s = Bins::sc_size(sc); + size_t a = Bins::sc_align(sc); bool actually = can_serve(addr, n, s, a); size_t got = bm.find_for_request(s); size_t want = actually ? 
bin : size_t(SIZE_MAX); @@ -903,7 +905,7 @@ namespace template void check_bitmap_add() { - using Bins = ArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; using Bitmap = typename Bins::Bitmap; using range_t = typename Bins::range_t; @@ -977,7 +979,7 @@ namespace template void check_bitmap_find_min() { - using Bins = ArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; using Bitmap = typename Bins::Bitmap; struct Entry @@ -1045,11 +1047,11 @@ namespace } /// Verify carve(): pre.base+pre.size == req.base; req.base aligned; - /// req.size == sc.size_chunks; post.base == req.end; spans equal. + /// req.size == n; post.base == req.end; spans equal. template void check_carve() { - using Bins = ArenaBinsTestAccess; + using Bins = ArenaBinsTestAccess; using range_t = typename Bins::range_t; auto classes = collect_classes(64); @@ -1059,64 +1061,76 @@ namespace { for (auto sc : classes) { - size_t s = Bins::size_chunks(sc); - size_t a = Bins::align_chunks(sc); + size_t s = Bins::sc_size(sc); + size_t a = Bins::sc_align(sc); if (!can_serve(addr, n, s, a)) continue; - auto cv = Bins::carve(range_t{addr, n}, s); - - // pre starts at the block's base. - if (cv.pre.base != addr) - { - std::printf( - "B=%zu carve pre.base != addr (addr=%zu n=%zu s=%zu)\n", - B, - addr, - n, - s); - std::abort(); - } - // pre.end == req.base. - if (cv.pre.base + cv.pre.size != cv.req.base) - { - std::printf("B=%zu carve pre.end != req.base\n", B); - std::abort(); - } - // req aligned. - if ((cv.req.base & (a - 1)) != 0) - { - std::printf( - "B=%zu carve req.base %zu not aligned to %zu\n", - B, - cv.req.base, - a); - std::abort(); - } - // req.size == sc.size_chunks. - if (cv.req.size != s) - { - std::printf( - "B=%zu carve req.size %zu != s %zu\n", B, cv.req.size, s); - std::abort(); - } - // req.end == post.base. - if (cv.req.base + cv.req.size != cv.post.base) - { - std::printf("B=%zu carve req.end != post.base\n", B); - std::abort(); - } - // post.end == block.end. 
- if (cv.post.base + cv.post.size != addr + n) + // Exercise both the trivial case (request == SC size) and + // the non-trivial case (request strictly less than SC size, + // which forces the rounding remainder into `post`). The SC + // for `r` must be `sc` itself so the alignment used by carve + // matches what `can_serve` checked. + for (size_t r = 1; r <= s; r++) { - std::printf("B=%zu carve post.end != block.end\n", B); - std::abort(); - } - // pre.size + req.size + post.size == block.size. - if (cv.pre.size + cv.req.size + cv.post.size != n) - { - std::printf("B=%zu carve sizes don't sum to n\n", B); - std::abort(); + if (Bins::sc_size(Bins::request(r)) != s) + continue; + + auto cv = Bins::carve(range_t{addr, n}, r); + + // pre starts at the block's base. + if (cv.pre.base != addr) + { + std::printf( + "B=%zu carve pre.base != addr (addr=%zu n=%zu r=%zu s=%zu)\n", + B, + addr, + n, + r, + s); + std::abort(); + } + // pre.end == req.base. + if (cv.pre.base + cv.pre.size != cv.req.base) + { + std::printf("B=%zu carve pre.end != req.base\n", B); + std::abort(); + } + // req aligned to the SC's natural alignment. + if ((cv.req.base & (a - 1)) != 0) + { + std::printf( + "B=%zu carve req.base %zu not aligned to %zu\n", + B, + cv.req.base, + a); + std::abort(); + } + // req.size == requested n_chunks (carve-exact). + if (cv.req.size != r) + { + std::printf( + "B=%zu carve req.size %zu != r %zu\n", B, cv.req.size, r); + std::abort(); + } + // req.end == post.base. + if (cv.req.base + cv.req.size != cv.post.base) + { + std::printf("B=%zu carve req.end != post.base\n", B); + std::abort(); + } + // post.end == block.end. + if (cv.post.base + cv.post.size != addr + n) + { + std::printf("B=%zu carve post.end != block.end\n", B); + std::abort(); + } + // pre.size + req.size + post.size == block.size. 
+ if (cv.pre.size + cv.req.size + cv.post.size != n) + { + std::printf("B=%zu carve sizes don't sum to n\n", B); + std::abort(); + } } } } @@ -1128,9 +1142,9 @@ namespace { std::printf("--- Running ArenaBinsTestAccess<%zu> tests ---\n", B); check_chunk_sc_roundtrip(); - std::printf(" chunk_sc_t round-trip: OK\n"); - check_align_chunks(); - std::printf(" align_chunks: OK\n"); + std::printf(" sc_t round-trip: OK\n"); + check_sc_align(); + std::printf(" sc_align: OK\n"); check_to_exp_mant_equivalence(); std::printf(" to_exp_mant runtime/_const equivalence: OK\n"); check_info_consistency(); @@ -1163,45 +1177,176 @@ namespace /// catch silent breakage of the canonical numbering. void check_known_values() { - using B2 = ArenaBinsTestAccess<2>; + using B2 = ArenaBinsTestAccess<2, 0>; // size 1 -> raw 0, size 2 -> raw 1, size 3 -> raw 2, size 4 -> raw 3, // size 5 -> raw 4, ..., size 8 -> raw 7, size 10 -> raw 8. - if (B2::size_chunks(B2::request(1)) != 1) + if (B2::sc_size(B2::request(1)) != 1) std::abort(); - if (B2::size_chunks(B2::request(8)) != 8) + if (B2::sc_size(B2::request(8)) != 8) std::abort(); - if (B2::size_chunks(B2::request(9)) != 10) + if (B2::sc_size(B2::request(9)) != 10) std::abort(); - if (B2::size_chunks(B2::request(11)) != 12) + if (B2::sc_size(B2::request(11)) != 12) std::abort(); - // align_chunks: size 4 -> 4, size 5 -> 1, size 6 -> 2, size 8 -> 8, + // sc_align: size 4 -> 4, size 5 -> 1, size 6 -> 2, size 8 -> 8, // size 10 -> 2, size 12 -> 4, size 14 -> 2. 
- if (B2::align_chunks(B2::request(4)) != 4) + if (B2::sc_align(B2::request(4)) != 4) std::abort(); - if (B2::align_chunks(B2::request(5)) != 1) + if (B2::sc_align(B2::request(5)) != 1) std::abort(); - if (B2::align_chunks(B2::request(6)) != 2) + if (B2::sc_align(B2::request(6)) != 2) std::abort(); - if (B2::align_chunks(B2::request(8)) != 8) + if (B2::sc_align(B2::request(8)) != 8) std::abort(); - if (B2::align_chunks(B2::request(10)) != 2) + if (B2::sc_align(B2::request(10)) != 2) std::abort(); // BINS_PER_EXP must be 5 for B=2. if (B2::BINS_PER_EXP != 5) std::abort(); - using B3 = ArenaBinsTestAccess<3>; + using B3 = ArenaBinsTestAccess<3, 0>; if (B3::BINS_PER_EXP != 13) std::abort(); - using B1 = ArenaBinsTestAccess<1>; + using B1 = ArenaBinsTestAccess<1, 0>; if (B1::BINS_PER_EXP != 2) std::abort(); } + + /** + * Verify that scaling the encoding by `UNIT_SIZE = 1 << MIN_SIZE_BITS` + * is a structural equivalence: every public observation about a + * `ArenaBins` instance equals the + * corresponding observation on `ArenaBins` when the + * input is scaled by `UNIT_SIZE` (and outputs, where they are sizes + * or addresses, are also scaled by `UNIT_SIZE`). + * + * This pins the new template parameter to act purely as a unit + * change, with no other semantic effect on the bin scheme. + */ + template + void check_min_size_bits_equivalence() + { + using Scaled = ArenaBinsTestAccess; + using Base = ArenaBinsTestAccess; + static_assert(MIN_SIZE_BITS > 0, "this check is for MIN_SIZE_BITS > 0"); + constexpr size_t U = size_t(1) << MIN_SIZE_BITS; + + // BINS_PER_EXP is independent of MIN_SIZE_BITS. + if (Scaled::BINS_PER_EXP != Base::BINS_PER_EXP) + std::abort(); + if (Scaled::MANTISSAS_PER_EXP != Base::MANTISSAS_PER_EXP) + std::abort(); + + // request(n*U) at MIN_SIZE_BITS==K returns the same raw id as + // request(n) at MIN_SIZE_BITS==0; sc_size(raw) at MIN_SIZE_BITS==K + // equals sc_size(raw) at MIN_SIZE_BITS==0 times U; sc_align + // likewise. 
+ size_t probe[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16, 17, 32, 65, 127, 1024}; + for (size_t n : probe) + { + // Skip values that would overflow either instance's domain. + if (n > Base::max_supported_size()) + continue; + if (n > Scaled::max_supported_size() / U) + continue; + auto sc_base = Base::request(n); + auto sc_scaled = Scaled::request(n * U); + if (sc_base != sc_scaled) + std::abort(); + if (Scaled::sc_size(sc_scaled) != Base::sc_size(sc_base) * U) + std::abort(); + if (Scaled::sc_align(sc_scaled) != Base::sc_align(sc_base) * U) + std::abort(); + } + + // bin_index({a*U, n*U}) at MIN_SIZE_BITS==K matches bin_index({a, n}) + // at MIN_SIZE_BITS==0. + using ScaledR = typename Scaled::range_t; + using BaseR = typename Base::range_t; + for (size_t n = 1; n <= 64; n++) + for (size_t a = 0; a < 32; a++) + if (Scaled::bin_index(ScaledR{a * U, n * U}) != + Base::bin_index(BaseR{a, n})) + std::abort(); + + // carve({0, blk*U}, n*U) returns the same partition as + // carve({0, blk}, n) at MIN_SIZE_BITS==0, scaled by U. + for (size_t blk = 1; blk <= 32; blk++) + for (size_t n = 1; n <= blk; n++) + { + // carve's precondition (servability) is that the SC for `n` + // fits inside `blk` after alignment. With base 0, pad is 0, + // so the condition reduces to `Base::sc_size(Base::request(n)) + // <= blk`. Skip pairs that don't satisfy it. 
+ if (Base::sc_size(Base::request(n)) > blk) + continue; + auto base_cv = Base::carve(BaseR{0, blk}, n); + auto scaled_cv = Scaled::carve(ScaledR{0, blk * U}, n * U); + if (scaled_cv.pre.base != base_cv.pre.base * U || + scaled_cv.pre.size != base_cv.pre.size * U) + std::abort(); + if (scaled_cv.req.base != base_cv.req.base * U || + scaled_cv.req.size != base_cv.req.size * U) + std::abort(); + if (scaled_cv.post.base != base_cv.post.base * U || + scaled_cv.post.size != base_cv.post.size * U) + std::abort(); + } + + // Bitmap find_for_request scales: an arena populated by add + // returns the same bin id, and `find_for_request(n*U)` agrees + // with `find_for_request(n)` at MIN_SIZE_BITS==0. + typename Scaled::Bitmap bm_scaled{}; + typename Base::Bitmap bm_base{}; + // Populate with a handful of representative ranges. + size_t pop[][2] = {{0, 4}, {16, 1}, {17, 7}, {64, 9}, {128, 64}}; + for (auto& p : pop) + { + size_t a = p[0], s = p[1]; + auto id_b = bm_base.add(BaseR{a, s}); + auto id_s = bm_scaled.add(ScaledR{a * U, s * U}); + if (id_b != id_s) + std::abort(); + } + for (size_t n = 1; n <= 32; n++) + { + auto f_b = bm_base.find_for_request(n); + auto f_s = bm_scaled.find_for_request(n * U); + if (f_b != f_s) + std::abort(); + } + } + + /// Concrete expected values at MIN_SIZE_BITS == 4 to pin the + /// interpretation: bin 0 corresponds to the unit-size block, + /// raw 0 decodes to UNIT_SIZE bytes, etc. + void check_known_values_unit_16() + { + using BU = ArenaBinsTestAccess<2, 4>; + constexpr size_t U = size_t(1) << 4; + + // size U (UNIT_SIZE) -> raw 0; size 2U -> raw 1; ... + if (BU::sc_size(BU::request(U)) != U) + std::abort(); + if (BU::sc_size(BU::request(8 * U)) != 8 * U) + std::abort(); + // size 9U requires SC for 10U at B=2 (round up). 
+ if (BU::sc_size(BU::request(9 * U)) != 10 * U) + std::abort(); + if (BU::sc_align(BU::request(4 * U)) != 4 * U) + std::abort(); + if (BU::sc_align(BU::request(8 * U)) != 8 * U) + std::abort(); + + // Bin 0 corresponds to a UNIT_SIZE block. + if (BU::bin_index({0, U}) != 0) + std::abort(); + } } // namespace int main(int, char**) @@ -1211,6 +1356,15 @@ int main(int, char**) check_known_values(); std::printf("Known concrete values: OK\n"); + check_known_values_unit_16(); + std::printf("Known concrete values at MIN_SIZE_BITS=4: OK\n"); + + check_min_size_bits_equivalence<1, 4>(); + check_min_size_bits_equivalence<2, 4>(); + check_min_size_bits_equivalence<3, 4>(); + check_min_size_bits_equivalence<2, 14>(); + std::printf("MIN_SIZE_BITS equivalence: OK\n"); + run_all<1>(); run_all<2>(); run_all<3>(); diff --git a/src/test/func/largearenarange/largearenarange.cc b/src/test/func/largearenarange/largearenarange.cc new file mode 100644 index 000000000..94a6e360a --- /dev/null +++ b/src/test/func/largearenarange/largearenarange.cc @@ -0,0 +1,316 @@ +/** + * Unit tests for LargeArenaRange and PagemapRep. + * + * Tests the Range wrapper around Arena using a real pagemap, + * exercising alloc_range, dealloc_range, refill, and overflow paths. + */ + +#include "test/setup.h" + +#include + +#ifndef SNMALLOC_TRACING +# define SNMALLOC_TRACING +#endif +#include "test/snmalloc_testlib.h" + +#include +#include + +namespace +{ + using namespace snmalloc; + + // --- Test pagemap and range types --- + + using Pal = DefaultPal; + using PagemapEntry = DefaultPagemapEntry; + using ConcretePagemap = FlatPagemap; + using TestPagemap = BasicPagemap; + + // Initialise the pagemap once before tests. + static bool pagemap_initialised = false; + + static void ensure_pagemap() + { + if (!pagemap_initialised) + { + TestPagemap::concretePagemap.template init(); + pagemap_initialised = true; + } + } + + // Simple parent: PalRange + PagemapRegisterRange. 
+ using ParentSource = Pipe, PagemapRegisterRange>; + + // LargeArenaRange under test: global range (MAX_SIZE_BITS = BITS - 1). + // This means overflow dealloc never goes to parent (matches the global + // range configuration). MIN_REFILL_BITS = MinBaseSizeBits() so + // the first parent allocation is at least the PAL's minimum reserve + // size — Windows VirtualAlloc cannot reserve below its allocation + // granularity (64 KiB) and PalRange returns nullptr in that case. + static constexpr size_t REFILL_BITS = 20; + static constexpr size_t MAX_BITS = bits::BITS - 1; + static constexpr size_t MIN_REFILL_BITS = MinBaseSizeBits(); + + using ArenaRange = Pipe< + ParentSource, + LargeArenaRange>; + + // --- Tests --- + + static void test_basic_alloc_dealloc() + { + ensure_pagemap(); + ArenaRange range{}; + + // Allocate a single chunk. + auto p1 = range.alloc_range(MIN_CHUNK_SIZE); + SNMALLOC_ASSERT(p1 != nullptr); + printf(" alloc %zu bytes at %p\n", MIN_CHUNK_SIZE, p1.unsafe_ptr()); + + // Deallocate and re-allocate — should succeed. + range.dealloc_range(p1, MIN_CHUNK_SIZE); + auto p2 = range.alloc_range(MIN_CHUNK_SIZE); + SNMALLOC_ASSERT(p2 != nullptr); + + // Clean up. + range.dealloc_range(p2, MIN_CHUNK_SIZE); + + printf(" Basic alloc/dealloc: OK\n"); + } + + static void test_multiple_sizes() + { + ensure_pagemap(); + ArenaRange range{}; + + // Allocate various power-of-two sizes. + constexpr size_t NUM_SIZES = 6; + size_t sizes[NUM_SIZES] = { + MIN_CHUNK_SIZE, + MIN_CHUNK_SIZE * 2, + MIN_CHUNK_SIZE * 4, + MIN_CHUNK_SIZE * 8, + MIN_CHUNK_SIZE * 16, + MIN_CHUNK_SIZE * 32}; + capptr::Arena ptrs[NUM_SIZES] = {}; + + for (size_t i = 0; i < NUM_SIZES; i++) + { + ptrs[i] = range.alloc_range(sizes[i]); + SNMALLOC_ASSERT(ptrs[i] != nullptr); + } + + // Deallocate all. 
+ for (size_t i = 0; i < NUM_SIZES; i++) + { + range.dealloc_range(ptrs[i], sizes[i]); + } + + printf(" Multiple sizes: OK\n"); + } + + static void test_refill() + { + ensure_pagemap(); + ArenaRange range{}; + + // Allocate more than one refill's worth of chunks. + // REFILL_SIZE is 2^20, MIN_CHUNK_SIZE is 2^14, + // so one refill is ~64 chunks. + constexpr size_t NUM_ALLOCS = 200; + capptr::Arena ptrs[NUM_ALLOCS] = {}; + + for (size_t i = 0; i < NUM_ALLOCS; i++) + { + ptrs[i] = range.alloc_range(MIN_CHUNK_SIZE); + SNMALLOC_ASSERT(ptrs[i] != nullptr); + } + + // Deallocate all. + for (size_t i = 0; i < NUM_ALLOCS; i++) + { + range.dealloc_range(ptrs[i], MIN_CHUNK_SIZE); + } + + // Re-allocate — should serve from freed blocks, no new refill needed + // for the first pass. + for (size_t i = 0; i < NUM_ALLOCS; i++) + { + ptrs[i] = range.alloc_range(MIN_CHUNK_SIZE); + SNMALLOC_ASSERT(ptrs[i] != nullptr); + } + + // Final cleanup. + for (size_t i = 0; i < NUM_ALLOCS; i++) + { + range.dealloc_range(ptrs[i], MIN_CHUNK_SIZE); + } + + printf(" Refill (200 allocs): OK\n"); + } + + static void test_alloc_dealloc_cycle() + { + ensure_pagemap(); + ArenaRange range{}; + + // Interleave alloc and dealloc to exercise consolidation. + constexpr size_t ROUNDS = 100; + for (size_t r = 0; r < ROUNDS; r++) + { + auto p = range.alloc_range(MIN_CHUNK_SIZE); + SNMALLOC_ASSERT(p != nullptr); + range.dealloc_range(p, MIN_CHUNK_SIZE); + } + + // Do a larger allocation after many cycles — verifies + // that consolidation is working (freed chunks merge back). + auto large = range.alloc_range(MIN_CHUNK_SIZE * 4); + SNMALLOC_ASSERT(large != nullptr); + range.dealloc_range(large, MIN_CHUNK_SIZE * 4); + + printf(" Alloc/dealloc cycle: OK\n"); + } + + static void test_alignment() + { + ensure_pagemap(); + ArenaRange range{}; + + // Verify that returned pointers are properly aligned. 
+ constexpr size_t NUM_TESTS = 5; + size_t sizes[NUM_TESTS] = { + MIN_CHUNK_SIZE, + MIN_CHUNK_SIZE * 2, + MIN_CHUNK_SIZE * 4, + MIN_CHUNK_SIZE * 8, + MIN_CHUNK_SIZE * 16}; + + for (size_t i = 0; i < NUM_TESTS; i++) + { + auto p = range.alloc_range(sizes[i]); + SNMALLOC_ASSERT(p != nullptr); + uintptr_t addr = p.unsafe_uintptr(); + SNMALLOC_ASSERT( + (addr & (sizes[i] - 1)) == 0 && "Allocation not properly aligned"); + UNUSED(addr); + range.dealloc_range(p, sizes[i]); + } + + printf(" Alignment: OK\n"); + } + + static void test_large_then_small() + { + ensure_pagemap(); + ArenaRange range{}; + + // Allocate a large block, dealloc, then allocate smaller blocks + // from the same space. + size_t large_size = MIN_CHUNK_SIZE * 16; + auto large = range.alloc_range(large_size); + SNMALLOC_ASSERT(large != nullptr); + range.dealloc_range(large, large_size); + + // Now allocate 16 individual chunks — should come from the freed + // large block's space. + constexpr size_t N = 16; + capptr::Arena ptrs[N] = {}; + for (size_t i = 0; i < N; i++) + { + ptrs[i] = range.alloc_range(MIN_CHUNK_SIZE); + SNMALLOC_ASSERT(ptrs[i] != nullptr); + } + + for (size_t i = 0; i < N; i++) + { + range.dealloc_range(ptrs[i], MIN_CHUNK_SIZE); + } + + printf(" Large then small: OK\n"); + } + + static void test_non_pow2_sizes() + { + ensure_pagemap(); + ArenaRange range{}; + + // Non-power-of-two, chunk-multiple sizes. Some of these are not + // representable size-classes (e.g. 9, 11, 13 chunks); the arena + // carves exactly the requested chunk count and rolls the rounding + // remainder into the post fragment, so callers see no excess. 
+ constexpr size_t NUM_SIZES = 8; + size_t sizes[NUM_SIZES] = { + MIN_CHUNK_SIZE * 3, + MIN_CHUNK_SIZE * 5, + MIN_CHUNK_SIZE * 6, + MIN_CHUNK_SIZE * 7, + MIN_CHUNK_SIZE * 9, + MIN_CHUNK_SIZE * 11, + MIN_CHUNK_SIZE * 13, + MIN_CHUNK_SIZE * 17}; + + capptr::Arena ptrs[NUM_SIZES] = {}; + for (size_t i = 0; i < NUM_SIZES; i++) + { + ptrs[i] = range.alloc_range(sizes[i]); + SNMALLOC_ASSERT(ptrs[i] != nullptr); + } + + // All pointers must be distinct and non-overlapping (within the size + // requested — over-allocation would break this because the rounding + // remainder would later be handed out a second time). + for (size_t i = 0; i < NUM_SIZES; i++) + { + uintptr_t lo_i = ptrs[i].unsafe_uintptr(); + uintptr_t hi_i = lo_i + sizes[i]; + for (size_t j = i + 1; j < NUM_SIZES; j++) + { + uintptr_t lo_j = ptrs[j].unsafe_uintptr(); + uintptr_t hi_j = lo_j + sizes[j]; + SNMALLOC_ASSERT(hi_i <= lo_j || hi_j <= lo_i); + UNUSED(hi_i, hi_j); + } + } + + for (size_t i = 0; i < NUM_SIZES; i++) + { + range.dealloc_range(ptrs[i], sizes[i]); + } + + // After deallocating all, repeat the exact same pattern to confirm + // the freed space is reusable (catches leaks from un-returned + // rounding remainder). 
+ for (size_t i = 0; i < NUM_SIZES; i++) + { + ptrs[i] = range.alloc_range(sizes[i]); + SNMALLOC_ASSERT(ptrs[i] != nullptr); + } + for (size_t i = 0; i < NUM_SIZES; i++) + { + range.dealloc_range(ptrs[i], sizes[i]); + } + + printf(" Non-pow2 sizes: OK\n"); + } +} // anonymous namespace + +int main() +{ + setup(); + + printf("--- LargeArenaRange tests ---\n"); + + test_basic_alloc_dealloc(); + test_multiple_sizes(); + test_refill(); + test_alloc_dealloc_cycle(); + test_alignment(); + test_large_then_small(); + test_non_pow2_sizes(); + + printf("All LargeArenaRange tests passed.\n"); + return 0; +} From 9fd322a55d1a9ee386f9d333545c8d39717da4bf Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Sat, 23 May 2026 10:42:52 +0100 Subject: [PATCH 08/15] Substitute LargeBuddyRange with LargeArenaRange in default range pipelines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mechanical substitution of every LargeBuddyRange instantiation in the default in-tree range pipelines: - src/snmalloc/backend/standard_range.h (GlobalR, LargeObjectRange). - src/snmalloc/backend/meta_protected_range.h (GlobalR, CentralObjectRange, CentralMetaRange, the conditional_t huge-page cache, ObjectRange, MetaRange). After this change snmalloc uses the Arena bin-tree allocator instead of the power-of-two buddy for all large-range management in the default pipelines. LargeBuddyRange and BuddyChunkRep remain in the tree, available for alternative configurations. Two issues uncovered during testing and fixed here: 1. arena.h: Arena::add_block's successor-min branch called Rep::can_consolidate(succ_addr) before contains_min(succ_addr) confirmed succ_addr is in our region. For a block added at the very top of a registered region (e.g. last 8 MiB of a 256 MiB fixed region), succ_addr = addr + size sits one chunk past the pagemap's mapped backing, and the can_consolidate probe segfaults. 
The fix reorders the checks so the tree-membership test gates the pagemap read, matching the documented pattern in buddy.h:90-93. Regression coverage: MockRep gains a per-chunk `boundary` field on `mock_entry`. `MockRep::can_consolidate(addr)` now returns `!mock_store[mock_index(addr)].boundary` — faithful to the real `PagemapRep::can_consolidate` reading `entry.is_boundary()`. The `mock_index` bounds assertion fires on any out-of-range probe, so the unsafe pattern trips in unit tests rather than only as a segfault in production. A new test_block_at_arena_top_edge adds a block whose succ_addr would address chunk MOCK_ARENA_CHUNKS; without the reorder this reproduces the original failure. This unification also subsumed the previous BoundaryMockRep and its boundary_addrs global std::set: the four boundary tests now run on Arena and set mock_store[mock_index(addr)].boundary = true instead. Net -35 lines in arena.cc. 2. arenabins.h: the BinTable constexpr constructor used throw "..." as a constexpr-eval-fails trick to surface invariant violations as compile errors. throw requires exception support, which is disabled in the main allocator (-fno-exceptions), so this broke builds. Replaced with SNMALLOC_CHECK(false && "..."), which calls a non-constexpr error path and achieves the same compile-time failure without runtime exception machinery. Full ctest suite passes (86/86, --timeout 120 -j 4). 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/snmalloc/backend/meta_protected_range.h | 12 +- src/snmalloc/backend/standard_range.h | 4 +- src/snmalloc/backend_helpers/arena.h | 9 +- src/snmalloc/backend_helpers/arenabins.h | 10 +- src/test/func/arena/arena.cc | 125 +++++++++++--------- 5 files changed, 88 insertions(+), 72 deletions(-) diff --git a/src/snmalloc/backend/meta_protected_range.h b/src/snmalloc/backend/meta_protected_range.h index 857e853d2..9b4ca756b 100644 --- a/src/snmalloc/backend/meta_protected_range.h +++ b/src/snmalloc/backend/meta_protected_range.h @@ -32,7 +32,7 @@ namespace snmalloc // Global range of memory using GlobalR = Pipe< Base, - LargeBuddyRange< + LargeArenaRange< GlobalCacheSizeBits, bits::BITS - 1, Pagemap, @@ -51,7 +51,7 @@ namespace snmalloc // would be able to corrupt meta-data. using CentralObjectRange = Pipe< GlobalR, - LargeBuddyRange, + LargeArenaRange, LogRange<3>, GlobalRange, CommitRange, @@ -67,7 +67,7 @@ namespace snmalloc GlobalR, SubRange, // Use SubRange to introduce guard // pages. - LargeBuddyRange< + LargeArenaRange< GlobalCacheSizeBits, bits::BITS - 1, Pagemap, @@ -77,7 +77,7 @@ namespace snmalloc // page, so commit in the global range. 
stl::conditional_t< (max_page_chunk_size_bits > MIN_CHUNK_BITS), - LargeBuddyRange< + LargeArenaRange< max_page_chunk_size_bits, max_page_chunk_size_bits, Pagemap, @@ -90,7 +90,7 @@ namespace snmalloc // Local caching of object range using ObjectRange = Pipe< CentralObjectRange, - LargeBuddyRange< + LargeArenaRange< LocalCacheSizeBits, LocalCacheSizeBits, Pagemap, @@ -100,7 +100,7 @@ namespace snmalloc // Local caching of meta-data range using MetaRange = Pipe< CentralMetaRange, - LargeBuddyRange< + LargeArenaRange< LocalCacheSizeBits - SubRangeRatioBits, bits::BITS - 1, Pagemap>, diff --git a/src/snmalloc/backend/standard_range.h b/src/snmalloc/backend/standard_range.h index 78609ed2d..b67a386d9 100644 --- a/src/snmalloc/backend/standard_range.h +++ b/src/snmalloc/backend/standard_range.h @@ -29,7 +29,7 @@ namespace snmalloc // Global range of memory, expose this so can be filled by init. using GlobalR = Pipe< Base, - LargeBuddyRange< + LargeArenaRange< GlobalCacheSizeBits, bits::BITS - 1, Pagemap, @@ -49,7 +49,7 @@ namespace snmalloc // Use buddy allocators to cache locally. using LargeObjectRange = Pipe< Stats, - StaticConditionalRange addr && Rep::can_consolidate(succ_addr) && - contains_min(succ_addr)) + succ_addr > addr && contains_min(succ_addr) && + Rep::can_consolidate(succ_addr)) merge(succ_addr, UNIT_SIZE); // Arena-scale overflow: consolidated block spans the full arena. diff --git a/src/snmalloc/backend_helpers/arenabins.h b/src/snmalloc/backend_helpers/arenabins.h index fdce2b143..19551481e 100644 --- a/src/snmalloc/backend_helpers/arenabins.h +++ b/src/snmalloc/backend_helpers/arenabins.h @@ -801,11 +801,13 @@ namespace snmalloc // If this fires, `bin_subsets` violates the strict-chain // invariant: candidate `b`'s subset does not properly // contain candidate `b_next`'s, so the cascade can't be - // expressed as single-mantissa probes. `throw` makes the - // constexpr evaluation non-constant and surfaces the - // violation as a compile error. 
+ // expressed as single-mantissa probes. Calling the + // non-constexpr `SNMALLOC_CHECK_MSG` makes the constexpr + // evaluation non-constant and surfaces the violation as + // a compile error. if (discrim_set == 0) - throw "bin_subsets violates strict-chain invariant"; + SNMALLOC_CHECK_MSG( + false, "bin_subsets violates strict-chain invariant"); cascade_steps[m_top][i].m_test = bits::ctz_const(discrim_set); cascade_steps[m_top][i].bin = b; } diff --git a/src/test/func/arena/arena.cc b/src/test/func/arena/arena.cc index 291c35377..6d6a18a21 100644 --- a/src/test/func/arena/arena.cc +++ b/src/test/func/arena/arena.cc @@ -64,7 +64,9 @@ namespace snmalloc // Each chunk-aligned address maps to a mock_entry via its chunk index. // word1/word2 hold bin-tree children; range_word1/range_word2 hold - // range-tree children. variant and large_size hold metadata. + // range-tree children. variant and large_size hold metadata. boundary + // mirrors the real PagemapRep's entry.is_boundary() — set it on a + // chunk to suppress consolidation across that chunk. struct mock_entry { uintptr_t word1{0}; @@ -73,6 +75,7 @@ namespace snmalloc uintptr_t range_word2{0}; ArenaVariant variant{ArenaVariant::Min}; size_t large_size{0}; + bool boundary{false}; }; // Size the array for the largest test arena + trailing room. @@ -197,9 +200,19 @@ namespace snmalloc mock_store[mock_index(addr)].large_size = s; } - static bool can_consolidate(uintptr_t) + // Mirrors PagemapRep::can_consolidate, which reads + // entry.is_boundary() from the pagemap. The boundary flag lives + // per-chunk in mock_store. An out-of-region probe returns false + // (cannot consolidate) — both because that is the right semantic + // (no neighbour exists outside the arena) and because it gives + // GCC's release-mode `-Warray-bounds` analysis a visible guard + // covering the `mock_store[...]` read on this branch.
+ static bool can_consolidate(uintptr_t addr) { - return true; + size_t idx = addr >> MIN_CHUNK_BITS; + if (idx >= MOCK_ARENA_CHUNKS) + return false; + return !mock_store[idx].boundary; } }; @@ -394,7 +407,8 @@ namespace snmalloc for (auto& b : blocks) { - auto result = arena.add_block(chunk_addr(b.chunk_idx), chunk_size(b.size)); + auto result = + arena.add_block(chunk_addr(b.chunk_idx), chunk_size(b.size)); SNMALLOC_ASSERT(result.first == 0 && result.second == 0); UNUSED(result); arena.check_invariant(true); @@ -481,7 +495,8 @@ namespace snmalloc static void add_and_check(ArenaT& arena, size_t chunk_idx, size_t size_in_chunks) { - auto result = arena.add_block(chunk_addr(chunk_idx), chunk_size(size_in_chunks)); + auto result = + arena.add_block(chunk_addr(chunk_idx), chunk_size(size_in_chunks)); SNMALLOC_ASSERT(result.first == 0 && result.second == 0); UNUSED(result); arena.check_invariant(true); @@ -1032,7 +1047,8 @@ namespace snmalloc for (size_t j = start; j < start + size; j++) allocated[j] = false; - auto result = arena.add_block(chunk_addr(BASE + start), chunk_size(size)); + auto result = + arena.add_block(chunk_addr(BASE + start), chunk_size(size)); oracle.add(start, size); if (result.first != 0) @@ -1235,7 +1251,8 @@ namespace snmalloc for (size_t j = start; j < start + size; j++) owner[j] = my_id; - auto result = arena.add_block(chunk_addr(BASE + start), chunk_size(size)); + auto result = + arena.add_block(chunk_addr(BASE + start), chunk_size(size)); oracle.add(start, size); if (result.first != 0) @@ -1346,58 +1363,23 @@ namespace snmalloc // ================================================================== // (J) Boundary consolidation prevention // ================================================================== - - // A Rep variant that blocks consolidation at specific addresses. 
- static std::set boundary_addrs; - - struct BoundaryMockRep - { - using BinRep = MockRep::BinRep; - using RangeRep = MockRep::RangeRep; - - static ArenaVariant get_variant(uintptr_t addr) - { - return MockRep::get_variant(addr); - } - - static void set_variant(uintptr_t addr, ArenaVariant v) - { - MockRep::set_variant(addr, v); - } - - static size_t get_large_size(uintptr_t addr) - { - return MockRep::get_large_size(addr); - } - - static void set_large_size(uintptr_t addr, size_t s) - { - MockRep::set_large_size(addr, s); - } - - static bool can_consolidate(uintptr_t higher_addr) - { - return boundary_addrs.find(higher_addr) == boundary_addrs.end(); - } - }; - - template - using BoundaryArena = - Arena; + // + // The boundary field on mock_entry suppresses consolidation across + // that chunk; MockRep::can_consolidate reads it. This mirrors the + // real PagemapRep::can_consolidate reading entry.is_boundary(). // Test: predecessor merge blocked by boundary. static void test_boundary_blocks_predecessor() { reset_mock_store(); - boundary_addrs.clear(); constexpr size_t K = 6; - BoundaryArena arena; + TestArena arena; uintptr_t p_addr = chunk_addr(2); uintptr_t a_addr = chunk_addr(4); // Place a boundary at a_addr — blocks should not consolidate leftward. - boundary_addrs.insert(a_addr); + mock_store[mock_index(a_addr)].boundary = true; arena.add_block(p_addr, chunk_size(2)); arena.add_block(a_addr, chunk_size(2)); @@ -1417,15 +1399,14 @@ namespace snmalloc static void test_boundary_blocks_successor() { reset_mock_store(); - boundary_addrs.clear(); constexpr size_t K = 6; - BoundaryArena arena; + TestArena arena; uintptr_t a_addr = chunk_addr(2); uintptr_t s_addr = chunk_addr(4); // Place a boundary at s_addr — blocks should not consolidate rightward. 
- boundary_addrs.insert(s_addr); + mock_store[mock_index(s_addr)].boundary = true; arena.add_block(s_addr, chunk_size(4)); arena.add_block(a_addr, chunk_size(2)); @@ -1445,14 +1426,13 @@ namespace snmalloc static void test_boundary_partial() { reset_mock_store(); - boundary_addrs.clear(); constexpr size_t K = 6; - BoundaryArena arena; + TestArena arena; // Three adjacent blocks: chunks [4,6), [6,8), [8,10). // Boundary at chunk 8 blocks [6,8) ↔ [8,10) merge but allows // [4,6) ↔ [6,8) merge into a 4-aligned block at chunk 4. - boundary_addrs.insert(chunk_addr(8)); + mock_store[mock_index(chunk_addr(8))].boundary = true; arena.add_block(chunk_addr(4), chunk_size(2)); arena.add_block(chunk_addr(8), chunk_size(2)); @@ -1469,21 +1449,49 @@ namespace snmalloc printf(" Boundary partial (P merges, S blocked): OK\n"); } + // Regression test: a block whose successor address sits one past + // the arena's pagemap must not trigger a can_consolidate probe of + // that out-of-range chunk. The fix is in Arena::add_block — + // tree-membership tests gate the can_consolidate read. MockRep's + // can_consolidate reads the per-chunk boundary flag in mock_store + // and returns false for an out-of-range chunk index, so this test + // checks that a top-edge block is added, passes check_invariant, + // and is removed again at the same address. + static void test_block_at_arena_top_edge() + { + reset_mock_store(); + constexpr size_t K = 10; + TestArena arena; + constexpr size_t ARENA_CHUNKS = size_t{1} << K; + + // Block ending at the very top of the arena (succ_addr would + // address chunk ARENA_CHUNKS, one past mock_store). + uintptr_t top_addr = chunk_addr(ARENA_CHUNKS - 4); + arena.add_block(top_addr, chunk_size(4)); + arena.check_invariant(true); + + auto r1 = arena.remove_block(chunk_size(4)); + SNMALLOC_ASSERT(r1 == top_addr); + UNUSED(r1); + + printf(" Block at arena top edge: OK\n"); + } + // Test: min-size predecessor blocked by boundary.
static void test_boundary_blocks_min_predecessor() { reset_mock_store(); - boundary_addrs.clear(); constexpr size_t K = 6; - BoundaryArena arena; + TestArena arena; uintptr_t p_addr = chunk_addr(4); uintptr_t a_addr = chunk_addr(5); - boundary_addrs.insert(a_addr); + mock_store[mock_index(a_addr)].boundary = true; arena.add_block(p_addr, chunk_size(1)); // min-size block - arena.add_block(a_addr, chunk_size(1)); // adjacent, but boundary prevents merge + arena.add_block( + a_addr, chunk_size(1)); // adjacent, but boundary prevents merge auto r1_addr = arena.remove_block(chunk_size(1)); auto r2_addr = arena.remove_block(chunk_size(1)); @@ -1555,6 +1563,7 @@ int main() snmalloc::test_boundary_blocks_predecessor(); snmalloc::test_boundary_blocks_successor(); snmalloc::test_boundary_partial(); + snmalloc::test_block_at_arena_top_edge(); snmalloc::test_boundary_blocks_min_predecessor(); printf("All Arena tests passed.\n"); From 11bc88fc72155cc8da93926f6da9aaf3545d819f Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Mon, 25 May 2026 14:43:34 +0100 Subject: [PATCH 09/15] Uniform sizeclass encoding for small and large allocations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the tagged small/large encoding and the leading-zero-count large-class indexing with a single uniform exp+mantissa scheme: value == 0 : unmapped sentinel value in [1, 1 + NUM_SMALL_SIZECLASSES) : small (sc = value - 1) value in [1 + NUM_SMALL_SIZECLASSES, 1 + NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES) : large (lc = ...) Small classes use `from_exp_mant(sc)` (unchanged). Large classes continue the same exp+mantissa namespace as `from_exp_mant(NUM_SMALL_SIZECLASSES + lc)`. The discriminator tag bit is gone — small and large share one contiguous index space — and the sentinel slot 0 lets the size-lookup fast path return 0 / 0 for unmapped pointers without a branch. 
The `SIZECLASS_REP_SIZE` / `REMOTE_BACKEND_MARKER` / `REMOTE_MIN_ALIGN` chain is re-derived from the new `SIZECLASS_BITS` (renamed from `TAG_SIZECLASS_BITS`); RED_BIT / VARIANT_SHIFT / LARGE_SIZE_SHIFT in `largearenarange.h` and RED_BIT in `largebuddyrange.h` derive from the new public `MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT` so future widenings propagate automatically. A new `MAX_LARGE_SIZECLASS_SIZE` constant gates user-supplied sizes at the API boundary (`alloc_not_small`, `round_size`, `check_size`, `rust_realloc`) — replacing the loose `> 2^63` bound. `ENCODED_ADDRESS_BITS` caps the encoding at `BITS - 1` so the constant survives 32-bit platforms where `DefaultPal::address_bits == BITS`. The `large_size_to_chunk_sizeclass` helper is removed — its `+NUM_SMALL_SIZECLASSES` / `-NUM_SMALL_SIZECLASSES` round-trip through an `lc` index cancels in the uniform scheme, so `size_to_sizeclass_full`'s large branch inlines the `to_exp_mant` directly. Front-end semantics are unchanged: `large_size_to_chunk_size` still returns `next_pow2(size)` and the front end still reserves pow2 chunk sizes. The non-pow2 large sizeclasses exist in `sizeclass_metadata` (with `slab_mask = info.align - 1`) but are unreachable from `size_to_sizeclass_full` until a follow-up commit drops the `next_pow2` rounding. Tests: - `sizeclass.cc`: sentinel sanity, raw-value adjacency, range disjoint, large monotonicity, pow2 round-trip, non-pow2 rounds up. - `rounding.cc`: extends to pow2 large sizeclasses, verifying `index_in_object` / `is_start_of_object` at representative offsets. - `cheri.cc`: large-class verification loop bound updated to `NUM_LARGE_CLASSES`. - Loop bounds in tests use `ENCODED_ADDRESS_BITS` to avoid `bits::one_at_bit(BITS)` UB on 32-bit. ctest: 86/86 passing. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/snmalloc/backend_helpers/arenabins.h | 7 +- .../backend_helpers/largearenarange.h | 34 +-- .../backend_helpers/largebuddyrange.h | 3 +- src/snmalloc/ds/sizeclasstable.h | 193 ++++++++++++------ src/snmalloc/global/globalalloc.h | 13 ++ src/snmalloc/mem/corealloc.h | 10 +- src/snmalloc/mem/metadata.h | 17 +- src/snmalloc/override/rust.cc | 4 +- src/test/func/arenabins/arenabins.cc | 47 ++--- src/test/func/cheri/cheri.cc | 2 +- src/test/func/release-rounding/rounding.cc | 42 ++++ src/test/func/sizeclass/sizeclass.cc | 113 ++++++++++ 12 files changed, 372 insertions(+), 113 deletions(-) diff --git a/src/snmalloc/backend_helpers/arenabins.h b/src/snmalloc/backend_helpers/arenabins.h index 19551481e..6a11f8982 100644 --- a/src/snmalloc/backend_helpers/arenabins.h +++ b/src/snmalloc/backend_helpers/arenabins.h @@ -272,8 +272,7 @@ namespace snmalloc const carve_info_t& info = carve_info_for_request(n); - size_t req_base = - (block.base + (info.align - 1)) & ~(info.align - 1); + size_t req_base = (block.base + (info.align - 1)) & ~(info.align - 1); size_t pre_size = req_base - block.base; // Servability precondition: `info.size >= n` bytes fit after @@ -718,8 +717,8 @@ namespace snmalloc constexpr size_t MAX_E = bits::BITS - MIN_SIZE_BITS; for (size_t e = 0; e < MAX_E; e++) { - exp_first_sc[e] = - bits::to_exp_mant_const(size_t(1) << (e + MIN_SIZE_BITS)); + exp_first_sc[e] = bits::to_exp_mant_const( + size_t(1) << (e + MIN_SIZE_BITS)); exp_bin_base[e] = e * BINS_PER_EXP; } exp_first_sc[MAX_E] = MAX_SC; diff --git a/src/snmalloc/backend_helpers/largearenarange.h b/src/snmalloc/backend_helpers/largearenarange.h index c5eae63c7..0a7de2e84 100644 --- a/src/snmalloc/backend_helpers/largearenarange.h +++ b/src/snmalloc/backend_helpers/largearenarange.h @@ -15,12 +15,15 @@ namespace snmalloc * Unit 1 (addr + UNIT_SIZE): range-tree node (size ≥ 2 units). 
* Unit 2 (addr + 2*UNIT_SIZE): large chunk count (size ≥ 3 units). * - * Bit-layout decisions for tree nodes are private to this class: - * - Bits 0–7 of each pagemap word are reserved by the pagemap. - * - Bit 8 is the red bit (both trees). - * - Bits 9–10 of Word::One at unit 0 hold the variant tag. - * - Large chunk count is stored shifted left by 8 in Word::One of - * unit 2. + * Bit-layout decisions for tree nodes are private to this class. The + * pagemap reserves the low bits of each word for the meta-entry (see + * `MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT`); the red bit, variant + * tag, and shifted large-chunk count all live at or above that bit: + * - Red bit (both trees) at `BACKEND_LAYOUT_FIRST_FREE_BIT`. + * - Variant tag (Word::One at unit 0) occupies 2 bits starting at + * `BACKEND_LAYOUT_FIRST_FREE_BIT + 1`. + * - Large chunk count is stored in Word::One of unit 2 left-shifted by + * `BACKEND_LAYOUT_FIRST_FREE_BIT`. * * `MIN_SIZE_BITS` is the log2 size of the allocation unit (= pagemap * stride); the caller passes whatever unit it uses (snmalloc's global @@ -39,15 +42,21 @@ namespace snmalloc static constexpr uintptr_t UNIT_SIZE = uintptr_t(1) << MIN_SIZE_BITS; - // Bit positions inside a pagemap word. Bits 0–7 are reserved by the - // pagemap; tree-node and large-size encodings start at bit 8. - static constexpr unsigned RED_BIT_POS = 8; - static constexpr unsigned VARIANT_SHIFT = 9; + // Bit positions inside a pagemap word. Bits in the reserved region + // (sizeclass + REMOTE_BACKEND_MARKER) are owned by the meta-entry + // layout; tree-node and large-size encodings start at the first free + // bit above that reserved range — see + // `MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT` in `mem/metadata.h`. 
+ static constexpr unsigned RED_BIT_POS = + MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT; + static constexpr unsigned VARIANT_SHIFT = + MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT + 1; static constexpr unsigned VARIANT_BITS = 2; // Shift used to encode the large-size chunk count in Word::One of // unit 2. - static constexpr size_t LARGE_SIZE_SHIFT = 8; + static constexpr size_t LARGE_SIZE_SHIFT = + MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT; static constexpr uintptr_t RED_BIT = uintptr_t(1) << RED_BIT_POS; static constexpr uintptr_t VARIANT_MASK = @@ -370,8 +379,7 @@ namespace snmalloc } } - auto [ov_addr, ov_size] = - arena.add_block(base.unsafe_uintptr(), size); + auto [ov_addr, ov_size] = arena.add_block(base.unsafe_uintptr(), size); if (ov_addr != 0) parent_dealloc(ov_addr, ov_size); } diff --git a/src/snmalloc/backend_helpers/largebuddyrange.h b/src/snmalloc/backend_helpers/largebuddyrange.h index 15324753f..bf217bc06 100644 --- a/src/snmalloc/backend_helpers/largebuddyrange.h +++ b/src/snmalloc/backend_helpers/largebuddyrange.h @@ -37,7 +37,8 @@ namespace snmalloc * a bit that is a valid part of the address of a chunk. * @{ */ - static constexpr address_t RED_BIT = 1 << 8; + static constexpr address_t RED_BIT = address_t(1) + << MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT; static_assert(RED_BIT < MIN_CHUNK_SIZE); static_assert(MetaEntryBase::is_backend_allowed_value( diff --git a/src/snmalloc/ds/sizeclasstable.h b/src/snmalloc/ds/sizeclasstable.h index 80b6c9211..3cd7509da 100644 --- a/src/snmalloc/ds/sizeclasstable.h +++ b/src/snmalloc/ds/sizeclasstable.h @@ -17,31 +17,72 @@ namespace snmalloc { using chunksizeclass_t = size_t; - // Large classes range from [MAX_SMALL_SIZECLASS_SIZE, ADDRESS_SPACE). + // Cap the address bits the encoding tries to represent so that + // `MAX_LARGE_SIZECLASS_SIZE` (= 2 ^ ENCODED_ADDRESS_BITS) always fits in + // `size_t`. 
On 64-bit platforms `DefaultPal::address_bits` is already 48, + // but on 32-bit platforms it equals `bits::BITS` and would otherwise + // overflow the encoded maximum to 0. + constexpr size_t ENCODED_ADDRESS_BITS = + bits::min(DefaultPal::address_bits, bits::BITS - 1); + + // Number of large sizeclasses. Large classes follow on directly from small + // classes in the global exp+mantissa scheme used by + // `bits::from_exp_mant`. The total + // span of representable sizes is from MIN_ALLOC_SIZE up to and including + // 2^ENCODED_ADDRESS_BITS, so the count of large entries beyond the small + // range is (ENCODED_ADDRESS_BITS - MAX_SMALL_SIZECLASS_BITS) mantissa + // cycles, each with 2^INTERMEDIATE_BITS entries. constexpr size_t NUM_LARGE_CLASSES = - DefaultPal::address_bits - MAX_SMALL_SIZECLASS_BITS; + (ENCODED_ADDRESS_BITS - MAX_SMALL_SIZECLASS_BITS) << INTERMEDIATE_BITS; + + // Bits required to encode any sizeclass value. Slot 0 is reserved as the + // unmapped/default sentinel, so the count includes a leading +1. + constexpr size_t SIZECLASS_BITS = + bits::next_pow2_bits_const(1 + NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES); + + // Size of the sizeclass-keyed lookup tables and the alignment that the + // REMOTE_BACKEND_MARKER constraint requires of RemoteAllocator. There is no + // separate tag bit: all valid sizeclass raw values are in + // [0, 1 + NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES) and live in the low + // SIZECLASS_BITS bits of a pagemap word. + constexpr size_t SIZECLASS_REP_SIZE = bits::one_at_bit(SIZECLASS_BITS); + + // Largest allocation size representable by the uniform sizeclass encoding. + // Equals `from_exp_mant(NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES - 1)`, + // which for the default config is `2 ^ ENCODED_ADDRESS_BITS`. Requests + // strictly larger than this cannot be encoded and must be failed before + // any call to `size_to_sizeclass_full`. 
+ constexpr size_t MAX_LARGE_SIZECLASS_SIZE = + bits::from_exp_mant( + NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES - 1); - // How many bits are required to represent either a large or a small - // sizeclass. - constexpr size_t TAG_SIZECLASS_BITS = bits::max( - bits::next_pow2_bits_const(NUM_SMALL_SIZECLASSES), - bits::next_pow2_bits_const(NUM_LARGE_CLASSES + 1)); - - // Number of bits required to represent a tagged sizeclass that can be - // either small or large. - constexpr size_t SIZECLASS_REP_SIZE = - bits::one_at_bit(TAG_SIZECLASS_BITS + 1); + static_assert( + MAX_LARGE_SIZECLASS_SIZE == bits::one_at_bit(ENCODED_ADDRESS_BITS), + "MAX_LARGE_SIZECLASS_SIZE must equal 2 ^ ENCODED_ADDRESS_BITS; if this " + "fails, the exp+mantissa math no longer matches NUM_LARGE_CLASSES."); + static_assert( + ENCODED_ADDRESS_BITS > MAX_SMALL_SIZECLASS_BITS, + "ENCODED_ADDRESS_BITS must exceed MAX_SMALL_SIZECLASS_BITS so the large " + "range is non-empty."); /** - * Encapsulates a tagged union of large and small sizeclasses. + * Represents a sizeclass identifier shared by small and large allocations + * using a single uniform encoding: + * + * value == 0 : unmapped / default sentinel + * value ∈ [1, 1 + NUM_SMALL_SIZECLASSES) : small sizeclass sc = value - 1 + * value ∈ [1 + NUM_SMALL_SIZECLASSES, + * 1 + NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES) + * : large class lc = + * value - 1 - + * NUM_SMALL_SIZECLASSES * - * Used in various lookup tables to make efficient code that handles - * all objects allocated by snmalloc. + * Used directly as an index into `sizeclass_metadata`. Slot 0 of that table + * is zero-padded so the sentinel can flow through the fast-path table + * lookups without a subtract on the hot path. 
*/ class sizeclass_t { - static constexpr size_t TAG = bits::one_at_bit(TAG_SIZECLASS_BITS); - size_t value{0}; constexpr sizeclass_t(size_t value) : value(value) {} @@ -51,20 +92,19 @@ namespace snmalloc static constexpr sizeclass_t from_small_class(smallsizeclass_t sc) { - SNMALLOC_ASSERT(sc < TAG); - // Note could use `+` or `|`. Using `+` as will combine nicely with array - // offset. - return {TAG + sc}; + SNMALLOC_ASSERT(sc < NUM_SMALL_SIZECLASSES); + return {sc + 1}; } /** - * Takes the number of leading zero bits from the actual large size-1. - * See size_to_sizeclass_full + * Construct from a large class index `lc` in [0, NUM_LARGE_CLASSES). + * Large classes are stored as a contiguous run immediately after the + * small range and the sentinel slot. */ static constexpr sizeclass_t from_large_class(size_t large_class) { - SNMALLOC_ASSERT(large_class < TAG); - return {large_class}; + SNMALLOC_ASSERT(large_class < NUM_LARGE_CLASSES); + return {1 + NUM_SMALL_SIZECLASSES + large_class}; } static constexpr sizeclass_t from_raw(size_t raw) @@ -72,21 +112,16 @@ namespace snmalloc return {raw}; } - constexpr size_t index() - { - return value & (TAG - 1); - } - constexpr smallsizeclass_t as_small() { SNMALLOC_ASSERT(is_small()); - return smallsizeclass_t(value & (TAG - 1)); + return smallsizeclass_t(value - 1); } constexpr chunksizeclass_t as_large() { - SNMALLOC_ASSERT(!is_small()); - return bits::BITS - (value & (TAG - 1)); + SNMALLOC_ASSERT(!is_small() && !is_default()); + return value - 1 - NUM_SMALL_SIZECLASSES; } constexpr size_t raw() @@ -96,7 +131,9 @@ namespace snmalloc constexpr bool is_small() { - return (value & TAG) != 0; + // Sentinel (value == 0) underflows to a large positive value, which + // also fails the comparison — the sentinel is therefore not small. 
+ return (value - 1) < NUM_SMALL_SIZECLASSES; } constexpr bool is_default() @@ -108,6 +145,11 @@ namespace snmalloc { return value == other.value; } + + constexpr bool operator!=(sizeclass_t other) + { + return value != other.value; + } }; using sizeclass_compress_t = uint8_t; @@ -179,6 +221,17 @@ namespace snmalloc constexpr SizeClassTable() { + // Sentinel slot (sizeclass_t{} / raw 0) covers any address whose + // pagemap entry is unmapped or owned by the backend — including + // foreign (non-snmalloc) heap addresses reached via the + // bounds-checked memcpy shim before snmalloc has seen them. + // `slab_mask = ~size_t(0)` makes `start_of_object` collapse + // `addr & ~slab_mask` to 0 and `index_in_object` to `addr`, so + // `remaining_bytes = sentinel.size - addr` underflows to a very + // large value and any memcpy bound check trivially passes the + // sentinel through to the destination's native checks. + start_[0].slab_mask = ~size_t(0); + size_t max_capacity = 0; for (smallsizeclass_t sizeclass(0); sizeclass < NUM_SMALL_SIZECLASSES; @@ -223,12 +276,23 @@ namespace snmalloc meta.mod_zero_mult = (~zero / meta.size) + 1; } - for (size_t sizeclass = 0; sizeclass < bits::BITS; sizeclass++) + for (size_t lc = 0; lc < NUM_LARGE_CLASSES; lc++) { - auto lsc = sizeclass_t::from_large_class(sizeclass); + auto lsc = sizeclass_t::from_large_class(lc); auto& meta = fast(lsc); - meta.size = sizeclass == 0 ? 0 : bits::one_at_bit(lsc.as_large()); - meta.slab_mask = meta.size - 1; + // Continuous global exp+mantissa scheme: small classes occupy + // global indices [0, NUM_SMALL_SIZECLASSES); large classes occupy + // [NUM_SMALL_SIZECLASSES, NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES). + size_t size = + bits::from_exp_mant( + NUM_SMALL_SIZECLASSES + lc); + meta.size = size; + // Natural alignment of the size: the largest power of two that + // divides `size`. 
For pow2 sizes, this equals `size`; for non-pow2 + // mantissa steps it is the slab granularity at which the allocation + // tiles. `slab_mask = align - 1`. + size_t align = size & (~size + 1); + meta.slab_mask = align - 1; // The slab_mask will do all the necessary work, so // perform identity multiplication for the test. meta.mod_zero_mult = 1; @@ -241,6 +305,16 @@ namespace snmalloc constexpr SizeClassTable sizeclass_metadata = SizeClassTable(); + // Slot 0 of `sizeclass_metadata` is the unmapped sentinel; it must remain + // zero-initialised so fast-path lookups via `fast(sc)` return zero size + // and slab_mask without needing a sentinel check before indexing. + static_assert( + sizeclass_metadata.fast(sizeclass_t{}).size == 0, + "sentinel slot must have size 0"); + static_assert( + sizeclass_metadata.fast(sizeclass_t{}).slab_mask == 0, + "sentinel slot must have slab_mask 0"); + static_assert( bits::BITS - sizeclass_metadata.DIV_MULT_SHIFT <= MAX_CAPACITY_BITS); @@ -291,16 +365,6 @@ namespace snmalloc return bits::one_at_bit(MIN_CHUNK_BITS + sizeclass); } - /** - * For large allocations, the metaentry stores the raw log_2 of the size, - * which must be shifted into the index space of slab_sizeclass-es. - */ - constexpr size_t - metaentry_chunk_sizeclass_to_slab_sizeclass(chunksizeclass_t sizeclass) - { - return sizeclass - MIN_CHUNK_BITS; - } - constexpr uint16_t sizeclass_to_slab_object_count(smallsizeclass_t sizeclass) { return sizeclass_metadata.slow(sizeclass_t::from_small_class(sizeclass)) @@ -378,10 +442,6 @@ namespace snmalloc return bits::next_pow2(size); } - inline static size_t large_size_to_chunk_sizeclass(size_t size) - { - return bits::next_pow2_bits(size) - MIN_CHUNK_BITS; - } constexpr SNMALLOC_PURE size_t sizeclass_lookup_index(const size_t s) { @@ -456,13 +516,17 @@ namespace snmalloc } /** - * A compressed size representation, - * either a small size class with the 7th bit set - * or a large class with the 7th bit not set. 
- * Large classes are stored as a mask shift. - * size = (~0 >> lc) + 1; - * Thus large size class 0, has size 0. - * And large size class 33, has size 2^31 + * Maps a requested size to its sizeclass. The result uses the unified + * encoding documented on `sizeclass_t`. + * + * For small sizes, this delegates to `size_to_sizeclass`. For large + * sizes in Phase 13, this rounds up to the next power of two (the + * front end still requests pow2-rounded reservations); Phase 15 + * removes the `next_pow2` call to enable non-pow2 large reservations. + * + * `to_exp_mant` is the literal inverse of the `from_exp_mant` used + * when populating `sizeclass_metadata`, so this never indexes the + * wrong slot. */ static inline sizeclass_t size_to_sizeclass_full(size_t size) { @@ -470,9 +534,12 @@ namespace snmalloc { return sizeclass_t::from_small_class(size_to_sizeclass(size)); } - // bits::clz is undefined on 0, but we have size == 1 has already been - // handled here. We conflate 0 and sizes larger than we can allocate. - return sizeclass_t::from_large_class(bits::clz(size - 1)); + SNMALLOC_ASSERT(size != 0); + SNMALLOC_ASSERT(size <= MAX_LARGE_SIZECLASS_SIZE); + size_t pow2 = bits::next_pow2(size); + size_t global = + bits::to_exp_mant(pow2); + return sizeclass_t::from_large_class(global - NUM_SMALL_SIZECLASSES); } inline SNMALLOC_FAST_PATH static size_t round_size(size_t size) @@ -492,7 +559,7 @@ namespace snmalloc return sizeclass_to_size(size_to_sizeclass(1)); } - if (size > bits::one_at_bit(bits::BITS - 1)) + if (size > MAX_LARGE_SIZECLASS_SIZE) { // This size is too large, no rounding should occur as will result in a // failed allocation later. diff --git a/src/snmalloc/global/globalalloc.h b/src/snmalloc/global/globalalloc.h index 5b428e2f1..eb20210fc 100644 --- a/src/snmalloc/global/globalalloc.h +++ b/src/snmalloc/global/globalalloc.h @@ -287,6 +287,19 @@ namespace snmalloc if (!entry.is_owned()) return; size = size == 0 ? 
1 : size; + // Any size beyond what the sizeclass encoding can represent is + // necessarily a mismatch with the pagemap's recorded sizeclass; report + // it directly rather than feeding the unrepresentable size into + // `size_to_sizeclass_full`. + if (size > MAX_LARGE_SIZECLASS_SIZE) + { + snmalloc_check_client( + mitigations(sanity_checks), + p == nullptr, + "Dealloc size exceeds encodable range: {}", + size); + return; + } auto sc = size_to_sizeclass_full(size); auto pm_sc = entry.get_sizeclass(); auto rsize = sizeclass_full_to_size(sc); diff --git a/src/snmalloc/mem/corealloc.h b/src/snmalloc/mem/corealloc.h index 127abc76a..10482b6b7 100644 --- a/src/snmalloc/mem/corealloc.h +++ b/src/snmalloc/mem/corealloc.h @@ -700,10 +700,11 @@ namespace snmalloc [](Allocator* self, size_t size) SNMALLOC_FAST_PATH_LAMBDA { return CheckInit::check_init( [self, size]() SNMALLOC_FAST_PATH_LAMBDA { - if (size > bits::one_at_bit(bits::BITS - 1)) + if (size > MAX_LARGE_SIZECLASS_SIZE) { - // Cannot allocate something that is more that half the size of - // the address space + // Cannot allocate something the sizeclass encoding cannot + // represent (equals `2 ^ ENCODED_ADDRESS_BITS` in + // `sizeclasstable.h` — well above any plausible request). return Conts::failure(size); } @@ -1117,8 +1118,7 @@ namespace snmalloc // XXX: because large objects have unique metadata associated with them, // the ring size here is one. We should probably assert that. - size_t entry_sizeclass = entry.get_sizeclass().as_large(); - size_t size = bits::one_at_bit(entry_sizeclass); + size_t size = sizeclass_full_to_size(entry.get_sizeclass()); #ifdef SNMALLOC_TRACING message<1024>("Large deallocation: {}", size); diff --git a/src/snmalloc/mem/metadata.h b/src/snmalloc/mem/metadata.h index c6d29793e..3f65f39b6 100644 --- a/src/snmalloc/mem/metadata.h +++ b/src/snmalloc/mem/metadata.h @@ -40,9 +40,12 @@ namespace snmalloc * backend/largebuddyrange.h. 
* * This value is statically checked by the frontend to ensure that its - * bit packing does not conflict; see mem/remoteallocator.h + * bit packing does not conflict; see mem/remoteallocator.h. The marker + * tracks the sizeclass-encoding width (see `SIZECLASS_REP_SIZE` in + * ds/sizeclasstable.h): it must sit immediately above the highest bit + * used by a sizeclass raw value. */ - static constexpr address_t REMOTE_BACKEND_MARKER = 1 << 7; + static constexpr address_t REMOTE_BACKEND_MARKER = SIZECLASS_REP_SIZE; /** * Bit used to indicate this should not be considered part of the previous @@ -111,6 +114,16 @@ namespace snmalloc (REMOTE_BACKEND_MARKER << 1) - 1; public: + /** + * Bit position of the first bit available to backend metadata layouts + * above the reserved region. The reserved region runs from bit 0 up to + * and including the `REMOTE_BACKEND_MARKER` bit; layouts in + * `largearenarange.h` and `largebuddyrange.h` derive their bit + * positions (RED_BIT, VARIANT_SHIFT, LARGE_SIZE_SHIFT, ...) from this. + */ + static constexpr size_t BACKEND_LAYOUT_FIRST_FREE_BIT = + bits::next_pow2_bits_const(REMOTE_BACKEND_MARKER) + 1; + /** * Does the back end currently own this entry? 
Note that freshly * allocated entries are owned by the front end until explicitly diff --git a/src/snmalloc/override/rust.cc b/src/snmalloc/override/rust.cc index f07e51073..d2e7e2e08 100644 --- a/src/snmalloc/override/rust.cc +++ b/src/snmalloc/override/rust.cc @@ -39,8 +39,10 @@ extern "C" SNMALLOC_EXPORT void* SNMALLOC_NAME_MANGLE(rust_realloc)( size_t aligned_old_size = aligned_size(alignment, old_size), aligned_new_size = aligned_size(alignment, new_size); if ( + aligned_old_size <= MAX_LARGE_SIZECLASS_SIZE && + aligned_new_size <= MAX_LARGE_SIZECLASS_SIZE && size_to_sizeclass_full(aligned_old_size).raw() == - size_to_sizeclass_full(aligned_new_size).raw()) + size_to_sizeclass_full(aligned_new_size).raw()) return ptr; void* p = alloc(aligned_new_size); if (p) diff --git a/src/test/func/arenabins/arenabins.cc b/src/test/func/arenabins/arenabins.cc index 612eb3cd2..05f1ee308 100644 --- a/src/test/func/arenabins/arenabins.cc +++ b/src/test/func/arenabins/arenabins.cc @@ -127,7 +127,8 @@ namespace snmalloc bitmap_info_for_request_const(size_t n) { return Bins::table_ - .bitmap_info[bits::to_exp_mant_const(n)]; + .bitmap_info[bits::to_exp_mant_const( + n)]; } /// `carve_info_for_request`, constexpr (uses `to_exp_mant_const`). @@ -135,7 +136,8 @@ namespace snmalloc static constexpr const carve_info_t& carve_info_for_request_const(size_t n) { return Bins::table_ - .carve_info[bits::to_exp_mant_const(n)]; + .carve_info[bits::to_exp_mant_const( + n)]; } // The canonical source of truth for what each within-exponent bin @@ -218,14 +220,11 @@ namespace static_checks "B=3 MAX_SC"); // Sizes that are powers of two have align == size. 
- static_assert( - B2::carve_info_for_request_const(4).align == 4, "size 4 align"); - static_assert( - B3::carve_info_for_request_const(8).align == 8, "size 8 align"); + static_assert(B2::carve_info_for_request_const(4).align == 4, "size 4 align"); + static_assert(B3::carve_info_for_request_const(8).align == 8, "size 8 align"); // sc_size at request(s) must be >= s. - static_assert( - B2::carve_info_for_request_const(9).size == 10, "B=2 round-up"); + static_assert(B2::carve_info_for_request_const(9).size == 10, "B=2 round-up"); static_assert( B3::carve_info_for_request_const(17).size == 18, "B=3 round-up"); } // namespace static_checks @@ -456,14 +455,12 @@ namespace const auto& ci = Bins::carve_info_for_request(s); if (ci.size != Bins::sc_size(sc)) { - std::printf( - "B=%zu carve_info_for_request(%zu).size mismatch\n", B, s); + std::printf("B=%zu carve_info_for_request(%zu).size mismatch\n", B, s); std::abort(); } if (ci.align != Bins::sc_align(sc)) { - std::printf( - "B=%zu carve_info_for_request(%zu).align mismatch\n", B, s); + std::printf("B=%zu carve_info_for_request(%zu).align mismatch\n", B, s); std::abort(); } if (&ci != &Bins::carve_info(sc)) @@ -536,8 +533,7 @@ namespace size_t step = Bins::max_supported_size() / 257; if (step == 0) step = 1; - for (size_t n = 1; n <= Bins::max_supported_size() && n > 0; - n += step + 1) + for (size_t n = 1; n <= Bins::max_supported_size() && n > 0; n += step + 1) check_one(n); } @@ -1246,7 +1242,8 @@ namespace // request(n) at MIN_SIZE_BITS==0; sc_size(raw) at MIN_SIZE_BITS==K // equals sc_size(raw) at MIN_SIZE_BITS==0 times U; sc_align // likewise. - size_t probe[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16, 17, 32, 65, 127, 1024}; + size_t probe[] = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 16, 17, 32, 65, 127, 1024}; for (size_t n : probe) { // Skip values that would overflow either instance's domain. 
@@ -1270,8 +1267,9 @@ namespace using BaseR = typename Base::range_t; for (size_t n = 1; n <= 64; n++) for (size_t a = 0; a < 32; a++) - if (Scaled::bin_index(ScaledR{a * U, n * U}) != - Base::bin_index(BaseR{a, n})) + if ( + Scaled::bin_index(ScaledR{a * U, n * U}) != + Base::bin_index(BaseR{a, n})) std::abort(); // carve({0, blk*U}, n*U) returns the same partition as @@ -1287,14 +1285,17 @@ namespace continue; auto base_cv = Base::carve(BaseR{0, blk}, n); auto scaled_cv = Scaled::carve(ScaledR{0, blk * U}, n * U); - if (scaled_cv.pre.base != base_cv.pre.base * U || - scaled_cv.pre.size != base_cv.pre.size * U) + if ( + scaled_cv.pre.base != base_cv.pre.base * U || + scaled_cv.pre.size != base_cv.pre.size * U) std::abort(); - if (scaled_cv.req.base != base_cv.req.base * U || - scaled_cv.req.size != base_cv.req.size * U) + if ( + scaled_cv.req.base != base_cv.req.base * U || + scaled_cv.req.size != base_cv.req.size * U) std::abort(); - if (scaled_cv.post.base != base_cv.post.base * U || - scaled_cv.post.size != base_cv.post.size * U) + if ( + scaled_cv.post.base != base_cv.post.base * U || + scaled_cv.post.size != base_cv.post.size * U) std::abort(); } diff --git a/src/test/func/cheri/cheri.cc b/src/test/func/cheri/cheri.cc index 1928dbbd5..7e2318e11 100644 --- a/src/test/func/cheri/cheri.cc +++ b/src/test/func/cheri/cheri.cc @@ -266,7 +266,7 @@ int main() SNMALLOC_CHECK(sz == Aal::capptr_size_round(sz)); } - for (size_t sc = 0; sc < bits::BITS; sc++) + for (size_t sc = 0; sc < NUM_LARGE_CLASSES; sc++) { size_t sz = sizeclass_full_to_size(sizeclass_t::from_large_class(sc)); SNMALLOC_CHECK(sz == Aal::capptr_size_round(sz)); diff --git a/src/test/func/release-rounding/rounding.cc b/src/test/func/release-rounding/rounding.cc index 4d11eaafb..13155678a 100644 --- a/src/test/func/release-rounding/rounding.cc +++ b/src/test/func/release-rounding/rounding.cc @@ -51,5 +51,47 @@ int main(int argc, char** argv) if (failed) abort(); } + + // Exercise pow2 large sizeclasses 
end-to-end. + // For each pow2 size S that the front end actually reaches (lc values that + // are pow2-aligned in the global exp+mantissa scheme), verify + // index_in_object / is_start_of_object at a representative set of offsets: + // the start of an object, an arbitrary interior offset, and the start of + // the next object. Bound the loop by ENCODED_ADDRESS_BITS so + // `bits::one_at_bit(b)` never shifts by >= BITS. + for (size_t b = MAX_SMALL_SIZECLASS_BITS + 1; b <= ENCODED_ADDRESS_BITS; b++) + { + size_t S = bits::one_at_bit(b); + sizeclass_t sc = size_to_sizeclass_full(S); + + address_t base = address_t(0); + size_t offsets[] = {0, 1, S / 2, S - 1, S}; + for (size_t off : offsets) + { + address_t addr = base + off; + size_t expected_mod = off % S; + bool expected_start = expected_mod == 0; + + size_t opt_mod = index_in_object(sc, addr); + if (opt_mod != expected_mod) + { + std::cout << "Large S=" << S << " offset=" << off + << " index_in_object=" << opt_mod + << " expected=" << expected_mod << std::endl; + failed = true; + } + + bool opt_start = is_start_of_object(sc, addr); + if (opt_start != expected_start) + { + std::cout << "Large S=" << S << " offset=" << off + << " is_start_of_object=" << opt_start + << " expected=" << expected_start << std::endl; + failed = true; + } + } + if (failed) + abort(); + } return 0; } diff --git a/src/test/func/sizeclass/sizeclass.cc b/src/test/func/sizeclass/sizeclass.cc index ac7ec6bd8..093b17424 100644 --- a/src/test/func/sizeclass/sizeclass.cc +++ b/src/test/func/sizeclass/sizeclass.cc @@ -67,6 +67,118 @@ void test_align_size() abort(); } +void test_uniform_large_sizeclasses() +{ + using namespace snmalloc; + bool failed = false; + + // Sentinel sanity: default-constructed sizeclass_t is the unmapped sentinel + // and not classified as small. 
+ if (sizeclass_t{}.raw() != 0) + { + std::cout << "Default sizeclass_t raw is " << sizeclass_t{}.raw() + << " expected 0" << std::endl; + failed = true; + } + if (sizeclass_t{}.is_default() != true) + { + std::cout << "Default sizeclass_t .is_default() is false" << std::endl; + failed = true; + } + if (sizeclass_t{}.is_small()) + { + std::cout << "Default sizeclass_t.is_small() is true" << std::endl; + failed = true; + } + + // Encoding sanity: small range and large range are disjoint and adjacent + // in the value space. + if (sizeclass_t::from_small_class(smallsizeclass_t(0)).raw() != 1) + { + std::cout << "from_small_class(0).raw() != 1" << std::endl; + failed = true; + } + if ( + sizeclass_t::from_small_class(smallsizeclass_t(NUM_SMALL_SIZECLASSES - 1)) + .raw() + + 1 != + sizeclass_t::from_large_class(0).raw()) + { + std::cout << "Small/large ranges are not adjacent" << std::endl; + failed = true; + } + if ( + sizeclass_t::from_large_class(NUM_LARGE_CLASSES - 1).raw() >= + SIZECLASS_REP_SIZE) + { + std::cout << "Largest large sizeclass overflows SIZECLASS_REP_SIZE" + << std::endl; + failed = true; + } + if (!sizeclass_t::from_small_class(smallsizeclass_t(0)).is_small()) + { + std::cout << "from_small_class(0).is_small() is false" << std::endl; + failed = true; + } + if (sizeclass_t::from_large_class(0).is_small()) + { + std::cout << "from_large_class(0).is_small() is true" << std::endl; + failed = true; + } + + // Large sizeclasses are strictly increasing in size with lc. 
+ size_t prev_size = 0; + for (size_t lc = 0; lc < NUM_LARGE_CLASSES; lc++) + { + size_t size = sizeclass_full_to_size(sizeclass_t::from_large_class(lc)); + if (size <= prev_size) + { + std::cout << "Non-monotonic large sizeclass: lc=" << lc + << " size=" << size << " prev=" << prev_size << std::endl; + failed = true; + } + prev_size = size; + } + + // Round-trip identity on pow2 large sizes in Phase 13: every pow2 size + // S in [MAX_SMALL_SIZECLASS_SIZE * 2, MAX_LARGE_SIZECLASS_SIZE] must satisfy + // sizeclass_full_to_size(size_to_sizeclass_full(S)) == S. Bound the loop by + // ENCODED_ADDRESS_BITS so `bits::one_at_bit(bits)` never shifts by >= BITS + // (the bound check itself would fail on 32-bit otherwise). + for (size_t b = MAX_SMALL_SIZECLASS_BITS + 1; b <= ENCODED_ADDRESS_BITS; b++) + { + size_t S = bits::one_at_bit(b); + sizeclass_t sc = size_to_sizeclass_full(S); + size_t rs = sizeclass_full_to_size(sc); + if (rs != S) + { + std::cout << "Pow2 round-trip failed: S=" << S << " round=" << rs + << std::endl; + failed = true; + } + + // For every non-pow2 size X strictly between adjacent pow2 [P, 2P), the + // result must round up to 2P (pow2 rounding still in force in Phase 13). + // Only check when 2P is still representable. 
+ if (b < ENCODED_ADDRESS_BITS) + { + size_t mid = S + (S >> 1); + sizeclass_t sc_mid = size_to_sizeclass_full(mid); + size_t rs_mid = sizeclass_full_to_size(sc_mid); + size_t expect = bits::one_at_bit(b + 1); + if (rs_mid != expect) + { + std::cout << "Non-pow2 should round to next pow2: X=" << mid + << " round=" << rs_mid << " expected=" << expect << std::endl; + failed = true; + } + } + } + + if (failed) + abort(); +} + int main(int, char**) { setup(); @@ -149,4 +261,5 @@ int main(int, char**) abort(); test_align_size(); + test_uniform_large_sizeclasses(); } From f16524500d8d0f6603294c895e6e46dab6443f6b Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Wed, 3 Jun 2026 17:03:57 +0100 Subject: [PATCH 10/15] Per-chunk pagemap offset encoding Encode (sizeclass, slab-offset) jointly in the pagemap entry so the front end can recover the allocation start for an arbitrary interior chunk of a multi-slab-tile large allocation. The front end still only issues pow2 large requests, so every materialised entry today has offset=0; this lays the groundwork for non-pow2 large support without front-end changes. Key pieces: - offset_and_sizeclass_t packs sizeclass into the low SIZECLASS_BITS and per-chunk offset into the next OFFSET_BITS of one word. - Backend::alloc_chunk loops over slab tiles, writing each tile's slab_index into the offset bits of its pagemap entry. - SizeClassTable is split into three by purpose: * start_ (sizeclass_data_start, 32B/row, indexed by osc): hot path for start_of_object on every dealloc. * align_ (sizeclass_data_align, 16B/row, indexed by sc): used by is_start_of_object alignment check in -check builds. * slab_ (sizeclass_data_slab, 4B/row, indexed by sc): cold; slab init thresholds. - start_of_object branches on osc.offset() == 0 (testable from bits already loaded in osc.raw()), so the offset=0 hot path skips the offset_bytes load and offset-shift arithmetic. 
Combined with the table split, perf-external_pointer-fast matches the baseline (~290 ms median) with no regression; perf-singlethread-check is within noise. - New src/test/func/large_offset targeted test reaches the multi-slab-tile branch via the public backend API. - check_invariant in Arena now uses SNMALLOC_CHECK rather than SNMALLOC_ASSERT, so callers that opt in via enabled=true get the invariant checks even in Release builds (which is what the tests want); the #ifndef NDEBUG wrapper is no longer needed. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/snmalloc/backend/backend.h | 26 +- src/snmalloc/backend_helpers/arena.h | 27 +- src/snmalloc/backend_helpers/arenabins.h | 2 +- .../backend_helpers/largearenarange.h | 37 +- .../backend_helpers/largebuddyrange.h | 5 +- src/snmalloc/ds/sizeclasstable.h | 419 ++++++++++++------ src/snmalloc/global/globalalloc.h | 14 +- src/snmalloc/mem/corealloc.h | 4 +- src/snmalloc/mem/metadata.h | 202 +++++---- src/snmalloc/override/rust.cc | 4 +- src/test/func/arena/arena.cc | 2 +- src/test/func/arenabins/arenabins.cc | 6 +- src/test/func/large_offset/large_offset.cc | 225 ++++++++++ src/test/func/release-rounding/rounding.cc | 40 +- 14 files changed, 747 insertions(+), 266 deletions(-) create mode 100644 src/test/func/large_offset/large_offset.cc diff --git a/src/snmalloc/backend/backend.h b/src/snmalloc/backend/backend.h index 2772cf319..80ff58da8 100644 --- a/src/snmalloc/backend/backend.h +++ b/src/snmalloc/backend/backend.h @@ -128,8 +128,30 @@ namespace snmalloc return {nullptr, nullptr}; } - typename Pagemap::Entry t(meta, ras); - Pagemap::set_metaentry(address_cast(p), size, t); + const size_t slab_size = sizeclass_full_to_slab_size(sizeclass); + // `size` and `slab_size` are powers of two with `size >= slab_size`, + // so `size = k * slab_size` for some integer `k >= 1`. 
Each slab + // tile gets the same `ras_in | (slab_index << SIZECLASS_BITS)` + // entry, written in one `set_metaentry` call. + SNMALLOC_ASSERT(size >= slab_size); + // The OR below assumes the per-chunk-offset bits of `ras` are + // zero; `MetaEntryBase::encode` defaults offset to 0, and the + // backend is the only place per-chunk offsets are written. + SNMALLOC_ASSERT( + (ras & (((size_t{1} << OFFSET_BITS) - 1) << SIZECLASS_BITS)) == 0); + for (size_t chunk_offset = 0; chunk_offset < size; + chunk_offset += slab_size) + { + const size_t slab_index = chunk_offset / slab_size; + // `compute_max_large_slab_index() < (1 << OFFSET_BITS)` is + // static_asserted in sizeclasstable.h; this asserts the + // arithmetic that derives `slab_index` from `size`/`slab_size`. + SNMALLOC_ASSERT(slab_index < (size_t{1} << OFFSET_BITS)); + const uintptr_t ras_i = ras | (slab_index << SIZECLASS_BITS); + typename Pagemap::Entry t_i(meta, ras_i); + Pagemap::set_metaentry( + address_cast(p) + chunk_offset, slab_size, t_i); + } return {Aal::capptr_bound(p, size), meta}; } diff --git a/src/snmalloc/backend_helpers/arena.h b/src/snmalloc/backend_helpers/arena.h index 338838cda..4330637a7 100644 --- a/src/snmalloc/backend_helpers/arena.h +++ b/src/snmalloc/backend_helpers/arena.h @@ -288,7 +288,10 @@ namespace snmalloc /** * Structural invariant. Runs when `enabled` is true; defaults to - * `Debug` so release tests can pass `true` explicitly. + * `Debug` so in-tree callers compile away in Release while tests + * can opt in by passing `true` explicitly. Uses `SNMALLOC_CHECK` + * rather than `SNMALLOC_ASSERT` so that test-driven invocations + * are checked even under NDEBUG. * * Five clauses are verified: * 1. 
Maximally consolidated — no adjacent free blocks could be @@ -321,7 +324,7 @@ namespace snmalloc if (prev_valid) { uintptr_t prev_end = prev_addr + prev_size; - SNMALLOC_ASSERT(prev_end != a || !Rep::can_consolidate(a)); + SNMALLOC_CHECK(prev_end != a || !Rep::can_consolidate(a)); } prev_addr = a; prev_size = s; @@ -333,10 +336,10 @@ namespace snmalloc range_tree.for_each([&](uintptr_t node) { auto [a, s] = range_from_addr(node); if (a >= UNIT_SIZE) - SNMALLOC_ASSERT( + SNMALLOC_CHECK( !contains_min(a - UNIT_SIZE) || !Rep::can_consolidate(a)); uintptr_t end = a + s; - SNMALLOC_ASSERT(!contains_min(end) || !Rep::can_consolidate(end)); + SNMALLOC_CHECK(!contains_min(end) || !Rep::can_consolidate(end)); }); // 1c. No two adjacent min blocks (unless boundary). @@ -347,7 +350,7 @@ namespace snmalloc if (Rep::get_variant(node) != ArenaVariant::Min) return; if (prev_valid) - SNMALLOC_ASSERT( + SNMALLOC_CHECK( prev + UNIT_SIZE != node || !Rep::can_consolidate(node)); prev = node; prev_valid = true; @@ -366,7 +369,7 @@ namespace snmalloc if (s >= TWO_UNITS) { auto path = range_tree.get_root_path(); - SNMALLOC_ASSERT(range_tree.find(path, node)); + SNMALLOC_CHECK(range_tree.find(path, node)); bin_tree_nonmin_count++; } }); @@ -378,10 +381,10 @@ namespace snmalloc auto range = typename Bins::range_t{a, s}; size_t expected_bin = Bins::bin_index(range); auto path = bin_trees[expected_bin].get_root_path(); - SNMALLOC_ASSERT(bin_trees[expected_bin].find(path, node)); + SNMALLOC_CHECK(bin_trees[expected_bin].find(path, node)); }); - SNMALLOC_ASSERT(bin_tree_nonmin_count == range_tree_count); + SNMALLOC_CHECK(bin_tree_nonmin_count == range_tree_count); } // 3. Bin classification correctness. 
@@ -391,7 +394,7 @@ namespace snmalloc auto [a, s] = range_from_addr(node); auto range = typename Bins::range_t{a, s}; size_t expected_bin = Bins::bin_index(range); - SNMALLOC_ASSERT(expected_bin == bin); + SNMALLOC_CHECK(expected_bin == bin); }); } @@ -400,7 +403,7 @@ namespace snmalloc { bool has_entries = !bin_trees[bin].is_empty(); bool bit_set = bitmap.test(bin); - SNMALLOC_ASSERT(has_entries == bit_set); + SNMALLOC_CHECK(has_entries == bit_set); } // 5. Variant-tag consistency. @@ -409,9 +412,9 @@ namespace snmalloc bin_trees[bin].for_each([&](uintptr_t node) { auto v = Rep::get_variant(node); auto [a, s] = range_from_addr(node); - SNMALLOC_ASSERT(v == variant_of(s, a)); + SNMALLOC_CHECK(v == variant_of(s, a)); if (v == ArenaVariant::Large) - SNMALLOC_ASSERT(Rep::get_large_size(node) == s); + SNMALLOC_CHECK(Rep::get_large_size(node) == s); }); } } diff --git a/src/snmalloc/backend_helpers/arenabins.h b/src/snmalloc/backend_helpers/arenabins.h index 6a11f8982..ccfb23ca8 100644 --- a/src/snmalloc/backend_helpers/arenabins.h +++ b/src/snmalloc/backend_helpers/arenabins.h @@ -47,7 +47,7 @@ namespace snmalloc { static_assert( INTERMEDIATE_BITS >= 1 && INTERMEDIATE_BITS <= 3, - "ArenaBins currently supports B in {1, 2, 3}"); + "ArenaBins supports B in {1, 2, 3}"); static_assert( MIN_SIZE_BITS + INTERMEDIATE_BITS < bits::BITS, "MIN_SIZE_BITS + INTERMEDIATE_BITS must leave room for at least one " diff --git a/src/snmalloc/backend_helpers/largearenarange.h b/src/snmalloc/backend_helpers/largearenarange.h index 0a7de2e84..f53643bf4 100644 --- a/src/snmalloc/backend_helpers/largearenarange.h +++ b/src/snmalloc/backend_helpers/largearenarange.h @@ -42,10 +42,11 @@ namespace snmalloc static constexpr uintptr_t UNIT_SIZE = uintptr_t(1) << MIN_SIZE_BITS; - // Bit positions inside a pagemap word. 
Bits in the reserved region - // (sizeclass + REMOTE_BACKEND_MARKER) are owned by the meta-entry - // layout; tree-node and large-size encodings start at the first free - // bit above that reserved range — see + // Bit positions inside a pagemap word. The reserved region (the + // sizeclass+offset bits on Word::Two, and META_BOUNDARY_BIT on + // Word::One) is owned by the meta-entry layout; tree-node and + // large-size encodings start at the first free bit above that + // reserved range — see // `MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT` in `mem/metadata.h`. static constexpr unsigned RED_BIT_POS = MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT; @@ -72,17 +73,19 @@ namespace snmalloc static_assert(BIN_META_MASK < UNIT_SIZE); static_assert( Entry::is_backend_allowed_value(Entry::Word::One, BIN_META_MASK)); - static_assert(Entry::is_backend_allowed_value(Entry::Word::Two, RED_BIT)); + static_assert( + Entry::is_backend_allowed_value( + Entry::Word::Two, ~uintptr_t(UNIT_SIZE - 1)), + "RangeRep stores chunk-aligned addresses in Word::Two; the " + "markerless ownership discriminator requires their low " + "BACKEND_RESERVED_MASK_WORD_TWO bits to be zero. This asserts " + "that the reserved mask fits entirely below the chunk alignment, " + "so no chunk-aligned value (any bit set only at position " + ">= MIN_SIZE_BITS) can collide."); using Word = typename Entry::Word; using Handle = typename Entry::BackendStateWordRef; - /** - * Pagemap word for the `UnitIdx`-th unit of the block at `addr`. - * Centralises the layout decision "which pagemap entry encodes - * data for unit i". Used by `TreeRep::ref` and by the variant / - * large-size accessors below. - */ template static Handle word_at(uintptr_t addr, Word w) { @@ -92,11 +95,11 @@ namespace snmalloc } /** - * RBTree Rep shared by `BinRep` and `RangeRep`. `UnitIdx` selects - * which unit (0 or 1) of the block holds this Rep's tree node; the - * Rep's pagemap words live at `addr + UnitIdx * UNIT_SIZE`. 
- * `MetaMask` covers the bits in that node's words that are owned by - * this Rep (red + any tag bits) and must be preserved by get/set. + * Tree rep shared by `BinRep` and `RangeRep`. `UnitIdx` is the + * block-relative pagemap unit (0 or 1) that holds this Rep's + * node; `MetaMask` covers bits in that unit's words owned by + * this Rep (red + variant tag for `BinRep`, red only for + * `RangeRep`) and must be preserved across get/set. */ template struct TreeRep @@ -111,7 +114,7 @@ namespace snmalloc { static const Contents null_entry = 0; if (SNMALLOC_UNLIKELY(k == 0)) - return Handle{const_cast(&null_entry)}; + return Handle{const_cast(&null_entry), 0}; return word_at(k, direction ? Word::One : Word::Two); } diff --git a/src/snmalloc/backend_helpers/largebuddyrange.h b/src/snmalloc/backend_helpers/largebuddyrange.h index bf217bc06..3eb5f5c21 100644 --- a/src/snmalloc/backend_helpers/largebuddyrange.h +++ b/src/snmalloc/backend_helpers/largebuddyrange.h @@ -76,10 +76,13 @@ namespace snmalloc // Special case for accessing the null entry. We want to make sure // that this is never modified by the back end, so we make it point to // a constant entry and use the MMU to trap even in release modes. + // The mask passed to the handle is irrelevant: the null entry is + // never written (any attempt would trap), and on read its underlying + // value is zero so `get()` returns zero regardless of the mask. 
static const Contents null_entry = 0; if (SNMALLOC_UNLIKELY(address_cast(k) == 0)) { - return {const_cast(&null_entry)}; + return {const_cast(&null_entry), 0}; } auto& entry = Pagemap::template get_metaentry_mut(address_cast(k)); if (direction) diff --git a/src/snmalloc/ds/sizeclasstable.h b/src/snmalloc/ds/sizeclasstable.h index 3cd7509da..ae4c0df3a 100644 --- a/src/snmalloc/ds/sizeclasstable.h +++ b/src/snmalloc/ds/sizeclasstable.h @@ -17,41 +17,39 @@ namespace snmalloc { using chunksizeclass_t = size_t; - // Cap the address bits the encoding tries to represent so that - // `MAX_LARGE_SIZECLASS_SIZE` (= 2 ^ ENCODED_ADDRESS_BITS) always fits in - // `size_t`. On 64-bit platforms `DefaultPal::address_bits` is already 48, - // but on 32-bit platforms it equals `bits::BITS` and would otherwise - // overflow the encoded maximum to 0. + // Capped to `bits::BITS - 1` so `MAX_LARGE_SIZECLASS_SIZE` fits in + // `size_t` on 32-bit platforms where `DefaultPal::address_bits == + // bits::BITS`. constexpr size_t ENCODED_ADDRESS_BITS = bits::min(DefaultPal::address_bits, bits::BITS - 1); - // Number of large sizeclasses. Large classes follow on directly from small - // classes in the global exp+mantissa scheme used by - // `bits::from_exp_mant`. The total - // span of representable sizes is from MIN_ALLOC_SIZE up to and including - // 2^ENCODED_ADDRESS_BITS, so the count of large entries beyond the small - // range is (ENCODED_ADDRESS_BITS - MAX_SMALL_SIZECLASS_BITS) mantissa - // cycles, each with 2^INTERMEDIATE_BITS entries. + // Large classes follow on directly from small classes in the global + // exp+mantissa scheme: `(ENCODED_ADDRESS_BITS - MAX_SMALL_SIZECLASS_BITS)` + // mantissa cycles of `2^INTERMEDIATE_BITS` entries each. constexpr size_t NUM_LARGE_CLASSES = (ENCODED_ADDRESS_BITS - MAX_SMALL_SIZECLASS_BITS) << INTERMEDIATE_BITS; - // Bits required to encode any sizeclass value. 
Slot 0 is reserved as the - // unmapped/default sentinel, so the count includes a leading +1. + // Slot 0 of the table is reserved as the unmapped sentinel, hence +1. constexpr size_t SIZECLASS_BITS = bits::next_pow2_bits_const(1 + NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES); - // Size of the sizeclass-keyed lookup tables and the alignment that the - // REMOTE_BACKEND_MARKER constraint requires of RemoteAllocator. There is no - // separate tag bit: all valid sizeclass raw values are in - // [0, 1 + NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES) and live in the low - // SIZECLASS_BITS bits of a pagemap word. constexpr size_t SIZECLASS_REP_SIZE = bits::one_at_bit(SIZECLASS_BITS); - // Largest allocation size representable by the uniform sizeclass encoding. - // Equals `from_exp_mant(NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES - 1)`, - // which for the default config is `2 ^ ENCODED_ADDRESS_BITS`. Requests - // strictly larger than this cannot be encoded and must be failed before - // any call to `size_to_sizeclass_full`. + // Width of the per-chunk slab-offset field packed immediately above the + // sizeclass in `ras`. The worst-case slab count for any non-pow2 large + // class with `INTERMEDIATE_BITS = M` is `2^(M+1)`; `M + 1` bits cover + // the maximum index. `compute_max_large_slab_index` static_asserts the + // bound against the actual table below. + constexpr size_t OFFSET_BITS = INTERMEDIATE_BITS + 1; + + // `ras & COMBINED_MASK` directly indexes the `(sizeclass, offset)` table + // row, which already carries `offset_bytes = offset * slab_size`. + constexpr size_t COMBINED_BITS = SIZECLASS_BITS + OFFSET_BITS; + constexpr size_t COMBINED_REP_SIZE = bits::one_at_bit(COMBINED_BITS); + + // Largest size representable by the uniform sizeclass encoding; + // requests larger than this must be failed before + // `size_to_sizeclass_full`. 
constexpr size_t MAX_LARGE_SIZECLASS_SIZE = bits::from_exp_mant( NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES - 1); @@ -59,27 +57,21 @@ namespace snmalloc static_assert( MAX_LARGE_SIZECLASS_SIZE == bits::one_at_bit(ENCODED_ADDRESS_BITS), "MAX_LARGE_SIZECLASS_SIZE must equal 2 ^ ENCODED_ADDRESS_BITS; if this " - "fails, the exp+mantissa math no longer matches NUM_LARGE_CLASSES."); + "fails, the exp+mantissa math does not match NUM_LARGE_CLASSES."); static_assert( ENCODED_ADDRESS_BITS > MAX_SMALL_SIZECLASS_BITS, "ENCODED_ADDRESS_BITS must exceed MAX_SMALL_SIZECLASS_BITS so the large " "range is non-empty."); /** - * Represents a sizeclass identifier shared by small and large allocations - * using a single uniform encoding: + * Sizeclass identifier shared by small and large allocations: * - * value == 0 : unmapped / default sentinel - * value ∈ [1, 1 + NUM_SMALL_SIZECLASSES) : small sizeclass sc = value - 1 - * value ∈ [1 + NUM_SMALL_SIZECLASSES, - * 1 + NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES) - * : large class lc = - * value - 1 - - * NUM_SMALL_SIZECLASSES + * value == 0 : sentinel (unmapped) + * value ∈ [1, 1 + NUM_SMALL_SIZECLASSES) : small, sc = value - 1 + * value ∈ [1 + NUM_SMALL_SIZECLASSES, ...): large * - * Used directly as an index into `sizeclass_metadata`. Slot 0 of that table - * is zero-padded so the sentinel can flow through the fast-path table - * lookups without a subtract on the hot path. + * Indexes `sizeclass_metadata` directly; slot 0 is zero-padded so the + * sentinel flows through fast-path lookups without a branch. */ class sizeclass_t { @@ -131,8 +123,7 @@ namespace snmalloc constexpr bool is_small() { - // Sentinel (value == 0) underflows to a large positive value, which - // also fails the comparison — the sentinel is therefore not small. + // Sentinel (value == 0) underflows past NUM_SMALL_SIZECLASSES. 
return (value - 1) < NUM_SMALL_SIZECLASSES; } @@ -152,12 +143,70 @@ namespace snmalloc } }; + /** + * (sizeclass, per-chunk slab offset) packed into the low `COMBINED_BITS` + * of a pagemap entry's `remote_and_sizeclass`. Non-zero offsets occur + * only for interior chunks of non-pow2 large allocations; the offset + * lets `start_of_object` recover the allocation base. + * + * Distinct from `sizeclass_t` so `is_small()` / `as_small()` / + * `as_large()` cannot be called on a value carrying offset bits, and so + * the offset can never be synthesised: constructing a value requires + * supplying both components explicitly, or going through `from_raw` + * with bits read from storage. + */ + class offset_and_sizeclass_t + { + size_t value{0}; + + constexpr offset_and_sizeclass_t(size_t value) : value(value) {} + + public: + constexpr offset_and_sizeclass_t() = default; + + constexpr offset_and_sizeclass_t(sizeclass_t sc, size_t offset) + : value(sc.raw() | (offset << SIZECLASS_BITS)) + { + SNMALLOC_ASSERT(offset < (size_t{1} << OFFSET_BITS)); + } + + static constexpr offset_and_sizeclass_t from_raw(size_t raw) + { + return {raw}; + } + + constexpr size_t raw() const + { + return value; + } + + constexpr sizeclass_t sizeclass() const + { + return sizeclass_t::from_raw(value & (SIZECLASS_REP_SIZE - 1)); + } + + constexpr size_t offset() const + { + return (value >> SIZECLASS_BITS) & ((size_t{1} << OFFSET_BITS) - 1); + } + + constexpr bool operator==(offset_and_sizeclass_t other) const + { + return value == other.value; + } + }; + using sizeclass_compress_t = uint8_t; /** - * This structure contains the fields required for fast paths for sizeclasses. + * Per-`offset_and_sizeclass_t` metadata for `start_of_object` — + * recovering the allocation base from an interior pointer. + * + * Sized to a power of two (4 × `size_t` = 32 bytes) so the table + * stride collapses to a single shift in the + * `__malloc_start_pointer` hot path. 
*/ - struct sizeclass_data_fast + struct sizeclass_data_start { size_t size; // We store the mask as it is used more on the fast path, and the size of @@ -165,58 +214,106 @@ namespace snmalloc size_t slab_mask; // Table of constants for reciprocal division for each sizeclass. size_t div_mult; - // Table of constants for reciprocal modulus for each sizeclass. + // `offset * slab_size`, precomputed. Zero for `offset == 0` rows. + size_t offset_bytes; + }; + + static_assert( + sizeof(sizeclass_data_start) == 4 * sizeof(size_t), + "sizeclass_data_start must be a power-of-two stride for single-shift " + "indexing in start_of_object"); + + /** + * Per-`sizeclass_t` metadata for `is_start_of_object` — the + * Lemire-style alignment check used by check-build dealloc and + * debug asserts. + * + * `slab_mask` is duplicated here (also held in `sizeclass_data_start`) + * so the alignment check loads from a single row instead of straddling + * two tables. + */ + struct sizeclass_data_align + { + size_t slab_mask; size_t mod_zero_mult; }; /** - * This structure contains the remaining fields required for slow paths for - * sizeclasses. + * Per-`sizeclass_t` thresholds used when initialising a slab — + * cold-path data consumed at slab allocation/refill time. */ - struct sizeclass_data_slow + struct sizeclass_data_slab { uint16_t capacity; uint16_t waking; }; - static_assert(sizeof(sizeclass_data_slow::capacity) * 8 > MAX_CAPACITY_BITS); + static_assert(sizeof(sizeclass_data_slab::capacity) * 8 > MAX_CAPACITY_BITS); struct SizeClassTable { - ModArray fast_{}; - ModArray slow_{}; + // `start_` is indexed by an `offset_and_sizeclass_t` (Word::Two of + // the pagemap entry & COMBINED_MASK). The first SIZECLASS_REP_SIZE + // rows have offset == 0; subsequent rows carry the offset_bytes + // needed for `start_of_object` on non-pow2 large interior chunks. 
+ ModArray start_{}; + ModArray align_{}; + ModArray slab_{}; size_t DIV_MULT_SHIFT{0}; - [[nodiscard]] constexpr sizeclass_data_fast& fast(sizeclass_t index) + [[nodiscard]] constexpr sizeclass_data_start& start(sizeclass_t index) { - return fast_[index.raw()]; + return start_[index.raw()]; } - [[nodiscard]] constexpr sizeclass_data_fast fast(sizeclass_t index) const + [[nodiscard]] constexpr sizeclass_data_start start(sizeclass_t index) const { - return fast_[index.raw()]; + return start_[index.raw()]; } - [[nodiscard]] constexpr sizeclass_data_fast& fast_small(smallsizeclass_t sc) + [[nodiscard]] constexpr sizeclass_data_start& + start(offset_and_sizeclass_t osc) { - return fast_[sizeclass_t::from_small_class(sc).raw()]; + return start_[osc.raw()]; } - [[nodiscard]] constexpr sizeclass_data_fast - fast_small(smallsizeclass_t sc) const + [[nodiscard]] constexpr sizeclass_data_start + start(offset_and_sizeclass_t osc) const { - return fast_[sizeclass_t::from_small_class(sc).raw()]; + return start_[osc.raw()]; } - [[nodiscard]] constexpr sizeclass_data_slow& slow(sizeclass_t index) + [[nodiscard]] constexpr sizeclass_data_start& + start_small(smallsizeclass_t sc) { - return slow_[index.raw()]; + return start_[sizeclass_t::from_small_class(sc).raw()]; } - [[nodiscard]] constexpr sizeclass_data_slow slow(sizeclass_t index) const + [[nodiscard]] constexpr sizeclass_data_start + start_small(smallsizeclass_t sc) const { - return slow_[index.raw()]; + return start_[sizeclass_t::from_small_class(sc).raw()]; + } + + [[nodiscard]] constexpr sizeclass_data_align& align(sizeclass_t index) + { + return align_[index.raw()]; + } + + [[nodiscard]] constexpr sizeclass_data_align align(sizeclass_t index) const + { + return align_[index.raw()]; + } + + [[nodiscard]] constexpr sizeclass_data_slab& slab(sizeclass_t index) + { + return slab_[index.raw()]; + } + + [[nodiscard]] constexpr sizeclass_data_slab slab(sizeclass_t index) const + { + return slab_[index.raw()]; } constexpr 
SizeClassTable() @@ -237,7 +334,8 @@ namespace snmalloc for (smallsizeclass_t sizeclass(0); sizeclass < NUM_SMALL_SIZECLASSES; sizeclass++) { - auto& meta = fast_small(sizeclass); + auto& meta = start_small(sizeclass); + auto sc = sizeclass_t::from_small_class(sizeclass); size_t rsize = bits::from_exp_mant( @@ -247,18 +345,19 @@ namespace snmalloc bits::next_pow2_bits_const(MIN_OBJECT_COUNT * rsize), MIN_CHUNK_BITS); meta.slab_mask = bits::mask_bits(slab_bits); + align(sc).slab_mask = meta.slab_mask; - auto& meta_slow = slow(sizeclass_t::from_small_class(sizeclass)); - meta_slow.capacity = + auto& meta_slab = slab(sc); + meta_slab.capacity = static_cast((meta.slab_mask + 1) / rsize); - meta_slow.waking = mitigations(random_larger_thresholds) ? - static_cast(meta_slow.capacity / 4) : - static_cast(bits::min((meta_slow.capacity / 4), 32)); + meta_slab.waking = mitigations(random_larger_thresholds) ? + static_cast(meta_slab.capacity / 4) : + static_cast(bits::min((meta_slab.capacity / 4), 32)); - if (meta_slow.capacity > max_capacity) + if (meta_slab.capacity > max_capacity) { - max_capacity = meta_slow.capacity; + max_capacity = meta_slab.capacity; } } @@ -269,75 +368,113 @@ namespace snmalloc sizeclass++) { // Calculate reciprocal division constant. - auto& meta = fast_small(sizeclass); + auto& meta = start_small(sizeclass); meta.div_mult = (bits::mask_bits(DIV_MULT_SHIFT) / meta.size) + 1; size_t zero = 0; - meta.mod_zero_mult = (~zero / meta.size) + 1; + align(sizeclass_t::from_small_class(sizeclass)).mod_zero_mult = + (~zero / meta.size) + 1; } for (size_t lc = 0; lc < NUM_LARGE_CLASSES; lc++) { auto lsc = sizeclass_t::from_large_class(lc); - auto& meta = fast(lsc); - // Continuous global exp+mantissa scheme: small classes occupy - // global indices [0, NUM_SMALL_SIZECLASSES); large classes occupy - // [NUM_SMALL_SIZECLASSES, NUM_SMALL_SIZECLASSES + NUM_LARGE_CLASSES). 
+ auto& meta = start(lsc); size_t size = bits::from_exp_mant( NUM_SMALL_SIZECLASSES + lc); meta.size = size; - // Natural alignment of the size: the largest power of two that - // divides `size`. For pow2 sizes, this equals `size`; for non-pow2 - // mantissa steps it is the slab granularity at which the allocation - // tiles. `slab_mask = align - 1`. - size_t align = size & (~size + 1); - meta.slab_mask = align - 1; - // The slab_mask will do all the necessary work, so - // perform identity multiplication for the test. - meta.mod_zero_mult = 1; - // The slab_mask will do all the necessary work for division - // so collapse the calculated offset. + // `slab_mask = (natural alignment of size) - 1`. For pow2 sizes the + // alignment is `size` itself; for non-pow2 mantissa steps it is the + // slab granularity at which the allocation tiles. + size_t align_bytes = size & (~size + 1); + meta.slab_mask = align_bytes - 1; + align(lsc).slab_mask = meta.slab_mask; + // slab_mask does the real work for large classes: `mod_zero_mult = 1` + // is an identity multiply and `div_mult = 0` collapses the division. + align(lsc).mod_zero_mult = 1; meta.div_mult = 0; } + + // Populate offset > 0 rows: same as the (sc, 0) row but with + // `offset_bytes = offset * slab_size` so that `start_of_object` + // collapses to `(addr & ~slab_mask) - offset_bytes`. Read when + // the backend writes per-chunk offsets for multi-slab-tile + // reservations.
+ for (size_t sc_raw = 0; sc_raw < SIZECLASS_REP_SIZE; sc_raw++) + { + const auto& base = start_[sc_raw]; + const size_t slab_size = base.slab_mask + 1; + for (size_t offset = 1; offset < (size_t{1} << OFFSET_BITS); offset++) + { + auto& row = start_[sc_raw | (offset << SIZECLASS_BITS)]; + row.size = base.size; + row.slab_mask = base.slab_mask; + row.div_mult = base.div_mult; + row.offset_bytes = offset * slab_size; + } + } } }; constexpr SizeClassTable sizeclass_metadata = SizeClassTable(); - // Slot 0 of `sizeclass_metadata` is the unmapped sentinel; it must remain - // zero-initialised so fast-path lookups via `fast(sc)` return zero size - // and slab_mask without needing a sentinel check before indexing. + // Slot 0 is the unmapped sentinel. Its `size` must stay zero so + // `start(sc).size` needs no branch, but its slab_mask is `~size_t(0)` + // (not zero) so foreign-pointer `remaining_bytes` wraps around to a + // huge value (see `SizeClassTable::SizeClassTable`). static_assert( - sizeclass_metadata.fast(sizeclass_t{}).size == 0, + sizeclass_metadata.start(sizeclass_t{}).size == 0, "sentinel slot must have size 0"); static_assert( - sizeclass_metadata.fast(sizeclass_t{}).slab_mask == 0, - "sentinel slot must have slab_mask 0"); + sizeclass_metadata.start(sizeclass_t{}).slab_mask == ~size_t(0), + "sentinel slot must have slab_mask ~0 for foreign-pointer " + "remaining_bytes underflow"); static_assert( bits::BITS - sizeclass_metadata.DIV_MULT_SHIFT <= MAX_CAPACITY_BITS); + // Largest slab index for any large class: `OFFSET_BITS` must cover it.
+ constexpr size_t compute_max_large_slab_index() + { + size_t max_idx = 0; + for (size_t lc = 0; lc < NUM_LARGE_CLASSES; lc++) + { + const auto& meta = + sizeclass_metadata.start(sizeclass_t::from_large_class(lc)); + const size_t slab_size = meta.slab_mask + 1; + const size_t reserve = bits::next_pow2_const(meta.size); + const size_t idx = (reserve / slab_size) - 1; + if (idx > max_idx) + max_idx = idx; + } + return max_idx; + } + + static_assert( + compute_max_large_slab_index() < (size_t{1} << OFFSET_BITS), + "OFFSET_BITS must cover the worst-case slab index for any large class"); + constexpr size_t DIV_MULT_SHIFT = sizeclass_metadata.DIV_MULT_SHIFT; constexpr size_t sizeclass_to_size(smallsizeclass_t sizeclass) { - return sizeclass_metadata.fast_small(sizeclass).size; + return sizeclass_metadata.start_small(sizeclass).size; } constexpr size_t sizeclass_full_to_size(sizeclass_t sizeclass) { - return sizeclass_metadata.fast(sizeclass).size; + return sizeclass_metadata.start(sizeclass).size; } constexpr size_t sizeclass_full_to_slab_size(sizeclass_t sizeclass) { - return sizeclass_metadata.fast(sizeclass).slab_mask + 1; + return sizeclass_metadata.start(sizeclass).slab_mask + 1; } constexpr size_t sizeclass_to_slab_size(smallsizeclass_t sizeclass) { - return sizeclass_metadata.fast_small(sizeclass).slab_mask + 1; + return sizeclass_metadata.start_small(sizeclass).slab_mask + 1; } /** @@ -349,7 +486,7 @@ namespace snmalloc */ constexpr uint16_t threshold_for_waking_slab(smallsizeclass_t sizeclass) { - return sizeclass_metadata.slow(sizeclass_t::from_small_class(sizeclass)) + return sizeclass_metadata.slab(sizeclass_t::from_small_class(sizeclass)) .waking; } @@ -367,13 +504,14 @@ namespace snmalloc constexpr uint16_t sizeclass_to_slab_object_count(smallsizeclass_t sizeclass) { - return sizeclass_metadata.slow(sizeclass_t::from_small_class(sizeclass)) + return sizeclass_metadata.slab(sizeclass_t::from_small_class(sizeclass)) .capacity; } - SNMALLOC_FAST_PATH 
constexpr size_t slab_index(sizeclass_t sc, address_t addr) + SNMALLOC_FAST_PATH constexpr size_t + slab_index(offset_and_sizeclass_t osc, address_t addr) { - auto meta = sizeclass_metadata.fast(sc); + auto meta = sizeclass_metadata.start(osc); size_t offset = addr & meta.slab_mask; if constexpr (sizeof(offset) >= 8) { @@ -398,29 +536,54 @@ namespace snmalloc } } + /** + * Recover the start address of the allocation containing `addr`. + * + * Branch on `osc.offset() == 0` (testable from bits already loaded + * into `osc.raw()`, before any metadata-table access). The common + * case skips the `offset_bytes` field load and four extra arithmetic + * insns; the slow arm handles non-pow2 large interior chunks where + * the slab base must be shifted back to the allocation base. + */ SNMALLOC_FAST_PATH constexpr address_t - start_of_object(sizeclass_t sc, address_t addr) + start_of_object(offset_and_sizeclass_t osc, address_t addr) { - auto meta = sizeclass_metadata.fast(sc); - address_t slab_start = addr & ~meta.slab_mask; - size_t index = slab_index(sc, addr); - return slab_start + (index * meta.size); + auto meta = sizeclass_metadata.start(osc); + if (SNMALLOC_LIKELY(osc.offset() == 0)) + { + address_t slab_base = addr & ~meta.slab_mask; + size_t index = slab_index(osc, addr); + return slab_base + (index * meta.size); + } + address_t alloc_start = (addr & ~meta.slab_mask) - meta.offset_bytes; + size_t index = slab_index(osc, addr - alloc_start); + return alloc_start + (index * meta.size); } - constexpr size_t index_in_object(sizeclass_t sc, address_t addr) + SNMALLOC_FAST_PATH constexpr size_t + index_in_object(offset_and_sizeclass_t osc, address_t addr) { - return addr - start_of_object(sc, addr); + return addr - start_of_object(osc, addr); } - constexpr size_t remaining_bytes(sizeclass_t sc, address_t addr) + SNMALLOC_FAST_PATH constexpr size_t + remaining_bytes(offset_and_sizeclass_t osc, address_t addr) { - return sizeclass_metadata.fast(sc).size - 
index_in_object(sc, addr); + return sizeclass_metadata.start(osc).size - index_in_object(osc, addr); } + /** + * True iff `addr` is correctly aligned for an object of this + * sizeclass within its slab. Does NOT check whether `addr` lies in + * the first slab tile of a non-pow2 large allocation; callers that + * could be looking at an interior chunk must read the + * `offset_and_sizeclass_t` from the pagemap and use that overload + * instead. + */ constexpr bool is_start_of_object(sizeclass_t sc, address_t addr) { - size_t offset = addr & (sizeclass_full_to_slab_size(sc) - 1); - + auto meta = sizeclass_metadata.align(sc); + size_t offset = addr & meta.slab_mask; // Only works up to certain offsets, exhaustively tested by rounding.cc if constexpr (sizeof(offset) >= 8) { @@ -428,8 +591,7 @@ namespace snmalloc // 32bit. // This is based on: // https://lemire.me/blog/2019/02/20/more-fun-with-fast-remainders-when-the-divisor-is-a-constant/ - auto mod_zero_mult = sizeclass_metadata.fast(sc).mod_zero_mult; - return (offset * mod_zero_mult) < mod_zero_mult; + return (offset * meta.mod_zero_mult) < meta.mod_zero_mult; } else // Use 32-bit division as considerably faster than 64-bit, and @@ -437,12 +599,24 @@ namespace snmalloc return static_cast(offset % sizeclass_full_to_size(sc)) == 0; } + /** + * True iff `addr` is the start of an object. Interior chunks of + * non-pow2 large allocations carry `offset_bytes != 0`; only the + * first slab tile holds an allocation base, so a non-zero + * `offset_bytes` short-circuits to false. + */ + constexpr bool is_start_of_object(offset_and_sizeclass_t osc, address_t addr) + { + if (sizeclass_metadata.start(osc).offset_bytes != 0) + return false; + return is_start_of_object(osc.sizeclass(), addr); + } + inline static size_t large_size_to_chunk_size(size_t size) { return bits::next_pow2(size); } - constexpr SNMALLOC_PURE size_t sizeclass_lookup_index(const size_t s) { // We subtract and shift to reduce the size of the table, i.e. 
we don't have @@ -476,7 +650,7 @@ namespace snmalloc for (; sizeclass < minimum_class; sizeclass++) { for (; curr <= - sizeclass_metadata.fast_small(smallsizeclass_t(sizeclass)).size; + sizeclass_metadata.start_small(smallsizeclass_t(sizeclass)).size; curr += MIN_ALLOC_STEP_SIZE) { table[sizeclass_lookup_index(curr)] = minimum_class; @@ -486,7 +660,7 @@ namespace snmalloc for (; sizeclass < NUM_SMALL_SIZECLASSES; sizeclass++) { for (; curr <= - sizeclass_metadata.fast_small(smallsizeclass_t(sizeclass)).size; + sizeclass_metadata.start_small(smallsizeclass_t(sizeclass)).size; curr += MIN_ALLOC_STEP_SIZE) { auto i = sizeclass_lookup_index(curr); @@ -516,17 +690,8 @@ namespace snmalloc } /** - * Maps a requested size to its sizeclass. The result uses the unified - * encoding documented on `sizeclass_t`. - * - * For small sizes, this delegates to `size_to_sizeclass`. For large - * sizes in Phase 13, this rounds up to the next power of two (the - * front end still requests pow2-rounded reservations); Phase 15 - * removes the `next_pow2` call to enable non-pow2 large reservations. - * - * `to_exp_mant` is the literal inverse of the `from_exp_mant` used - * when populating `sizeclass_metadata`, so this never indexes the - * wrong slot. + * Map a requested size to its sizeclass. Large requests are rounded up + * to the next power of two. 
*/ static inline sizeclass_t size_to_sizeclass_full(size_t size) { diff --git a/src/snmalloc/global/globalalloc.h b/src/snmalloc/global/globalalloc.h index eb20210fc..1d7f05a18 100644 --- a/src/snmalloc/global/globalalloc.h +++ b/src/snmalloc/global/globalalloc.h @@ -138,9 +138,7 @@ namespace snmalloc size_t SNMALLOC_FAST_PATH_INLINE remaining_bytes(address_t p) { const auto& entry = Config_::Backend::template get_metaentry(p); - - auto sizeclass = entry.get_sizeclass(); - return snmalloc::remaining_bytes(sizeclass, p); + return snmalloc::remaining_bytes(entry.get_offset_and_sizeclass(), p); } template @@ -159,9 +157,7 @@ namespace snmalloc static inline size_t index_in_object(address_t p) { const auto& entry = Config_::Backend::template get_metaentry(p); - - auto sizeclass = entry.get_sizeclass(); - return snmalloc::index_in_object(sizeclass, p); + return snmalloc::index_in_object(entry.get_offset_and_sizeclass(), p); } enum Boundary @@ -230,7 +226,8 @@ namespace snmalloc { const auto& entry = Config_::Backend::get_metaentry(address_cast(p)); - size_t index = slab_index(entry.get_sizeclass(), address_cast(p)); + size_t index = + slab_index(entry.get_offset_and_sizeclass(), address_cast(p)); auto* meta_slab = entry.get_slab_metadata(); @@ -259,7 +256,8 @@ namespace snmalloc const auto& entry = Config_::Backend::template get_metaentry(address_cast(p)); - size_t index = slab_index(entry.get_sizeclass(), address_cast(p)); + size_t index = + slab_index(entry.get_offset_and_sizeclass(), address_cast(p)); auto* meta_slab = entry.get_slab_metadata(); diff --git a/src/snmalloc/mem/corealloc.h b/src/snmalloc/mem/corealloc.h index 10482b6b7..fa5f1389c 100644 --- a/src/snmalloc/mem/corealloc.h +++ b/src/snmalloc/mem/corealloc.h @@ -533,7 +533,7 @@ namespace snmalloc snmalloc_check_client( mitigations(sanity_checks), - is_start_of_object(entry.get_sizeclass(), address_cast(msg)), + is_start_of_object(entry.get_offset_and_sizeclass(), address_cast(msg)), "Not deallocating start 
of an object"); size_t objsize = sizeclass_full_to_size(entry.get_sizeclass()); @@ -1080,7 +1080,7 @@ namespace snmalloc snmalloc_check_client( mitigations(sanity_checks), - is_start_of_object(entry.get_sizeclass(), address_cast(p)), + is_start_of_object(entry.get_offset_and_sizeclass(), address_cast(p)), "Not deallocating start of an object"); auto cp = p.as_static>(); diff --git a/src/snmalloc/mem/metadata.h b/src/snmalloc/mem/metadata.h index 3f65f39b6..cfc13755e 100644 --- a/src/snmalloc/mem/metadata.h +++ b/src/snmalloc/mem/metadata.h @@ -9,12 +9,11 @@ namespace snmalloc struct RemoteAllocator; /** - * Remotes need to be aligned enough that the bottom bits have enough room for - * all the size classes, both large and small. An additional bit is required - * to separate backend uses. + * RemoteAllocator pointers must have their low `COMBINED_BITS` zero + * so the (sizeclass, offset) field can be OR-ed in by `encode`. */ static constexpr size_t REMOTE_MIN_ALIGN = - bits::max(CACHELINE_SIZE, SIZECLASS_REP_SIZE) << 1; + bits::max(CACHELINE_SIZE, COMBINED_REP_SIZE); /** * Base class for the templated FrontendMetaEntry. This exists to avoid @@ -33,19 +32,18 @@ namespace snmalloc { protected: /** - * This bit is set in remote_and_sizeclass to discriminate between the case - * that it is in use by the frontend (0) or by the backend (1). For the - * former case, see other methods on this and the subclass - * `FrontendMetaEntry`; for the latter, see backend/backend.h and - * backend/largebuddyrange.h. - * - * This value is statically checked by the frontend to ensure that its - * bit packing does not conflict; see mem/remoteallocator.h. The marker - * tracks the sizeclass-encoding width (see `SIZECLASS_REP_SIZE` in - * ds/sizeclasstable.h): it must sit immediately above the highest bit - * used by a sizeclass raw value. + * Low bits of `remote_and_sizeclass` holding the sizeclass alone. 
+ */ + static constexpr address_t SIZECLASS_MASK = SIZECLASS_REP_SIZE - 1; + + /** + * Low bits of `remote_and_sizeclass` holding the (sizeclass, offset) + * pair. Also the markerless ownership discriminator: + * `(ras & COMBINED_MASK) == 0` iff the entry is NOT in active + * frontend use (frontend entries always have sizeclass != 0; slot 0 + * is the unmapped sentinel). */ - static constexpr address_t REMOTE_BACKEND_MARKER = SIZECLASS_REP_SIZE; + static constexpr address_t COMBINED_MASK = COMBINED_REP_SIZE - 1; /** * Bit used to indicate this should not be considered part of the previous @@ -59,14 +57,12 @@ namespace snmalloc static constexpr address_t META_BOUNDARY_BIT = 1 << 0; /** - * The bit above the sizeclass is always zero unless this is used - * by the backend to represent another datastructure such as the buddy - * allocator entries. + * Alignment used by `get_remote` to mask off the (sizeclass, offset) + * bits and recover the `RemoteAllocator*` payload. */ static constexpr size_t REMOTE_WITH_BACKEND_MARKER_ALIGN = - MetaEntryBase::REMOTE_BACKEND_MARKER; - static_assert( - (REMOTE_MIN_ALIGN >> 1) == MetaEntryBase::REMOTE_BACKEND_MARKER); + COMBINED_REP_SIZE; + static_assert(REMOTE_MIN_ALIGN >= COMBINED_REP_SIZE); /** * In common cases, the pointer to the slab metadata. See @@ -98,42 +94,38 @@ namespace snmalloc constexpr MetaEntryBase() : MetaEntryBase(0, 0) {} /** - * When a meta entry is in use by the back end, it exposes two words of - * state. The low bits in both are reserved. Bits in this bitmask must - * not be set by the back end in either word. - * - * During a major release, this constraint may be weakened, allowing the - * back end to set more bits. We don't currently use all of these bits in - * both words, but we reserve them all to make access uniform. If more - * bits are required by a back end then we could make this asymmetric. + * Per-word frontend-reserved masks. 
Bits in these masks are owned by + * the frontend; the backend must preserve them on writes (enforced + * by `BackendStateWordRef::operator=`). * - * `REMOTE_BACKEND_MARKER` is the highest bit that we reserve, so this is - * currently every bit including that bit and all lower bits. + * - Word::One reserves `META_BOUNDARY_BIT` so PAL-allocation + * boundaries survive ownership transitions. + * - Word::Two reserves `COMBINED_MASK`; the markerless ownership + * discriminator requires these bits to be zero in backend mode, + * and backend writes here are chunk-aligned so the requirement + * is naturally satisfied. */ - static constexpr address_t BACKEND_RESERVED_MASK = - (REMOTE_BACKEND_MARKER << 1) - 1; + static constexpr address_t BACKEND_RESERVED_MASK_WORD_ONE = + META_BOUNDARY_BIT; + static constexpr address_t BACKEND_RESERVED_MASK_WORD_TWO = COMBINED_MASK; public: /** - * Bit position of the first bit available to backend metadata layouts - * above the reserved region. The reserved region runs from bit 0 up to - * and including the `REMOTE_BACKEND_MARKER` bit; layouts in - * `largearenarange.h` and `largebuddyrange.h` derive their bit - * positions (RED_BIT, VARIANT_SHIFT, LARGE_SIZE_SHIFT, ...) from this. + * First bit on Word::One available for backend layouts; the bits + * below are frontend-reserved. Backends in `largearenarange.h` + * derive `RED_BIT`, `VARIANT_SHIFT`, etc. from this. */ - static constexpr size_t BACKEND_LAYOUT_FIRST_FREE_BIT = - bits::next_pow2_bits_const(REMOTE_BACKEND_MARKER) + 1; + static constexpr size_t BACKEND_LAYOUT_FIRST_FREE_BIT = 1; /** - * Does the back end currently own this entry? Note that freshly - * allocated entries are owned by the front end until explicitly - * claimed by the back end and so this will return `false` if neither - * the front nor back end owns this entry. + * True iff this entry is not in active frontend use (backend-claimed + * or untouched). 
Frontend entries always have `sizeclass != 0` + * (slot 0 is the unmapped sentinel), so the discriminator + * `(ras & COMBINED_MASK) == 0` distinguishes them. */ [[nodiscard]] bool is_backend_owned() const { - return (REMOTE_BACKEND_MARKER & remote_and_sizeclass) == - REMOTE_BACKEND_MARKER; + return (remote_and_sizeclass & COMBINED_MASK) == 0; } /** @@ -147,14 +139,19 @@ namespace snmalloc } /** - * Encode the remote and the sizeclass. + * Pack `remote`, `sizeclass`, and the per-chunk slab offset into a + * `remote_and_sizeclass` word. `offset` defaults to 0; the backend's + * multi-slab-tile write loop in `alloc_chunk` overrides it with the + * chunk's slab index so `start_of_object` can recover the + * allocation base. */ [[nodiscard]] static SNMALLOC_FAST_PATH uintptr_t - encode(RemoteAllocator* remote, sizeclass_t sizeclass) + encode(RemoteAllocator* remote, sizeclass_t sizeclass, size_t offset = 0) { /* remote might be nullptr; cast to uintptr_t before offsetting */ return pointer_offset( - reinterpret_cast(remote), sizeclass.raw()); + reinterpret_cast(remote), + offset_and_sizeclass_t(sizeclass, offset).raw()); } /** @@ -211,14 +208,14 @@ namespace snmalloc ///@} /** - * Returns the remote. - * - * If the meta entry is owned by the back end then this returns an - * undefined value and will abort in debug builds. + * Return the `RemoteAllocator*` payload by masking off the low + * `COMBINED_BITS`. Callable in any state: for unowned entries + * yields nullptr; for backend-owned entries yields a chunk address + * which compares unequal to any allocator's `public_state()`, so + * dispatch falls through to the slow path. 
*/ [[nodiscard]] SNMALLOC_FAST_PATH RemoteAllocator* get_remote() const { - SNMALLOC_ASSERT(!is_backend_owned()); return reinterpret_cast( pointer_align_down( get_remote_and_sizeclass())); @@ -246,19 +243,31 @@ namespace snmalloc // TODO: perhaps remove static_cast with resolution of // https://github.com/CTSRD-CHERI/llvm-project/issues/588 return sizeclass_t::from_raw( - static_cast(get_remote_and_sizeclass()) & - (REMOTE_WITH_BACKEND_MARKER_ALIGN - 1)); + static_cast(get_remote_and_sizeclass()) & SIZECLASS_MASK); + } + + /** + * Return the (sizeclass, slab offset) pair indexing + * `sizeclass_metadata.start_`. The selected row carries + * `offset_bytes = offset * slab_size` precomputed, so + * `start_of_object` recovers the allocation base with a single + * subtract. + */ + [[nodiscard]] SNMALLOC_FAST_PATH offset_and_sizeclass_t + get_offset_and_sizeclass() const + { + return offset_and_sizeclass_t::from_raw( + static_cast(get_remote_and_sizeclass()) & COMBINED_MASK); } /** - * Claim the meta entry for use by the back end. This preserves the - * boundary bit, if it is set, but otherwise resets the meta entry to a - * pristine state. + * Claim the meta entry for the backend: preserves the boundary bit + * and zeros `remote_and_sizeclass` so `is_backend_owned()` holds. */ void claim_for_backend() { meta = is_boundary() ? META_BOUNDARY_BIT : 0; - remote_and_sizeclass = REMOTE_BACKEND_MARKER; + remote_and_sizeclass = 0; } /** @@ -279,9 +288,11 @@ namespace snmalloc Two }; - static constexpr bool is_backend_allowed_value(Word, uintptr_t val) + static constexpr bool is_backend_allowed_value(Word w, uintptr_t val) { - return (val & BACKEND_RESERVED_MASK) == 0; + const address_t mask = (w == Word::One) ? BACKEND_RESERVED_MASK_WORD_ONE : + BACKEND_RESERVED_MASK_WORD_TWO; + return (val & mask) == 0; } /** @@ -298,6 +309,14 @@ namespace snmalloc */ uintptr_t* val; + /** + * The frontend-reserved mask for the word that `val` points at. 
Bits + * in this mask are owned by the frontend: `get()` clears them on + * read, and `operator=` preserves them on write (by OR-ing the + * current value's masked bits into the new value). + */ + address_t reserved_mask{0}; + public: /** * Uninitialised constructor. @@ -305,9 +324,24 @@ namespace snmalloc BackendStateWordRef() = default; /** - * Constructor, wraps a `uintptr_t`. Note that this may be used outside - * of the meta entry by code wishing to provide uniform storage to things - * that are either in a meta entry or elsewhere. + * Constructor, wraps a `uintptr_t` and the frontend-reserved mask + * that applies to that word. Note that this may be used outside of + * the meta entry by code wishing to provide uniform storage to + * things that are either in a meta entry or elsewhere. + */ + constexpr BackendStateWordRef(uintptr_t* v, address_t mask) + : val(v), reserved_mask(mask) + {} + + /** + * Single-pointer constructor required by the `RBRepMethods` + * concept, which constructs a Handle from `&Rep::root` to + * verify sentinel constructibility (see + * `ds_core/redblacktree.h`). Reserved mask is zero, which is + * safe because `Rep::root` is a `static const` sentinel that + * the red-black tree never assigns through — any write would + * trap on the const data — and on read the underlying value is + * zero so `get()` returns zero regardless of the mask. */ constexpr BackendStateWordRef(uintptr_t* v) : val(v) {} @@ -325,7 +359,7 @@ namespace snmalloc */ [[nodiscard]] uintptr_t get() const { - return (*val) & ~BACKEND_RESERVED_MASK; + return (*val) & ~reserved_mask; } /** @@ -343,13 +377,13 @@ namespace snmalloc BackendStateWordRef& operator=(uintptr_t v) { SNMALLOC_ASSERT_MSG( - ((v & BACKEND_RESERVED_MASK) == 0), - "The back end is not permitted to use the low bits in the meta " - "entry. ({} & {}) == {}.", + ((v & reserved_mask) == 0), + "The back end is not permitted to use the reserved bits in the " + "meta entry. 
({} & {}) == {}.", v, - BACKEND_RESERVED_MASK, - (v & BACKEND_RESERVED_MASK)); - *val = v | (static_cast(*val) & BACKEND_RESERVED_MASK); + reserved_mask, + (v & reserved_mask)); + *val = v | (static_cast(*val) & reserved_mask); return *this; } @@ -389,7 +423,10 @@ namespace snmalloc remote_and_sizeclass); claim_for_backend(); } - return {w == Word::One ? &meta : &remote_and_sizeclass}; + return (w == Word::One) ? + BackendStateWordRef{&meta, BACKEND_RESERVED_MASK_WORD_ONE} : + BackendStateWordRef{ + &remote_and_sizeclass, BACKEND_RESERVED_MASK_WORD_TWO}; } }; @@ -756,14 +793,7 @@ namespace snmalloc SNMALLOC_FAST_PATH FrontendMetaEntry(SlabMetadata* meta, uintptr_t remote_and_sizeclass) : MetaEntryBase(unsafe_to_uintptr(meta), remote_and_sizeclass) - { - SNMALLOC_ASSERT_MSG( - (REMOTE_BACKEND_MARKER & remote_and_sizeclass) == 0, - "Setting a backend-owned value ({}) via the front-end interface is not " - "allowed", - remote_and_sizeclass); - remote_and_sizeclass &= ~REMOTE_BACKEND_MARKER; - } + {} /** * Implicit copying of meta entries is almost certainly a bug and so the @@ -782,13 +812,13 @@ namespace snmalloc } /** - * Return the FrontendSlabMetadata metadata associated with this chunk, - * guarded by an assert that this chunk is being used as a slab (i.e., has - * an associated owning allocator). + * Return the FrontendSlabMetadata pointer. Only meaningful when the + * entry is frontend-owned; in other states the underlying word + * holds tree-node fields. Callers must verify ownership first + * (the standard idiom is `entry.get_remote() == self->public_state()`). 
*/ [[nodiscard]] SNMALLOC_FAST_PATH SlabMetadata* get_slab_metadata() const { - SNMALLOC_ASSERT(!is_backend_owned()); return unsafe_from_uintptr(meta & ~META_BOUNDARY_BIT); } }; diff --git a/src/snmalloc/override/rust.cc b/src/snmalloc/override/rust.cc index d2e7e2e08..86ac6f5f8 100644 --- a/src/snmalloc/override/rust.cc +++ b/src/snmalloc/override/rust.cc @@ -41,8 +41,8 @@ extern "C" SNMALLOC_EXPORT void* SNMALLOC_NAME_MANGLE(rust_realloc)( if ( aligned_old_size <= MAX_LARGE_SIZECLASS_SIZE && aligned_new_size <= MAX_LARGE_SIZECLASS_SIZE && - size_to_sizeclass_full(aligned_old_size).raw() == - size_to_sizeclass_full(aligned_new_size).raw()) + size_to_sizeclass_full(aligned_old_size) == + size_to_sizeclass_full(aligned_new_size)) return ptr; void* p = alloc(aligned_new_size); if (p) diff --git a/src/test/func/arena/arena.cc b/src/test/func/arena/arena.cc index 6d6a18a21..9ccb83099 100644 --- a/src/test/func/arena/arena.cc +++ b/src/test/func/arena/arena.cc @@ -1455,7 +1455,7 @@ namespace snmalloc // tree-membership tests gate the can_consolidate read. MockRep's // can_consolidate now dereferences mock_store via mock_index, which // asserts on out-of-range indices, so an unguarded probe in - // add_block trips here rather than only as a segfault in production + // add_block trips here rather than only as a segfault in release // builds. static void test_block_at_arena_top_edge() { diff --git a/src/test/func/arenabins/arenabins.cc b/src/test/func/arenabins/arenabins.cc index 05f1ee308..65e24ba37 100644 --- a/src/test/func/arenabins/arenabins.cc +++ b/src/test/func/arenabins/arenabins.cc @@ -33,8 +33,8 @@ namespace snmalloc * Friend struct exposing private internals of * `ArenaBins` (and its nested `Bitmap`) * for unit tests. Forward-declared in `arenabins.h`; - * defined here so the production header carries no test-only - * surface. + * defined here to keep the test-access implementation out of the + * in-tree header. 
*/ template struct ArenaBinsTestAccess @@ -82,7 +82,7 @@ namespace snmalloc // --- Raw size-class id access --- // // The bin scheme assigns a dense raw id in `[0, MAX_SC)` to each - // size class. Production code never names these (the fast path + // size class. In-tree callers never name these (the fast path // goes straight from request size to the bitmap-scan / carve // record). Tests cross-check the encoding via the helpers below; // the alias `sc_t = size_t` preserves the existing test diff --git a/src/test/func/large_offset/large_offset.cc b/src/test/func/large_offset/large_offset.cc new file mode 100644 index 000000000..e7c45c246 --- /dev/null +++ b/src/test/func/large_offset/large_offset.cc @@ -0,0 +1,225 @@ +/** + * Targeted test for the per-chunk pagemap offset write path in + * `BackendAllocator::alloc_chunk`. + * + * The front end currently only issues pow2 large requests (the + * `slab_size >= size` fast path), so the multi-slab-tile branch in + * `alloc_chunk` writing per-chunk offsets is otherwise unreachable + * from the in-tree allocation paths. This test reaches it via the + * public backend API. + * + * Method: + * - Pick a non-pow2 large sizeclass `sc` whose + * `sizeclass_full_to_slab_size(sc) < sizeclass_full_to_size(sc)`, + * so the multi-slab-tile branch triggers. + * - Compute the pow2 reservation `next_pow2(size)` (the size + * `alloc_chunk` asserts). + * - Call `Config::Backend::alloc_chunk` directly with that pow2 size + * and the non-pow2 sc. + * - For each chunk in the pow2 region verify the pagemap entry's + * `get_offset_and_sizeclass()` decomposes into the expected + * (sc, slab_index) pair. + * - For sampled interior addresses verify that + * `remaining_bytes` / `index_in_object` return positions within + * the logical allocation. + * - Verify `is_start_of_object` behaviour: true at the allocation + * base, false elsewhere. + * - `dealloc_chunk` and verify entries clear back to "not + * frontend-owned" (low COMBINED_BITS == 0). 
+ */ + +#include "test/setup.h" + +#include +#include +#include + +#ifdef assert +# undef assert +#endif +#define assert please_use_SNMALLOC_ASSERT + +using namespace snmalloc; + +using CustomGlobals = FixedRangeConfig>; +using FixedAlloc = Allocator; + +namespace +{ + bool any_failures = false; + + void fail(const char* msg) + { + std::cout << "FAIL: " << msg << std::endl; + any_failures = true; + } + + /** + * Find the smallest non-pow2 large sizeclass: one where slab_size < + * size. Returns sizeclass_t{} (the unmapped sentinel) if none exists + * in this configuration. + */ + sizeclass_t find_non_pow2_large_sc() + { + for (size_t lc = 0; lc < NUM_LARGE_CLASSES; lc++) + { + auto sc = sizeclass_t::from_large_class(lc); + const size_t size = sizeclass_full_to_size(sc); + const size_t slab_size = sizeclass_full_to_slab_size(sc); + if (slab_size < size) + return sc; + } + return sizeclass_t{}; + } + + void test_per_chunk_offset() + { + auto sc = find_non_pow2_large_sc(); + if (sc.raw() == 0) + { + std::cout << "No non-pow2 large sizeclass available in this config; " + "skipping per-chunk offset test." + << std::endl; + return; + } + const size_t size = sizeclass_full_to_size(sc); + const size_t slab_size = sizeclass_full_to_slab_size(sc); + const size_t reserve = bits::next_pow2(size); + + std::cout << "non-pow2 sc raw=" << sc.raw() << " size=" << size + << " slab_size=" << slab_size << " reserve=" << reserve + << std::endl; + + // Set up an isolated FixedRangeConfig allocator. FixedRangeConfig + // owns its own pagemap and never reclaims `region_base`; the + // reservation is released when the process exits. For a multi- + // test harness, explicit teardown would be required here. 
+ const size_t region = bits::one_at_bit(28); + auto region_base = DefaultPal::reserve(region); + DefaultPal::notify_using(region_base, region); + CustomGlobals::init(nullptr, region_base, region); + + auto a = get_scoped_allocator(); + + using Backend = typename CustomGlobals::Backend; + using Entry = typename CustomGlobals::PagemapEntry; + + // Construct the encoded ras the way the front end does (offset=0). + const uintptr_t ras_in = Entry::encode(nullptr, sc); + + auto [chunk, slab_meta] = + Backend::alloc_chunk(a->get_backend_local_state(), reserve, ras_in, sc); + if (chunk == nullptr) + { + fail("alloc_chunk returned null"); + return; + } + + const address_t base = address_cast(chunk); + std::cout << "Allocated chunk base=" << reinterpret_cast(base) + << " reserve=" << reserve << std::endl; + + // Verify per-chunk pagemap entries. + for (size_t chunk_offset = 0; chunk_offset < reserve; + chunk_offset += MIN_CHUNK_SIZE) + { + const size_t expected_slab_index = chunk_offset / slab_size; + const auto& entry = Backend::get_metaentry(base + chunk_offset); + const offset_and_sizeclass_t osc = entry.get_offset_and_sizeclass(); + const offset_and_sizeclass_t expected_osc = + offset_and_sizeclass_t(sc, expected_slab_index); + if (!(osc == expected_osc)) + { + std::cout << "Chunk @+" << chunk_offset << " osc=" << osc.raw() + << " expected=" << expected_osc.raw() << " (sc=" << sc.raw() + << " idx=" << expected_slab_index << ")" << std::endl; + fail("offset_and_sizeclass mismatch"); + } + // The pure sizeclass mask must still report `sc`. + if (!(entry.get_sizeclass() == sc)) + { + std::cout << "Chunk @+" << chunk_offset << " get_sizeclass mismatch" + << std::endl; + fail("get_sizeclass mismatch on offset>0 chunk"); + } + } + + // For an interior address in each chunk that lies within the + // *logical* allocation (size, not the pow2 reservation), + // remaining_bytes / index_in_object should report position within + // the allocation. 
+ for (size_t chunk_offset = 0; chunk_offset < size; + chunk_offset += MIN_CHUNK_SIZE) + { + const address_t addr = base + chunk_offset; + const size_t rem = snmalloc::remaining_bytes(addr); + if (rem != size - chunk_offset) + { + std::cout << "remaining_bytes @+" << chunk_offset << " = " << rem + << " expected " << (size - chunk_offset) << std::endl; + fail("remaining_bytes mismatch"); + } + const size_t idx = snmalloc::index_in_object(addr); + if (idx != chunk_offset) + { + std::cout << "index_in_object @+" << chunk_offset << " = " << idx + << " expected " << chunk_offset << std::endl; + fail("index_in_object mismatch"); + } + } + + // Direct is_start_of_object checks: the allocation base address + // must be a start-of-object; an interior address inside the first + // slab tile (offset_bytes == 0 in pagemap) but not at the base + // must NOT; and an address in any non-first slab tile + // (offset_bytes != 0 in pagemap) must NOT. + { + const auto& base_entry = Backend::get_metaentry(base); + if (!is_start_of_object(base_entry.get_offset_and_sizeclass(), base)) + fail("base address not reported as start-of-object"); + if (is_start_of_object(base_entry.get_offset_and_sizeclass(), base + 1)) + fail("base+1 incorrectly reported as start-of-object"); + } + if (size > slab_size) + { + const address_t second_slab = base + slab_size; + const auto& second_entry = Backend::get_metaentry(second_slab); + if (is_start_of_object( + second_entry.get_offset_and_sizeclass(), second_slab)) + fail("second slab tile base incorrectly reported as start-of-object"); + } + + // Tear down: dealloc the chunk and verify the per-chunk pagemap + // entries no longer report as frontend-owned. 
+ auto alloc_cap = + capptr_chunk_is_alloc(capptr_to_user_address_control(chunk)); + Backend::dealloc_chunk( + a->get_backend_local_state(), *slab_meta, alloc_cap, reserve, sc); + + for (size_t chunk_offset = 0; chunk_offset < reserve; + chunk_offset += MIN_CHUNK_SIZE) + { + const auto& entry = Backend::get_metaentry(base + chunk_offset); + if (!entry.is_backend_owned()) + { + std::cout << "Chunk @+" << chunk_offset + << " not backend-owned after dealloc; osc=" + << entry.get_offset_and_sizeclass().raw() << std::endl; + fail("dealloc didn't reset per-chunk offset"); + } + } + } +} // namespace + +int main() +{ + setup(); + test_per_chunk_offset(); + if (any_failures) + { + std::cout << "FAILED" << std::endl; + return 1; + } + std::cout << "PASSED" << std::endl; + return 0; +} diff --git a/src/test/func/release-rounding/rounding.cc b/src/test/func/release-rounding/rounding.cc index 13155678a..d1c556ed9 100644 --- a/src/test/func/release-rounding/rounding.cc +++ b/src/test/func/release-rounding/rounding.cc @@ -18,18 +18,49 @@ int main(int argc, char** argv) bool failed = false; + // Layout invariant: osc(sc, off).raw() == sc.raw() | (off << SIZECLASS_BITS), + // and the accessors invert that layout. This is load-bearing because + // `SizeClassTable::start(sizeclass_t)` and `start(offset_and_sizeclass_t)` + // both index by `.raw()`, so an offset=0 osc must hit the same table + // row as the bare sizeclass_t; the offset>0 row-population loop in + // the SizeClassTable ctor relies on the same layout. If any of this + // drifts, `encode()` in metadata.h would silently produce wrong bits. 
+ for (smallsizeclass_t sc_small; sc_small < NUM_SMALL_SIZECLASSES; sc_small++) + { + sizeclass_t sc = sizeclass_t::from_small_class(sc_small); + for (size_t off = 0; off < (size_t{1} << OFFSET_BITS); off++) + { + auto osc = offset_and_sizeclass_t(sc, off); + size_t expected_raw = sc.raw() | (off << SIZECLASS_BITS); + if ( + osc.raw() != expected_raw || osc.sizeclass() != sc || + osc.offset() != off) + { + std::cout << "osc layout mismatch: sc=" << sc.raw() << " off=" << off + << " -> raw=" << osc.raw() << " expected_raw=" << expected_raw + << " sc'=" << osc.sizeclass().raw() + << " off'=" << osc.offset() << std::endl + << std::flush; + failed = true; + } + } + } + if (failed) + abort(); + for (smallsizeclass_t size_class; size_class < NUM_SMALL_SIZECLASSES; size_class++) { size_t rsize = sizeclass_to_size(size_class); size_t max_offset = sizeclass_to_slab_size(size_class); sizeclass_t sc = sizeclass_t::from_small_class(size_class); + offset_and_sizeclass_t osc = offset_and_sizeclass_t(sc, 0); for (size_t offset = 0; offset < max_offset; offset++) { size_t mod = offset % rsize; bool mod_0 = (offset % rsize) == 0; - size_t opt_mod = index_in_object(sc, offset); + size_t opt_mod = index_in_object(osc, offset); if (mod != opt_mod) { std::cout << "rsize " << rsize << " offset " << offset << " opt " @@ -38,7 +69,7 @@ int main(int argc, char** argv) failed = true; } - bool opt_mod_0 = is_start_of_object(sc, offset); + bool opt_mod_0 = is_start_of_object(osc, offset); if (opt_mod_0 != mod_0) { std::cout << "rsize " << rsize << " offset " << offset @@ -63,6 +94,7 @@ int main(int argc, char** argv) { size_t S = bits::one_at_bit(b); sizeclass_t sc = size_to_sizeclass_full(S); + offset_and_sizeclass_t osc = offset_and_sizeclass_t(sc, 0); address_t base = address_t(0); size_t offsets[] = {0, 1, S / 2, S - 1, S}; @@ -72,7 +104,7 @@ int main(int argc, char** argv) size_t expected_mod = off % S; bool expected_start = expected_mod == 0; - size_t opt_mod = index_in_object(sc, 
addr); + size_t opt_mod = index_in_object(osc, addr); if (opt_mod != expected_mod) { std::cout << "Large S=" << S << " offset=" << off @@ -81,7 +113,7 @@ int main(int argc, char** argv) failed = true; } - bool opt_start = is_start_of_object(sc, addr); + bool opt_start = is_start_of_object(osc, addr); if (opt_start != expected_start) { std::cout << "Large S=" << S << " offset=" << off From 622930a37ca27fc06ed2cb9a728b962a77732186 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Thu, 4 Jun 2026 13:57:56 +0100 Subject: [PATCH 11/15] Front-end: request non-pow2 large allocations A request like malloc(70 KiB) at the default INTERMEDIATE_BITS = 2 now reserves the smallest enclosing exp+mantissa sizeclass (80 KiB) rather than next_pow2(size) (128 KiB). Sizes that already land on a class boundary reserve exactly that size; mid-exponent sizes shrink by up to ~33%. Mechanics: sizeclasstable.h - size_to_sizeclass_full drops next_pow2(size); to_exp_mant ceils directly to the smallest enclosing class. - round_size's large branch matches the reservation (sizeclass_full_to_size of the chosen class), so DefaultConts::success zeroes exactly the reservation for calloc. - large_size_to_chunk_size removed (the one caller in corealloc uses sizeclass_full_to_size(sc) directly with a hoisted sc). - compute_max_large_slab_index tightened to meta.size / slab_size - 1 (the actual worst case the runtime pagemap loop writes). backend.h - alloc_chunk's pow2 precondition relaxed to the slab-tile invariant: size is a positive multiple of slab_size. corealloc.h - large alloc path hoists size_to_sizeclass_full / chunk size into locals so each table lookup happens once. Tests: - large_offset_frontend/: new front-end counterpart to large_offset/. Exhaustively round-trips every large sizeclass and walks every chunk-aligned interior pointer for a boundary and a non-boundary request. 
- memory/: adds test_calloc_non_pow2_large as a calloc zeroing smoke test; clamps the end-of-stride probe in check_external_pointer_large since non-pow2 reservations are tighter than the next pow2. - sizeclass/: deterministic round_size gate over every large class (S maps to itself; S_prev+1 ceils to S). - large_offset/: backend test now passes the chunk-multiple reserve (= sizeclass_full_to_size(sc)) instead of next_pow2(size). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/snmalloc/backend/backend.h | 21 +- src/snmalloc/ds/sizeclasstable.h | 35 ++-- src/snmalloc/mem/corealloc.h | 9 +- src/test/func/large_offset/large_offset.cc | 26 +-- .../large_offset_frontend.cc | 192 ++++++++++++++++++ src/test/func/memory/memory.cc | 53 ++++- src/test/func/sizeclass/sizeclass.cc | 78 +++++-- 7 files changed, 365 insertions(+), 49 deletions(-) create mode 100644 src/test/func/large_offset_frontend/large_offset_frontend.cc diff --git a/src/snmalloc/backend/backend.h b/src/snmalloc/backend/backend.h index 80ff58da8..2fcdf2a57 100644 --- a/src/snmalloc/backend/backend.h +++ b/src/snmalloc/backend/backend.h @@ -92,7 +92,15 @@ namespace snmalloc uintptr_t ras, sizeclass_t sizeclass) { - SNMALLOC_ASSERT(bits::is_pow2(size)); + // `size` must be a positive multiple of the sizeclass's slab + // tile size: the pagemap loop below writes one entry per + // `slab_size` stride and must terminate exactly at `size`. + // Front-end callers satisfy this by construction because they + // pass `sizeclass_full_to_size(sizeclass)`, whose largest pow2 + // divisor is `sizeclass_full_to_slab_size(sizeclass)`. + const size_t slab_size = sizeclass_full_to_slab_size(sizeclass); + SNMALLOC_ASSERT(size >= slab_size); + SNMALLOC_ASSERT((size & (slab_size - 1)) == 0); SNMALLOC_ASSERT(size >= MIN_CHUNK_SIZE); // Calculate the extra bytes required to store the client meta-data. 
@@ -128,12 +136,11 @@ namespace snmalloc return {nullptr, nullptr}; } - const size_t slab_size = sizeclass_full_to_slab_size(sizeclass); - // `size` and `slab_size` are powers of two with `size >= slab_size`, - // so `size = k * slab_size` for some integer `k >= 1`. Each slab - // tile gets the same `ras_in | (slab_index << SIZECLASS_BITS)` - // entry, written in one `set_metaentry` call. - SNMALLOC_ASSERT(size >= slab_size); + // `slab_size` was computed and asserted against `size` at the + // top of `alloc_chunk`. `size = k * slab_size` for some integer + // `k >= 1`; each slab tile gets the same + // `ras | (slab_index << SIZECLASS_BITS)` entry, written in one + // `set_metaentry` call. // The OR below assumes the per-chunk-offset bits of `ras` are // zero; `MetaEntryBase::encode` defaults offset to 0, and the // backend is the only place per-chunk offsets are written. diff --git a/src/snmalloc/ds/sizeclasstable.h b/src/snmalloc/ds/sizeclasstable.h index ae4c0df3a..df173bef5 100644 --- a/src/snmalloc/ds/sizeclasstable.h +++ b/src/snmalloc/ds/sizeclasstable.h @@ -434,7 +434,11 @@ namespace snmalloc static_assert( bits::BITS - sizeclass_metadata.DIV_MULT_SHIFT <= MAX_CAPACITY_BITS); - // Largest slab index for any large class: `OFFSET_BITS` must cover it. + // Largest slab index for any large class: `OFFSET_BITS` must cover + // it. Each large allocation reserves exactly `meta.size` bytes (a + // positive multiple of `slab_size`), so the largest `slab_index` + // the pagemap loop in `Backend::alloc_chunk` writes is + // `meta.size / slab_size - 1`. 
constexpr size_t compute_max_large_slab_index() { size_t max_idx = 0; @@ -443,8 +447,7 @@ namespace snmalloc const auto& meta = sizeclass_metadata.start(sizeclass_t::from_large_class(lc)); const size_t slab_size = meta.slab_mask + 1; - const size_t reserve = bits::next_pow2_const(meta.size); - const size_t idx = (reserve / slab_size) - 1; + const size_t idx = (meta.size / slab_size) - 1; if (idx > max_idx) max_idx = idx; } @@ -612,11 +615,6 @@ namespace snmalloc return is_start_of_object(osc.sizeclass(), addr); } - inline static size_t large_size_to_chunk_size(size_t size) - { - return bits::next_pow2(size); - } - constexpr SNMALLOC_PURE size_t sizeclass_lookup_index(const size_t s) { // We subtract and shift to reduce the size of the table, i.e. we don't have @@ -690,8 +688,13 @@ namespace snmalloc } /** - * Map a requested size to its sizeclass. Large requests are rounded up - * to the next power of two. + * Map a requested size to its sizeclass. + * + * Small requests use the dense lookup table. Large requests are + * encoded with `to_exp_mant`, + * whose ceil semantic (`v = v - 1; ...`) selects the smallest + * sizeclass whose size is `>= size`. The raw `size` is passed in + * directly — the encoding does the rounding. */ static inline sizeclass_t size_to_sizeclass_full(size_t size) { @@ -701,9 +704,8 @@ namespace snmalloc } SNMALLOC_ASSERT(size != 0); SNMALLOC_ASSERT(size <= MAX_LARGE_SIZECLASS_SIZE); - size_t pow2 = bits::next_pow2(size); size_t global = - bits::to_exp_mant(pow2); + bits::to_exp_mant(size); return sizeclass_t::from_large_class(global - NUM_SMALL_SIZECLASSES); } @@ -730,7 +732,14 @@ namespace snmalloc // failed allocation later. return size; } - return bits::next_pow2(size); + // Large branch: round to the smallest enclosing exp+mantissa + // sizeclass. Must agree with `round_size`'s small-class branch in + // semantics: every request rounds to the smallest enclosing + // class. 
`DefaultConts::success` (corealloc.h) uses `round_size` + // to compute the `calloc` zeroing range, so any drift between + // the actual reservation and `round_size` would over- or + // under-zero. + return sizeclass_full_to_size(size_to_sizeclass_full(size)); } /// Returns the alignment that this size naturally has, that is diff --git a/src/snmalloc/mem/corealloc.h b/src/snmalloc/mem/corealloc.h index fa5f1389c..942b7f514 100644 --- a/src/snmalloc/mem/corealloc.h +++ b/src/snmalloc/mem/corealloc.h @@ -720,12 +720,13 @@ namespace snmalloc // Grab slab of correct size // Set remote as large allocator remote. + const auto sc = size_to_sizeclass_full(size); + const size_t chunk_size = sizeclass_full_to_size(sc); auto [chunk, meta] = Config::Backend::alloc_chunk( self->get_backend_local_state(), - large_size_to_chunk_size(size), - PagemapEntry::encode( - self->public_state(), size_to_sizeclass_full(size)), - size_to_sizeclass_full(size)); + chunk_size, + PagemapEntry::encode(self->public_state(), sc), + sc); #ifdef SNMALLOC_TRACING message<1024>( diff --git a/src/test/func/large_offset/large_offset.cc b/src/test/func/large_offset/large_offset.cc index e7c45c246..d89bf03ba 100644 --- a/src/test/func/large_offset/large_offset.cc +++ b/src/test/func/large_offset/large_offset.cc @@ -1,22 +1,21 @@ /** - * Targeted test for the per-chunk pagemap offset write path in - * `BackendAllocator::alloc_chunk`. + * Backend-API counterpart of `large_offset_frontend` for the per-chunk + * pagemap offset write path in `BackendAllocator::alloc_chunk`. * - * The front end currently only issues pow2 large requests (the - * `slab_size >= size` fast path), so the multi-slab-tile branch in - * `alloc_chunk` writing per-chunk offsets is otherwise unreachable - * from the in-tree allocation paths. This test reaches it via the - * public backend API. 
+ * This test pins the contract at the *backend* boundary + * (`Config::Backend::alloc_chunk` / `dealloc_chunk`) so it holds + * independently of any front-end path: a non-pow2 large allocation + * spans multiple slab tiles, and `alloc_chunk` writes a per-chunk + * pagemap entry whose offset bits encode the slab index. * * Method: * - Pick a non-pow2 large sizeclass `sc` whose * `sizeclass_full_to_slab_size(sc) < sizeclass_full_to_size(sc)`, * so the multi-slab-tile branch triggers. - * - Compute the pow2 reservation `next_pow2(size)` (the size - * `alloc_chunk` asserts). - * - Call `Config::Backend::alloc_chunk` directly with that pow2 size + * - Call `Config::Backend::alloc_chunk` directly with + * `sizeclass_full_to_size(sc)` (the chunk-multiple reservation) * and the non-pow2 sc. - * - For each chunk in the pow2 region verify the pagemap entry's + * - For each chunk in the region verify the pagemap entry's * `get_offset_and_sizeclass()` decomposes into the expected * (sc, slab_index) pair. * - For sampled interior addresses verify that @@ -84,7 +83,10 @@ namespace } const size_t size = sizeclass_full_to_size(sc); const size_t slab_size = sizeclass_full_to_slab_size(sc); - const size_t reserve = bits::next_pow2(size); + // The chunk-multiple reservation: the backend precondition is + // that `size` is a positive multiple of `slab_size`, satisfied + // here by passing the exact sizeclass size. + const size_t reserve = size; std::cout << "non-pow2 sc raw=" << sc.raw() << " size=" << size << " slab_size=" << slab_size << " reserve=" << reserve diff --git a/src/test/func/large_offset_frontend/large_offset_frontend.cc b/src/test/func/large_offset_frontend/large_offset_frontend.cc new file mode 100644 index 000000000..4b4fd7948 --- /dev/null +++ b/src/test/func/large_offset_frontend/large_offset_frontend.cc @@ -0,0 +1,192 @@ +/** + * Front-end counterpart to `src/test/func/large_offset/`. 
+ * + * The front-end allocates non-pow2 large allocations directly: + * `malloc(80 KiB)` reserves exactly 80 KiB (a sizeclass boundary) + * rather than rounding up to the next power of two. This test + * exercises the resulting per-chunk pagemap state via the public + * recovery API (`external_pointer`, `remaining_bytes`). + * + * `large_offset.cc` covers the same ground at the backend boundary + * (`Config::Backend::alloc_chunk` / `dealloc_chunk`), so the + * per-chunk contract is gated independently of any front-end path. + * This test gates that the front-end actually produces such + * allocations. + * + * Two sets of checks: + * + * 1. Pure table-level round-tripping over every large sizeclass: + * `size_to_sizeclass_full(sizeclass_full_to_size(sc)) == sc`. + * No allocation. Cheap and exhaustive. + * + * 2. End-to-end on a bounded set of representative sizeclasses + * (the smallest non-pow2 large class, plus a non-boundary + * request whose smallest enclosing class is non-pow2): allocate + * via the public front-end API, walk every chunk-aligned + * interior pointer in the logical allocation, assert + * `external_pointer` recovers the base and + * `remaining_bytes` reports the expected residual. + */ + +#include "test/setup.h" + +#include +#include + +#ifdef assert +# undef assert +#endif +#define assert please_use_SNMALLOC_ASSERT + +using namespace snmalloc; + +namespace +{ + bool any_failures = false; + + void fail(const char* msg) + { + std::cout << "FAIL: " << msg << std::endl; + any_failures = true; + } + + /** + * For every representable large sizeclass `sc`, check that the + * sizeclass encoding round-trips: a request of exactly + * `sizeclass_full_to_size(sc)` maps back to `sc`. Failure here is + * a pure table-encoding bug and is independent of any allocation. 
+ */ + void test_roundtrip_all_large() + { + for (size_t lc = 0; lc < NUM_LARGE_CLASSES; lc++) + { + sizeclass_t sc = sizeclass_t::from_large_class(lc); + size_t S = sizeclass_full_to_size(sc); + sizeclass_t sc2 = size_to_sizeclass_full(S); + if (!(sc2 == sc)) + { + std::cout << "Round-trip fail: lc=" << lc << " S=" << S + << " sc.raw=" << sc.raw() << " sc2.raw=" << sc2.raw() + << std::endl; + fail("round-trip"); + } + } + } + + /** + * Allocate `request` via the public front-end, then walk every + * `MIN_CHUNK_SIZE`-aligned interior address and verify pointer + * recovery. `expected_reserve` is the reservation the allocator + * should produce (the smallest enclosing sizeclass size). + */ + void test_alloc_chunkwalk(size_t request, size_t expected_reserve) + { + void* p = snmalloc::libc::malloc(request); + if (p == nullptr) + { + fail("malloc returned null"); + return; + } + + const size_t usable = snmalloc::alloc_size(p); + if (usable != expected_reserve) + { + std::cout << "alloc_size mismatch: request=" << request + << " usable=" << usable << " expected=" << expected_reserve + << std::endl; + fail("alloc_size != expected reserve"); + } + + // Use the `Start` pointer recovery as the start-of-object check + // (no `libc::is_start_of_object`): `external_pointer(p)` + // returning `p` itself is the same property. + + for (size_t off = 0; off < usable; off += MIN_CHUNK_SIZE) + { + void* interior = pointer_offset(p, off); + void* base = snmalloc::external_pointer(interior); + if (base != p) + { + std::cout << "external_pointer(p + " << off << ") = " << base + << " expected " << p << std::endl; + fail("external_pointer mismatch"); + } + size_t rem = snmalloc::remaining_bytes(interior); + if (rem != usable - off) + { + std::cout << "remaining_bytes(p + " << off << ") = " << rem + << " expected " << usable - off << std::endl; + fail("remaining_bytes mismatch"); + } + } + + snmalloc::libc::free(p); + } + + /** + * Find a non-pow2 large sizeclass to exercise. 
Returns the + * sentinel `sizeclass_t{}` if none exists (e.g. INTERMEDIATE_BITS + * == 0, all classes are pow2). + */ + sizeclass_t find_non_pow2_large_sc() + { + for (size_t lc = 0; lc < NUM_LARGE_CLASSES; lc++) + { + sizeclass_t sc = sizeclass_t::from_large_class(lc); + size_t S = sizeclass_full_to_size(sc); + if (!bits::is_pow2(S)) + return sc; + } + return sizeclass_t{}; + } + + void test_end_to_end() + { + sizeclass_t sc = find_non_pow2_large_sc(); + if (sc.raw() == 0) + { + std::cout + << "No non-pow2 large sizeclass available (INTERMEDIATE_BITS == 0?); " + "skipping end-to-end test." + << std::endl; + return; + } + + const size_t S = sizeclass_full_to_size(sc); + + // Boundary request: ask for exactly the class size. + test_alloc_chunkwalk(S, S); + + // Non-boundary request: ask for (S_prev + 1) to land at S via + // the ceil encoding. S_prev is the previous class's size; if sc + // is the very first large class, fall back to MAX_SMALL+1. + size_t S_prev; + if (sc.as_large() == 0) + { + S_prev = MAX_SMALL_SIZECLASS_SIZE; + } + else + { + S_prev = sizeclass_full_to_size( + sizeclass_t::from_large_class(sc.as_large() - 1)); + } + if (S_prev + 1 < S) + { + test_alloc_chunkwalk(S_prev + 1, S); + } + } +} // namespace + +int main() +{ + setup(); + test_roundtrip_all_large(); + test_end_to_end(); + if (any_failures) + { + std::cout << "FAILED" << std::endl; + return 1; + } + std::cout << "PASSED" << std::endl; + return 0; +} diff --git a/src/test/func/memory/memory.cc b/src/test/func/memory/memory.cc index 253628282..6be2865a8 100644 --- a/src/test/func/memory/memory.cc +++ b/src/test/func/memory/memory.cc @@ -307,12 +307,19 @@ void check_offset(void* base, void* interior) void check_external_pointer_large(size_t* base) { + // Probe `__malloc_start_pointer` at both ends of each 16 MiB + // stride within the allocation. The allocation size is recorded in + // the first word of the allocation itself. 
The end-of-stride probe + // is clamped to the last byte of the allocation. size_t size = *base; char* curr = (char*)base; for (size_t offset = 0; offset < size; offset += 1 << 24) { check_offset(base, (void*)(curr + offset)); - check_offset(base, (void*)(curr + offset + (1 << 24) - 1)); + size_t end = offset + (1 << 24) - 1; + if (end >= size) + end = size - 1; + check_offset(base, (void*)(curr + end)); } } @@ -439,6 +446,49 @@ void test_calloc_large_bug() snmalloc::dealloc(p1); } +/** + * `calloc` zeroing must cover exactly the reservation `round_size` + * reports — no more, no less. For a large request that lands in a + * non-pow2 sizeclass, the reservation is tighter than the next pow2, + * so a stray `next_pow2`-sized zeroing loop would overshoot into + * backend free range. This test allocates such a non-pow2 large + * request and verifies (a) the usable size is strictly less than the + * next pow2, and (b) every byte of the visible allocation is zero. + * + * Note: an overshoot may not fault — the deterministic gate for the + * `round_size` contract lives in the sizeclass test. + */ +void test_calloc_non_pow2_large() +{ + if constexpr (snmalloc::INTERMEDIATE_BITS == 0) + { + // All sizeclasses are powers of two in this configuration, so + // there is no non-pow2 large request to test. + std::cout << "INTERMEDIATE_BITS == 0: all sizeclasses pow2; skipping." + << std::endl; + return; + } + + // 2.5 * MAX_SMALL_SIZECLASS_SIZE: definitely large, definitely not + // a power of two, and (with INTERMEDIATE_BITS >= 1) the smallest + // enclosing sizeclass is strictly less than the next pow2 above. 
+ const size_t mss = size_t{1} << snmalloc::max_small_sizeclass_bits(); + const size_t request = (mss << 1) + (mss >> 1); + const size_t next_pow2 = snmalloc::bits::next_pow2(request); + + void* p = snmalloc::alloc(request); + SNMALLOC_CHECK(p != nullptr); + const size_t usable = snmalloc::alloc_size(p); + SNMALLOC_CHECK(usable >= request); + SNMALLOC_CHECK(usable < next_pow2); + auto* bytes = static_cast(p); + for (size_t i = 0; i < usable; i++) + { + SNMALLOC_CHECK(bytes[i] == 0); + } + snmalloc::dealloc(p); +} + template void test_static_sized_alloc() { @@ -589,6 +639,7 @@ int main(int, char**) TEST(test_external_pointer); TEST(test_alloc_16M); TEST(test_calloc_16M); + TEST(test_calloc_non_pow2_large); TEST(test_consolidaton_bug); std::cout << "Tests completeed successfully!" << std::endl; diff --git a/src/test/func/sizeclass/sizeclass.cc b/src/test/func/sizeclass/sizeclass.cc index 093b17424..0b0c73eb3 100644 --- a/src/test/func/sizeclass/sizeclass.cc +++ b/src/test/func/sizeclass/sizeclass.cc @@ -140,11 +140,12 @@ void test_uniform_large_sizeclasses() prev_size = size; } - // Round-trip identity on pow2 large sizes in Phase 13: every pow2 size - // S in [MAX_SMALL_SIZECLASS_SIZE * 2, MAX_LARGE_SIZECLASS_SIZE] must satisfy - // sizeclass_full_to_size(size_to_sizeclass_full(S)) == S. Bound the loop by - // ENCODED_ADDRESS_BITS so `bits::one_at_bit(bits)` never shifts by >= BITS - // (the bound check itself would fail on 32-bit otherwise). + // Round-trip identity on pow2 large sizes: every pow2 size S in + // [MAX_SMALL_SIZECLASS_SIZE * 2, MAX_LARGE_SIZECLASS_SIZE] must + // satisfy sizeclass_full_to_size(size_to_sizeclass_full(S)) == S. + // Bound the loop by ENCODED_ADDRESS_BITS so `bits::one_at_bit(b)` + // never shifts by >= BITS (the bound check itself would fail on + // 32-bit otherwise). 
for (size_t b = MAX_SMALL_SIZECLASS_BITS + 1; b <= ENCODED_ADDRESS_BITS; b++) { size_t S = bits::one_at_bit(b); @@ -157,21 +158,74 @@ void test_uniform_large_sizeclasses() failed = true; } - // For every non-pow2 size X strictly between adjacent pow2 [P, 2P), the - // result must round up to 2P (pow2 rounding still in force in Phase 13). - // Only check when 2P is still representable. + // For every non-pow2 size X strictly between adjacent pow2 [P, 2P), + // `size_to_sizeclass_full(X)` must select the smallest sizeclass + // whose size is >= X. Compute the expected sizeclass independently + // by scanning all large classes. Only check when 2P is still + // representable. if (b < ENCODED_ADDRESS_BITS) { size_t mid = S + (S >> 1); sizeclass_t sc_mid = size_to_sizeclass_full(mid); size_t rs_mid = sizeclass_full_to_size(sc_mid); - size_t expect = bits::one_at_bit(b + 1); - if (rs_mid != expect) + + // Independent computation: smallest large class size >= mid. + size_t expect = 0; + for (size_t lc = 0; lc < NUM_LARGE_CLASSES; lc++) + { + size_t sz = sizeclass_full_to_size(sizeclass_t::from_large_class(lc)); + if (sz >= mid) + { + expect = sz; + break; + } + } + if (expect == 0) { - std::cout << "Non-pow2 should round to next pow2: X=" << mid - << " round=" << rs_mid << " expected=" << expect << std::endl; + std::cout << "No large class >= mid=" << mid << std::endl; failed = true; } + else if (rs_mid != expect) + { + std::cout << "Non-pow2 should round to smallest enclosing class: X=" + << mid << " round=" << rs_mid << " expected=" << expect + << std::endl; + failed = true; + } + } + } + + // `round_size` contract: for every representable large class size + // S, `round_size(S) == S` and `round_size(S_prev + 1) == S` (the + // smallest enclosing class). `DefaultConts::success` (corealloc.h) + // uses `round_size` to size the `calloc` zeroing range, so any + // drift here would over- or under-zero. 
This is the deterministic + // gate for that contract; the `calloc` smoke test in `memory.cc` + // would not necessarily fault on an overshoot into backend free + // range. + { + size_t prev = 0; + for (size_t lc = 0; lc < NUM_LARGE_CLASSES; lc++) + { + size_t S = sizeclass_full_to_size(sizeclass_t::from_large_class(lc)); + if (round_size(S) != S) + { + std::cout << "round_size identity failed at large class: S=" << S + << " round_size=" << round_size(S) << std::endl; + failed = true; + } + if (prev != 0 && prev + 1 < S) + { + size_t probe = prev + 1; + if (round_size(probe) != S) + { + std::cout << "round_size(prev+1) blow-up: probe=" << probe + << " round_size=" << round_size(probe) << " expected=" << S + << std::endl; + failed = true; + } + } + prev = S; } } From af980c05e5f24ce4f65cf3f32ad45f35924456b8 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Mon, 8 Jun 2026 21:12:31 +0100 Subject: [PATCH 12/15] Add non-pow2 metadata sub-allocator: InplaceRep + SmallArenaRange MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces the building blocks for the SmallBuddyRange -> SmallArenaRange migration. Nothing is wired into the production pipeline yet (the existing SmallBuddyRange remains the LocalMetaRange) — this commit only adds the new components and their gate test. * InplaceRep: in-band red-black-tree node Rep for Arena that stores the tree pointers inside the free block itself. Supports CHERI provenance via the Authmap mechanism (the same write-once cap table used by dealloc_meta_data); node accesses go through Authmap::amplify_from_address. can_consolidate refuses merging across MIN_CHUNK_SIZE boundaries to keep Arena's MAX_SIZE_BITS == MIN_CHUNK_BITS invariant intact. * SmallArenaRange::Type: a wrapper around Arena, MIN_BITS, MIN_CHUNK_BITS> presenting the standard Range interface. Serves arbitrarily-unit-aligned sizes (not just powers of two). 
Replaces the historical alloc_range_with_leftover with alloc_size_with_align(size, align), which makes alignment an explicit parameter and donates the unit-aligned tail back to the arena. * amplify_from_address(address_t) on DummyAuthmap (pass-through reinterpret_cast) and BasicAuthmap (lookup + pointer_offset). Lets InplaceRep recover an arena cap for an address it knows only as an integer. * New test target smallarenarange covering the rep accessor round-trips, arena add/remove/consolidation/carve, a 30-seed x 500-op stress, the can_consolidate chunk-boundary refusal, and four alloc_size_with_align scenarios (exact fit, pow2 align over non-pow2 size, align larger than size, MIN_CHUNK_SIZE bypass). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- CMakeLists.txt | 2 +- src/snmalloc/backend_helpers/authmap.h | 30 + .../backend_helpers/backend_helpers.h | 1 + src/snmalloc/backend_helpers/inplacerep.h | 278 +++++++ .../backend_helpers/smallarenarange.h | 166 ++++ .../func/smallarenarange/smallarenarange.cc | 765 ++++++++++++++++++ 6 files changed, 1241 insertions(+), 1 deletion(-) create mode 100644 src/snmalloc/backend_helpers/inplacerep.h create mode 100644 src/snmalloc/backend_helpers/smallarenarange.h create mode 100644 src/test/func/smallarenarange/smallarenarange.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index bbe6eeabc..7bed03128 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -548,7 +548,7 @@ if(NOT SNMALLOC_HEADER_ONLY_LIBRARY) # These are mitigation-independent and can be compiled once, then linked # against both fast and check testlib variants. 
set(TESTLIB_ONLY_TESTS - aligned_dealloc arena arenabins largearenarange + aligned_dealloc arena arenabins largearenarange smallarenarange bits first_operation memory memory_usage multi_atexit multi_threadatexit redblack statistics teardown contention external_pointer large_alloc lotsofthreads post_teardown diff --git a/src/snmalloc/backend_helpers/authmap.h b/src/snmalloc/backend_helpers/authmap.h index e2a00085b..c0ad74258 100644 --- a/src/snmalloc/backend_helpers/authmap.h +++ b/src/snmalloc/backend_helpers/authmap.h @@ -23,6 +23,19 @@ namespace snmalloc { return capptr::Arena::unsafe_from(c.unsafe_ptr()); } + + /** + * Address-keyed sibling of `amplify`: returns a capability with + * address `a` and (on real capability hardware) the registered + * arena's permissions. The non-StrictProvenance pass-through + * variant simply fabricates a pointer at `a`. + */ + template + static SNMALLOC_FAST_PATH capptr::Arena + amplify_from_address(address_t a) + { + return capptr::Arena::unsafe_from(reinterpret_cast(a)); + } }; /** @@ -67,6 +80,23 @@ namespace snmalloc concreteAuthmap.template get(address_cast(c)), c); } + + /** + * Address-keyed sibling of `amplify`: returns a capability at + * address `a` with the registered arena's permissions, suitable + * for cases where the caller holds only an integer address (for + * example, in-band tree-node access in `InplaceRep`). The + * authmap is set once per arena registration and never mutated + * thereafter, so this lookup is safe under concurrent allocator + * activity. 
+ */ + template + static SNMALLOC_FAST_PATH capptr::Arena + amplify_from_address(address_t a) + { + auto arena = concreteAuthmap.template get(a); + return pointer_offset(arena, a - address_cast(arena)); + } }; /** diff --git a/src/snmalloc/backend_helpers/backend_helpers.h b/src/snmalloc/backend_helpers/backend_helpers.h index 5311499df..5b75bcdbc 100644 --- a/src/snmalloc/backend_helpers/backend_helpers.h +++ b/src/snmalloc/backend_helpers/backend_helpers.h @@ -17,6 +17,7 @@ #include "pagemapregisterrange.h" #include "palrange.h" #include "range_helpers.h" +#include "smallarenarange.h" #include "smallbuddyrange.h" #include "staticconditionalrange.h" #include "statsrange.h" diff --git a/src/snmalloc/backend_helpers/inplacerep.h b/src/snmalloc/backend_helpers/inplacerep.h new file mode 100644 index 000000000..3aacfb410 --- /dev/null +++ b/src/snmalloc/backend_helpers/inplacerep.h @@ -0,0 +1,278 @@ +#pragma once + +#include "../ds_core/bits.h" +#include "../ds_core/defines.h" +#include "../ds_core/sizeclassconfig.h" +#include "arena.h" + +#include + +namespace snmalloc +{ + /** + * In-band tree node stored at the head of a free block managed by + * `Arena`. Two pointer-sized words per unit; bit-packing of + * red and variant tags lives in `word_one`. Stored as `uintptr_t` + * so we can OR meta bits into the pointer slot without UB on + * non-capability platforms (on CHERI, capabilities to access these + * words are re-derived from the `Authmap` — see `InplaceRep`). + */ + template + struct InplaceNode + { + uintptr_t word_one; + uintptr_t word_two; + }; + + /** + * In-band `Rep` for `Arena`. Each free block carries its + * own tree-node and metadata storage in its first few units: + * + * Unit 0 (addr): bin-tree node + variant tag. + * Unit 1 (addr + UNIT_SIZE): range-tree node (size >= 2 units). + * Unit 2 (addr + 2*UNIT_SIZE): large-size word (size >= 3 units). 
+ * + * Bit layout in `word_one` of each unit: + * bit 0 : red bit (both trees) + * bits 1..2 : variant tag (`ArenaVariant`, unit 0 only) + * `word_two` holds the second child pointer with no packed meta. + * Both child pointers are unit-aligned, so their low `MIN_BITS` + * bits are zero — the packed meta occupies bits below + * `1 << MIN_BITS` and never collides with a stored pointer value. + * + * `MIN_BITS = next_pow2_bits_const(sizeof(InplaceNode))`: the + * smallest free block must hold one tree node, so the unit IS the + * node footprint rounded up. + * + * CHERI: in-band storage is accessed via + * `Authmap::amplify_from_address(addr)`, which returns a + * capability at `addr` with the registered arena's permissions. + * The authmap is set once per arena registration and never + * mutated, so this lookup carries no concurrency hazard. On + * non-CHERI platforms the authmap is the pass-through + * `DummyAuthmap` and the cap collapses to a raw pointer. + */ + template + class InplaceRep + { + public: + static constexpr size_t MIN_BITS = + bits::next_pow2_bits_const(sizeof(InplaceNode)); + static constexpr size_t UNIT_SIZE = size_t(1) << MIN_BITS; + + // 3 meta bits (variant 2 + red 1) packed below the unit + // alignment boundary. Block addresses are UNIT_SIZE-aligned, so + // a value v with `(v & (UNIT_SIZE - 1)) == 0` writes the + // pointer cleanly without touching meta. 
+ static_assert(MIN_BITS >= 3, "Need 3 low bits for red+variant packing"); + static_assert(MIN_BITS < MIN_CHUNK_BITS, "Arena needs a non-trivial range"); + static_assert( + MIN_ALLOC_SIZE >= (size_t(1) << MIN_BITS), + "Front-end minimum allocation must be >= in-band unit size; " + "otherwise a free block cannot hold the tree node."); + + static constexpr uintptr_t RED_BIT = 1; + static constexpr unsigned VARIANT_SHIFT = 1; + static constexpr unsigned VARIANT_BITS = 2; + static constexpr uintptr_t VARIANT_MASK = + ((uintptr_t(1) << VARIANT_BITS) - 1) << VARIANT_SHIFT; + static constexpr uintptr_t BIN_META_MASK = RED_BIT | VARIANT_MASK; + static constexpr uintptr_t RANGE_META_MASK = RED_BIT; + + static_assert(BIN_META_MASK < UNIT_SIZE); + + /** + * Wraps a `uintptr_t*` storage slot plus the meta-bit mask that + * this slot owns. `get()` returns the slot value with meta bits + * cleared; assignment preserves them. Mirrors the role of + * `BackendStateWordRef` but with an inline mask field (we own + * the only mask here, unlike `BackendStateWordRef` which layers + * on top of the frontend-reserved mask). + */ + class Handle + { + uintptr_t* val{nullptr}; + uintptr_t mask{0}; + + public: + constexpr Handle() = default; + + constexpr Handle(uintptr_t* v, uintptr_t m) : val(v), mask(m) {} + + /** + * Single-pointer constructor required by the `RBRepMethods` + * concept (`ds_core/redblacktree.h:64-67`) for sentinel + * construction from `&Rep::root`. The tree's root field + * carries no meta bits, so mask defaults to zero. 
+ */ + constexpr Handle(uintptr_t* v) : val(v) {} + + [[nodiscard]] uintptr_t get() const + { + return *val & ~mask; + } + + Handle& operator=(uintptr_t v) + { + SNMALLOC_ASSERT((v & mask) == 0); + *val = v | (*val & mask); + return *this; + } + + bool operator!=(const Handle& other) const + { + return val != other.val; + } + + uintptr_t printable_address() const + { + return reinterpret_cast(val); + } + }; + + private: + template + static InplaceNode* unit_at(uintptr_t addr) + { + auto cap = Authmap::amplify_from_address(addr + UnitIdx * UNIT_SIZE); + return static_cast*>(cap.unsafe_ptr()); + } + + /** + * Tree rep shared by `BinRep` and `RangeRep`. `UnitIdx` is the + * block-relative unit (0 or 1) that holds this rep's node; + * `MetaMask` covers the bits in that unit's `word_one` owned + * by this rep (red + variant for `BinRep`, red only for + * `RangeRep`) and is preserved across `set`. + * + * Convention (mirrors `PagemapRep`): direction `true` selects + * `word_one` (the meta-bearing word); direction `false` + * selects `word_two`. + */ + template + struct TreeRep + { + using Handle = InplaceRep::Handle; + using Contents = uintptr_t; + + static constexpr Contents null = 0; + static constexpr Contents root = 0; + + static Handle ref(bool direction, Contents k) + { + // Sentinel handle for the null key, mirroring + // `PagemapRep::TreeRep::ref`. Reads return 0; writes are + // disallowed by the tree's algorithm but the storage is + // still backing in case of accidental writes during + // debugging. + static uintptr_t null_entry = 0; + if (SNMALLOC_UNLIKELY(k == 0)) + return Handle{&null_entry, 0}; + auto* node = unit_at(k); + return direction ? 
Handle{&node->word_one, MetaMask} : + Handle{&node->word_two, 0}; + } + + static Contents get(Handle h) + { + return h.get(); + } + + static void set(Handle h, Contents v) + { + h = v; + } + + static bool is_red(Contents k) + { + if (k == 0) + return false; + return (unit_at(k)->word_one & RED_BIT) != 0; + } + + static void set_red(Contents k, bool new_is_red) + { + auto* w = &unit_at(k)->word_one; + if (((*w & RED_BIT) != 0) != new_is_red) + *w ^= RED_BIT; + SNMALLOC_ASSERT(is_red(k) == new_is_red); + } + + static bool compare(Contents k1, Contents k2) + { + return k1 > k2; + } + + static bool equal(Contents k1, Contents k2) + { + return k1 == k2; + } + + static uintptr_t printable(Contents k) + { + return k; + } + + static uintptr_t printable(Handle h) + { + return h.printable_address(); + } + + static const char* name() + { + return Name; + } + }; + + static constexpr char BIN_REP_NAME[] = "InplaceBinRep"; + static constexpr char RANGE_REP_NAME[] = "InplaceRangeRep"; + + public: + using BinRep = TreeRep<0, BIN_META_MASK, BIN_REP_NAME>; + using RangeRep = TreeRep<1, RANGE_META_MASK, RANGE_REP_NAME>; + + static ArenaVariant get_variant(uintptr_t addr) + { + auto w = unit_at<0>(addr)->word_one; + return static_cast((w & VARIANT_MASK) >> VARIANT_SHIFT); + } + + static void set_variant(uintptr_t addr, ArenaVariant v) + { + auto* w = &unit_at<0>(addr)->word_one; + *w = (*w & ~VARIANT_MASK) | (static_cast(v) << VARIANT_SHIFT); + } + + /** + * Exact byte size for `Large` blocks. Stored as a plain + * `uintptr_t` in unit 2's `word_one`; unlike `PagemapRep` we + * do not need to compress (the pagemap word has reserved low + * bits but our in-band word has the full width). 
+ */ + static size_t get_large_size(uintptr_t addr) + { + return static_cast(unit_at<2>(addr)->word_one); + } + + static void set_large_size(uintptr_t addr, size_t size) + { + SNMALLOC_ASSERT((size & (UNIT_SIZE - 1)) == 0); + unit_at<2>(addr)->word_one = static_cast(size); + } + + /** + * Refuse consolidation across `MIN_CHUNK_SIZE` boundaries. + * `SmallArenaRange::add_range_impl` splits incoming ranges at + * chunk boundaries, but does not eagerly merge across them on + * the wrapper side; this check is what stops `Arena` + * from later merging two adjacent intra-chunk fragments that + * happen to abut the same chunk boundary, which would create a + * free block straddling chunks. Chunk-aligned `higher_addr` + * means the lower neighbour ends at a chunk boundary — refuse. + */ + static bool can_consolidate(uintptr_t higher_addr) + { + return (higher_addr & (MIN_CHUNK_SIZE - 1)) != 0; + } + }; +} // namespace snmalloc diff --git a/src/snmalloc/backend_helpers/smallarenarange.h b/src/snmalloc/backend_helpers/smallarenarange.h new file mode 100644 index 000000000..f5820c8f0 --- /dev/null +++ b/src/snmalloc/backend_helpers/smallarenarange.h @@ -0,0 +1,166 @@ +#pragma once + +#include "../pal/pal.h" +#include "arena.h" +#include "empty_range.h" +#include "inplacerep.h" +#include "range_helpers.h" + +namespace snmalloc +{ + /** + * Small-grained range backed by `Arena` with in-band + * (`InplaceRep`) tree-node storage. Serves blocks of any + * unit-aligned size — not restricted to powers of two — for + * `SlabMetadata` allocations. + * + * Each arena instance covers exactly one chunk + * (`MAX_SIZE_BITS = MIN_CHUNK_BITS`): refill takes one chunk + * from the parent, sub-chunk fragments live in the arena, + * consolidated whole chunks flow back to the parent. 
+ */ + template + struct SmallArenaRange + { + template> + class Type : public ContainsParent + { + public: + using ChunkBounds = typename ParentRange::ChunkBounds; + + private: + using ContainsParent::parent; + + using RepT = InplaceRep; + static constexpr size_t MIN_BITS = RepT::MIN_BITS; + + Arena arena; + + public: + static constexpr size_t UNIT_SIZE = RepT::UNIT_SIZE; + + private: + /** + * Split `[base, base+length)` at chunk boundaries. + * Intra-chunk fragments are unit-trimmed and submitted to + * the arena; segments that begin and end chunk-aligned go + * to the parent. Accepts arbitrary unaligned input — + * `dealloc_meta_data` forwards `make()`'s unaligned spare + * here; sub-unit edges are discarded by design. + */ + void add_range_impl(CapPtr base, size_t length) + { + uintptr_t lo = base.unsafe_uintptr(); + uintptr_t hi = lo + length; + + while (lo < hi) + { + uintptr_t chunk_end = bits::align_up(lo + 1, MIN_CHUNK_SIZE); + uintptr_t seg_end = bits::min(hi, chunk_end); + + if ( + lo == bits::align_down(lo, MIN_CHUNK_SIZE) && seg_end == chunk_end) + { + auto chunk_base = CapPtr::unsafe_from( + reinterpret_cast(lo)); + parent.dealloc_range(chunk_base, MIN_CHUNK_SIZE); + } + else + { + uintptr_t f_lo = bits::align_up(lo, UNIT_SIZE); + uintptr_t f_hi = bits::align_down(seg_end, UNIT_SIZE); + if (f_lo < f_hi) + { + auto [ov_a, ov_s] = arena.add_block(f_lo, f_hi - f_lo); + if (ov_a != 0) + { + // Arena consolidated up to MAX_SIZE_BITS = chunk: + // hand the whole-chunk piece back to the parent. 
+ auto ov_base = CapPtr::unsafe_from( + reinterpret_cast(ov_a)); + parent.dealloc_range(ov_base, ov_s); + } + } + } + + lo = seg_end; + } + } + + CapPtr refill(size_t size) + { + auto refill_range = parent.alloc_range(MIN_CHUNK_SIZE); + if (refill_range == nullptr) + return nullptr; + + add_range_impl( + pointer_offset(refill_range, size), MIN_CHUNK_SIZE - size); + + return refill_range; + } + + public: + static constexpr bool Aligned = true; + static_assert(ParentRange::Aligned, "ParentRange must be aligned"); + + static constexpr bool ConcurrencySafe = false; + + constexpr Type() = default; + + CapPtr alloc_range(size_t size) + { + SNMALLOC_ASSERT((size & (UNIT_SIZE - 1)) == 0); + + if (size >= MIN_CHUNK_SIZE) + return parent.alloc_range(size); + + uintptr_t a = arena.remove_block(size); + if (a != 0) + return CapPtr::unsafe_from( + reinterpret_cast(a)); + + return refill(size); + } + + /** + * Allocate `align`-aligned space large enough for `size`, + * donating the unit-aligned tail back to the arena. + * + * Requests `requested = align_up(size, align)` bytes; because + * `align` is pow2 and `requested` is a multiple of `align`, + * `Arena`'s carve returns an `align`-aligned base + * without a caller-side over-allocate-and-trim. The tail + * `[align_up(size, UNIT_SIZE), requested)` is donated via + * `add_range_impl`. The sub-unit slice + * `[size, align_up(size, UNIT_SIZE))` cannot be represented + * and is leaked — pre-round `size` to `UNIT_SIZE` to avoid it. 
+ */ + CapPtr alloc_size_with_align(size_t size, size_t align) + { + SNMALLOC_ASSERT(size > 0); + SNMALLOC_ASSERT(bits::is_pow2(align)); + SNMALLOC_ASSERT(align >= UNIT_SIZE); + SNMALLOC_ASSERT(align <= MIN_CHUNK_SIZE); + + size_t requested = bits::align_up(size, align); + auto p = alloc_range(requested); + if (p == nullptr) + return nullptr; + + size_t used = bits::align_up(size, UNIT_SIZE); + if (used < requested) + { + add_range_impl(pointer_offset(p, used), requested - used); + } + + return p; + } + + // No precondition on `size`: sub-unit edges discarded. + void dealloc_range(CapPtr base, size_t size) + { + add_range_impl(base, size); + } + }; + }; +} // namespace snmalloc diff --git a/src/test/func/smallarenarange/smallarenarange.cc b/src/test/func/smallarenarange/smallarenarange.cc new file mode 100644 index 000000000..47d6b895c --- /dev/null +++ b/src/test/func/smallarenarange/smallarenarange.cc @@ -0,0 +1,765 @@ +/** + * Unit tests for `InplaceRep` exercised through `Arena`. + * + * Distinct from the `arena` test (which uses an array-backed + * MockRep): here the Rep is the in-band representation, + * and each free block's tree-node storage lives at the block's own + * head bytes. The test allocates a single chunk-aligned backing + * buffer and treats addresses within it as block bases. + */ + +#include "test/setup.h" +#include "test/snmalloc_testlib.h" +#include "test/xoroshiro.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace snmalloc +{ + using Rep = InplaceRep; + static constexpr size_t UNIT_SIZE = Rep::UNIT_SIZE; + static constexpr size_t MIN_BITS = Rep::MIN_BITS; + + // Arena spans one chunk's worth of space (max block size = + // MIN_CHUNK_SIZE - UNIT_SIZE, since the arena's MAX is exclusive). 
+ static constexpr size_t MAX_SIZE_BITS = MIN_CHUNK_BITS; + using TestArena = Arena; + + // Backing buffer: must be UNIT_SIZE-aligned so block bases are + // unit-aligned and the in-band node fields land at the expected + // offsets. Sized to comfortably cover the arena's full range plus + // a small base offset that keeps block addresses non-zero (zero + // is the tree null sentinel). Oversized by MIN_CHUNK_SIZE so the + // base can be aligned up at runtime — MSVC rejects alignas values + // as large as MIN_CHUNK_SIZE on static storage. + static unsigned char backing[3 * MIN_CHUNK_SIZE]; + + static uintptr_t base_addr() + { + // Round up to MIN_CHUNK_SIZE, then offset by MIN_CHUNK_SIZE to + // keep addresses well clear of zero. + uintptr_t raw = reinterpret_cast(&backing[0]); + uintptr_t aligned = (raw + MIN_CHUNK_SIZE - 1) & ~(MIN_CHUNK_SIZE - 1); + return aligned + MIN_CHUNK_SIZE; + } + + static void reset_backing() + { + for (size_t i = 0; i < sizeof(backing); i++) + backing[i] = 0; + } + + static uintptr_t unit_addr(size_t unit_idx) + { + return base_addr() + unit_idx * UNIT_SIZE; + } + + static constexpr size_t unit_size(size_t n_units) + { + return n_units * UNIT_SIZE; + } + + // ================================================================== + // (A) Round-trip: variant tag and large-size storage survive + // independent of bin/range pointer writes. + // ================================================================== + + static void test_variant_roundtrip() + { + reset_backing(); + uintptr_t a = unit_addr(0); + + for (auto v : + {ArenaVariant::Min, + ArenaVariant::EvenTwo, + ArenaVariant::OddTwo, + ArenaVariant::Large}) + { + Rep::set_variant(a, v); + SNMALLOC_CHECK(Rep::get_variant(a) == v); + } + + // Variant tag must not interfere with the red bit at bit 0. 
+ Rep::set_variant(a, ArenaVariant::OddTwo); + Rep::BinRep::set_red(a, true); + SNMALLOC_CHECK(Rep::BinRep::is_red(a)); + SNMALLOC_CHECK(Rep::get_variant(a) == ArenaVariant::OddTwo); + + Rep::BinRep::set_red(a, false); + SNMALLOC_CHECK(!Rep::BinRep::is_red(a)); + SNMALLOC_CHECK(Rep::get_variant(a) == ArenaVariant::OddTwo); + + printf(" Variant + red roundtrip: OK\n"); + } + + static void test_large_size_roundtrip() + { + reset_backing(); + uintptr_t a = unit_addr(0); + + for (size_t s : {unit_size(3), unit_size(7), unit_size(17), unit_size(125)}) + { + Rep::set_large_size(a, s); + SNMALLOC_CHECK(Rep::get_large_size(a) == s); + } + + printf(" Large-size roundtrip: OK\n"); + } + + // ================================================================== + // (B) Bin-tree and range-tree red bits live in different units and + // must not alias. + // ================================================================== + + static void test_red_bits_independent() + { + reset_backing(); + uintptr_t a = unit_addr(0); + + Rep::BinRep::set_red(a, true); + Rep::RangeRep::set_red(a, false); + SNMALLOC_CHECK(Rep::BinRep::is_red(a)); + SNMALLOC_CHECK(!Rep::RangeRep::is_red(a)); + + Rep::BinRep::set_red(a, false); + Rep::RangeRep::set_red(a, true); + SNMALLOC_CHECK(!Rep::BinRep::is_red(a)); + SNMALLOC_CHECK(Rep::RangeRep::is_red(a)); + + printf(" Bin/range red bits independent: OK\n"); + } + + // ================================================================== + // (B2) `can_consolidate` refuses chunk-boundary merges. + // SmallArenaRange splits incoming ranges at chunk boundaries, but + // adjacent intra-chunk fragments meeting at a boundary would + // otherwise be merged by Arena. The predicate is what + // prevents that. + // ================================================================== + + static void test_can_consolidate_chunk_boundary() + { + // Chunk-aligned higher_addr means the lower neighbour ends at + // a chunk boundary — refuse. 
+ SNMALLOC_CHECK(!Rep::can_consolidate(MIN_CHUNK_SIZE)); + SNMALLOC_CHECK(!Rep::can_consolidate(2 * MIN_CHUNK_SIZE)); + // Non-chunk-aligned higher_addr is fine to merge. + SNMALLOC_CHECK(Rep::can_consolidate(MIN_CHUNK_SIZE + UNIT_SIZE)); + SNMALLOC_CHECK(Rep::can_consolidate(MIN_CHUNK_SIZE - UNIT_SIZE)); + SNMALLOC_CHECK(Rep::can_consolidate(UNIT_SIZE)); + + printf(" can_consolidate chunk-boundary refuse: OK\n"); + } + + // ================================================================== + // (C) Through the arena: add a single block and remove it. + // ================================================================== + + static void test_arena_add_remove_single() + { + reset_backing(); + TestArena arena; + arena.check_invariant(true); + + auto a = unit_addr(0); + auto [ov_a, ov_s] = arena.add_block(a, unit_size(4)); + SNMALLOC_CHECK(ov_a == 0 && ov_s == 0); + arena.check_invariant(true); + + auto got = arena.remove_block(unit_size(4)); + SNMALLOC_CHECK(got == a); + arena.check_invariant(true); + + printf(" Arena add/remove single: OK\n"); + } + + // ================================================================== + // (D) Consolidation across two adjacent free blocks. + // ================================================================== + + static void test_arena_consolidation() + { + reset_backing(); + TestArena arena; + + auto a = unit_addr(0); + auto b = unit_addr(4); + arena.add_block(a, unit_size(4)); + arena.check_invariant(true); + auto [ov_a, ov_s] = arena.add_block(b, unit_size(4)); + SNMALLOC_CHECK(ov_a == 0 && ov_s == 0); + arena.check_invariant(true); + + // A single 8-unit block should now be removable from the + // consolidated region. + auto got = arena.remove_block(unit_size(8)); + SNMALLOC_CHECK(got == a); + arena.check_invariant(true); + + printf(" Arena consolidation: OK\n"); + } + + // ================================================================== + // (E) Carving: request a smaller size than the free block has. 
+ // ================================================================== + + static void test_arena_carve() + { + reset_backing(); + TestArena arena; + + auto a = unit_addr(0); + arena.add_block(a, unit_size(8)); + arena.check_invariant(true); + + auto got = arena.remove_block(unit_size(3)); + SNMALLOC_CHECK(got != 0); + arena.check_invariant(true); + + // The remainder is still available; total removed should sum to + // 8 units across this and subsequent removes. + size_t total_removed = 3; + while (true) + { + auto r = arena.remove_block(unit_size(1)); + if (r == 0) + break; + total_removed += 1; + arena.check_invariant(true); + } + SNMALLOC_CHECK(total_removed == 8); + + printf(" Arena carve + drain: OK\n"); + } + + // ================================================================== + // (F) Randomised stress: oracle-checked add/remove over a single + // chunk's worth of units. Equivalent to the MockRep stress test in + // shape but operates on real in-band storage. + // ================================================================== + + static constexpr size_t STRESS_UNITS = + (size_t(1) << MAX_SIZE_BITS) / UNIT_SIZE - 1; + + using Bins = ArenaBins<2, MIN_BITS>; + + struct OracleRange + { + size_t addr_units; + size_t size_units; + + bool operator<(const OracleRange& o) const + { + return addr_units < o.addr_units; + } + }; + + // Mirrors the arena's bin-based allocator: classify entries into + // bins, pick the bin via the bitmap's find_for_request, then + // pick the lowest-address entry within that bin and carve. 
+ class Oracle + { + std::set ranges; + + public: + void add(size_t addr_units, size_t size_units) + { + OracleRange key{addr_units, size_units}; + auto it = ranges.lower_bound(key); + + size_t new_addr = addr_units; + size_t new_size = size_units; + + if (it != ranges.end() && it->addr_units == new_addr + new_size) + { + new_size += it->size_units; + it = ranges.erase(it); + } + + if (it != ranges.begin()) + { + auto prev = std::prev(it); + if (prev->addr_units + prev->size_units == new_addr) + { + new_addr = prev->addr_units; + new_size += prev->size_units; + ranges.erase(prev); + } + } + + ranges.insert({new_addr, new_size}); + } + + // Returns {addr_units, len_units} or {0, 0} if nothing fits. + std::pair remove(size_t n_units) + { + size_t n_bytes = n_units * UNIT_SIZE; + if (n_bytes == 0 || n_bytes > Bins::max_supported_size()) + return {0, 0}; + + typename Bins::Bitmap bm{}; + std::map::iterator>> by_bin; + + for (auto it = ranges.begin(); it != ranges.end(); ++it) + { + typename Bins::range_t r{ + unit_addr(it->addr_units), it->size_units * UNIT_SIZE}; + size_t bin = bm.add(r); + by_bin[bin].push_back(it); + } + + size_t bin_id = bm.find_for_request(n_bytes); + if (bin_id == SIZE_MAX) + return {0, 0}; + + auto& entries = by_bin[bin_id]; + auto best_it = entries[0]; + for (size_t i = 1; i < entries.size(); i++) + { + if (entries[i]->addr_units < best_it->addr_units) + best_it = entries[i]; + } + + OracleRange block = *best_it; + ranges.erase(best_it); + + auto carved = Bins::carve( + {unit_addr(block.addr_units), block.size_units * UNIT_SIZE}, n_bytes); + auto base = base_addr(); + if (carved.pre.size != 0) + ranges.insert( + {(carved.pre.base - base) / UNIT_SIZE, carved.pre.size / UNIT_SIZE}); + if (carved.post.size != 0) + ranges.insert( + {(carved.post.base - base) / UNIT_SIZE, + carved.post.size / UNIT_SIZE}); + + return { + (carved.req.base - base) / UNIT_SIZE, carved.req.size / UNIT_SIZE}; + } + }; + + static void test_stress_seed(size_t seed, size_t 
num_ops) + { + reset_backing(); + TestArena arena; + Oracle oracle; + + // All units initially allocated (i.e., not in the arena). + std::vector allocated(STRESS_UNITS, true); + + xoroshiro::p128r64 rng(seed); + + for (size_t op = 0; op < num_ops; op++) + { + bool do_add = (rng.next() % 3) != 0; + + if (do_add) + { + size_t max_size = STRESS_UNITS / 4; + if (max_size < 1) + max_size = 1; + size_t size = (rng.next() % max_size) + 1; + size_t start = rng.next() % STRESS_UNITS; + + bool found = false; + for (size_t try_start = start; try_start < STRESS_UNITS; try_start++) + { + size_t actual = 0; + for (size_t j = try_start; j < STRESS_UNITS && j < try_start + size; + j++) + { + if (!allocated[j]) + break; + actual++; + } + if (actual >= 1) + { + size = actual; + start = try_start; + found = true; + break; + } + } + if (!found) + continue; + + for (size_t j = start; j < start + size; j++) + allocated[j] = false; + + auto result = arena.add_block(unit_addr(start), unit_size(size)); + if (result.first == 0) + oracle.add(start, size); + else + { + // Overflow: arena spilled the consolidated block back to + // the caller. Treat as if everything went back to + // "allocated"; clear the oracle. 
+ for (size_t j = 0; j < STRESS_UNITS; j++) + allocated[j] = true; + oracle = Oracle{}; + } + arena.check_invariant(true); + } + else + { + size_t max_req = STRESS_UNITS / 4; + if (max_req < 1) + max_req = 1; + size_t n = (rng.next() % max_req) + 1; + + auto arena_addr = arena.remove_block(unit_size(n)); + auto [o_start, o_len] = oracle.remove(n); + + if (o_len == 0) + { + SNMALLOC_CHECK(arena_addr == 0); + } + else + { + SNMALLOC_CHECK(arena_addr != 0); + SNMALLOC_CHECK(arena_addr == unit_addr(o_start)); + for (size_t j = o_start; j < o_start + o_len; j++) + allocated[j] = true; + } + arena.check_invariant(true); + } + } + } + + static void test_stress() + { + constexpr size_t NUM_OPS = 500; + constexpr size_t NUM_SEEDS = 30; + for (size_t s = 1; s <= NUM_SEEDS; s++) + test_stress_seed(s, NUM_OPS); + printf(" Stress (%zu seeds x %zu ops): OK\n", NUM_SEEDS, NUM_OPS); + } + + // ================================================================== + // (G) SmallArenaRange — chunk-granularity parent + sub-chunk + // sub-allocations served by the in-band arena. + // ================================================================== + + // Pool of chunk-aligned buffers, handed out as a chunk-granularity + // parent range to SmallArenaRange. Oversized by MIN_CHUNK_SIZE so + // `pool_base()` can align up at runtime — MSVC rejects alignas + // values as large as MIN_CHUNK_SIZE on static storage. + static constexpr size_t POOL_CHUNKS = 8; + static unsigned char pool_storage[(POOL_CHUNKS + 1) * MIN_CHUNK_SIZE]; + static bool pool_in_use[POOL_CHUNKS]; + // Track returns to detect leaks / double-frees. 
+ static size_t pool_alloc_count; + static size_t pool_dealloc_count; + + static unsigned char* pool_base() + { + uintptr_t raw = reinterpret_cast(&pool_storage[0]); + uintptr_t aligned = (raw + MIN_CHUNK_SIZE - 1) & ~(MIN_CHUNK_SIZE - 1); + return reinterpret_cast(aligned); + } + + static void reset_pool() + { + for (size_t i = 0; i < POOL_CHUNKS; i++) + pool_in_use[i] = false; + for (size_t i = 0; i < sizeof(pool_storage); i++) + pool_storage[i] = 0; + pool_alloc_count = 0; + pool_dealloc_count = 0; + } + + class MockParent + { + public: + static constexpr bool Aligned = true; + static constexpr bool ConcurrencySafe = true; + using ChunkBounds = capptr::bounds::Arena; + + constexpr MockParent() = default; + + CapPtr alloc_range(size_t size) + { + SNMALLOC_CHECK(size == MIN_CHUNK_SIZE); + for (size_t i = 0; i < POOL_CHUNKS; i++) + { + if (!pool_in_use[i]) + { + pool_in_use[i] = true; + pool_alloc_count++; + return CapPtr::unsafe_from( + pool_base() + i * MIN_CHUNK_SIZE); + } + } + return nullptr; + } + + void dealloc_range(CapPtr base, size_t size) + { + SNMALLOC_CHECK(size == MIN_CHUNK_SIZE); + auto p = static_cast(base.unsafe_ptr()); + auto idx = static_cast(p - pool_base()) / MIN_CHUNK_SIZE; + SNMALLOC_CHECK(idx < POOL_CHUNKS); + SNMALLOC_CHECK(pool_in_use[idx]); + pool_in_use[idx] = false; + pool_dealloc_count++; + } + }; + + using SmallArena = SmallArenaRange::Type; + + static void test_small_arena_basic() + { + reset_pool(); + SmallArena r; + + // First alloc triggers a refill of one chunk; the rest of the + // chunk is internally available for further sub-allocations. + auto a = r.alloc_range(UNIT_SIZE); + SNMALLOC_CHECK(a != nullptr); + SNMALLOC_CHECK(pool_alloc_count == 1); + + auto b = r.alloc_range(unit_size(3)); + SNMALLOC_CHECK(b != nullptr); + SNMALLOC_CHECK(pool_alloc_count == 1); + + // Non-pow2 size — the whole point of SmallArenaRange. 
+ auto c = r.alloc_range(unit_size(5)); + SNMALLOC_CHECK(c != nullptr); + SNMALLOC_CHECK(pool_alloc_count == 1); + + r.dealloc_range(a, UNIT_SIZE); + r.dealloc_range(b, unit_size(3)); + r.dealloc_range(c, unit_size(5)); + + printf(" SmallArenaRange basic alloc/dealloc: OK\n"); + } + + static void test_small_arena_chunk_pass_through() + { + reset_pool(); + SmallArena r; + + // A chunk-or-larger alloc should pass through to the parent + // without touching the arena. + auto a = r.alloc_range(MIN_CHUNK_SIZE); + SNMALLOC_CHECK(a != nullptr); + SNMALLOC_CHECK(pool_alloc_count == 1); + + r.dealloc_range(a, MIN_CHUNK_SIZE); + SNMALLOC_CHECK(pool_dealloc_count == 1); + + printf(" SmallArenaRange chunk pass-through: OK\n"); + } + + static void test_small_arena_unaligned_dealloc() + { + reset_pool(); + SmallArena r; + + // Get some sub-chunk space populated. + auto a = r.alloc_range(unit_size(4)); + SNMALLOC_CHECK(a != nullptr); + + // Donate an unaligned spare (mirrors make()'s spare-seed + // donation). Length is not unit-aligned; sub-unit edges must + // be silently discarded. + auto unaligned_base = pointer_offset(a, 1); + r.dealloc_range(unaligned_base, unit_size(4) - 1); + + // Should not have leaked chunks to the parent (sub-chunk + // fragments stay in the arena). + SNMALLOC_CHECK(pool_dealloc_count == 0); + + printf(" SmallArenaRange unaligned dealloc: OK\n"); + } + + static void test_small_arena_consolidation_returns_chunk() + { + reset_pool(); + SmallArena r; + + // Fully consume one chunk via small allocs; record the chunk + // base so we can rebuild the full chunk via deallocs. + constexpr size_t N = MIN_CHUNK_SIZE / UNIT_SIZE; + std::vector> ps; + for (size_t i = 0; i < N; i++) + { + auto p = r.alloc_range(UNIT_SIZE); + SNMALLOC_CHECK(p != nullptr); + ps.push_back(p); + } + // We expect at least one refill happened (likely just one, + // since N units == one chunk; but in either case all + // sub-allocs come from the same backing chunk). 
+ SNMALLOC_CHECK(pool_alloc_count >= 1); + + size_t deallocs_before = pool_dealloc_count; + for (auto p : ps) + r.dealloc_range(p, UNIT_SIZE); + + // Consolidation should reassemble the whole chunk and donate + // it back to the parent. + SNMALLOC_CHECK(pool_dealloc_count > deallocs_before); + + printf(" SmallArenaRange consolidation returns chunk: OK\n"); + } + + // alloc_size_with_align + + static void test_alloc_size_with_align_exact() + { + reset_pool(); + SmallArena r; + + size_t size = unit_size(4); + size_t align = UNIT_SIZE; + auto p = r.alloc_size_with_align(size, align); + SNMALLOC_CHECK(p != nullptr); + SNMALLOC_CHECK((address_cast(p) & (align - 1)) == 0); + + r.dealloc_range(p, size); + printf(" alloc_size_with_align exact (no leftover): OK\n"); + } + + static void test_alloc_size_with_align_pow2_align_over_size() + { + reset_pool(); + SmallArena r; + + size_t size = unit_size(3) + 2; + size_t align = 256; + SNMALLOC_CHECK(align <= MIN_CHUNK_SIZE); + SNMALLOC_CHECK(align >= UNIT_SIZE); + SNMALLOC_CHECK(bits::is_pow2(align)); + + auto p = r.alloc_size_with_align(size, align); + SNMALLOC_CHECK(p != nullptr); + SNMALLOC_CHECK((address_cast(p) & (align - 1)) == 0); + + size_t used = bits::align_up(size, UNIT_SIZE); + size_t requested = bits::align_up(size, align); + SNMALLOC_CHECK(requested - used > 0); + + // Donated tail and the carved-but-unused chunk remainder both + // sit in the arena, so the follow-up alloc must succeed + // without a second parent refill — exact address is not + // pinned down. + auto tail = r.alloc_range(requested - used); + SNMALLOC_CHECK(tail != nullptr); + SNMALLOC_CHECK(pool_alloc_count == 1); + + r.dealloc_range(p, used); + r.dealloc_range(tail, requested - used); + printf(" alloc_size_with_align pow2 align over non-pow2 size: OK\n"); + } + + static void test_alloc_size_with_align_align_larger_than_size() + { + reset_pool(); + SmallArena r; + + // User's motivating example, scaled into the test arena. 
+ size_t align = 4096; + SNMALLOC_CHECK(align <= MIN_CHUNK_SIZE); + size_t size = align - 254; + + auto p = r.alloc_size_with_align(size, align); + SNMALLOC_CHECK(p != nullptr); + SNMALLOC_CHECK((address_cast(p) & (align - 1)) == 0); + + size_t used = bits::align_up(size, UNIT_SIZE); + auto tail = r.alloc_range(align - used); + SNMALLOC_CHECK(tail != nullptr); + SNMALLOC_CHECK(pool_alloc_count == 1); + + r.dealloc_range(p, used); + r.dealloc_range(tail, align - used); + printf(" alloc_size_with_align align > size: OK\n"); + } + + static void test_alloc_size_with_align_chunk_bypass() + { + reset_pool(); + SmallArena r; + + size_t size = MIN_CHUNK_SIZE - 100; + size_t align = MIN_CHUNK_SIZE; + auto p = r.alloc_size_with_align(size, align); + SNMALLOC_CHECK(p != nullptr); + SNMALLOC_CHECK((address_cast(p) & (align - 1)) == 0); + SNMALLOC_CHECK(pool_alloc_count == 1); + + // requested == MIN_CHUNK_SIZE bypasses to parent (whole chunk, + // no carve-time leftover), so the only free arena fragment is + // the donated tail — pin its exact address. Tail stays + // intra-chunk, so no dealloc to parent. 
+ SNMALLOC_CHECK(pool_dealloc_count == 0); + + size_t used = bits::align_up(size, UNIT_SIZE); + if (used < MIN_CHUNK_SIZE) + { + auto tail = r.alloc_range(MIN_CHUNK_SIZE - used); + SNMALLOC_CHECK(tail != nullptr); + SNMALLOC_CHECK(address_cast(tail) == address_cast(p) + used); + r.dealloc_range(tail, MIN_CHUNK_SIZE - used); + } + r.dealloc_range(p, used); + + printf(" alloc_size_with_align chunk-sized bypass: OK\n"); + } +} // namespace snmalloc + +int main() +{ + printf("--- InplaceRep tests ---\n"); + printf( + " UNIT_SIZE=%zu, MIN_BITS=%zu, MAX_SIZE_BITS=%zu, STRESS_UNITS=%zu\n", + snmalloc::UNIT_SIZE, + snmalloc::MIN_BITS, + snmalloc::MAX_SIZE_BITS, + snmalloc::STRESS_UNITS); + + printf("(A) Accessor round-trips:\n"); + snmalloc::test_variant_roundtrip(); + snmalloc::test_large_size_roundtrip(); + + printf("(B) Red bits independent:\n"); + snmalloc::test_red_bits_independent(); + snmalloc::test_can_consolidate_chunk_boundary(); + + printf("(C) Arena add/remove:\n"); + snmalloc::test_arena_add_remove_single(); + + printf("(D) Arena consolidation:\n"); + snmalloc::test_arena_consolidation(); + + printf("(E) Arena carve:\n"); + snmalloc::test_arena_carve(); + + printf("(F) Stress:\n"); + snmalloc::test_stress(); + + printf("(G) SmallArenaRange:\n"); + snmalloc::test_small_arena_basic(); + snmalloc::test_small_arena_chunk_pass_through(); + snmalloc::test_small_arena_unaligned_dealloc(); + snmalloc::test_small_arena_consolidation_returns_chunk(); + snmalloc::test_alloc_size_with_align_exact(); + snmalloc::test_alloc_size_with_align_pow2_align_over_size(); + snmalloc::test_alloc_size_with_align_align_larger_than_size(); + snmalloc::test_alloc_size_with_align_chunk_bypass(); + + printf("All InplaceRep tests passed.\n"); + return 0; +} From 43ef58ce98bd7140366a6be04654ff92eda2eb34 Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Tue, 9 Jun 2026 09:25:03 +0100 Subject: [PATCH 13/15] Wire SmallArenaRange in as the LocalMetaRange * StandardLocalState and 
MetaProtectedRangeLocalState gain an Authmap template parameter, plumbed through alongside Pagemap. Both configs and the domestication test pass their Authmap into the LocalState instantiation. * The three SmallBuddyRange uses in the meta-range pipes are replaced with SmallArenaRange. * Each LocalState publishes MIN_META_ALIGN (= MetaRange::UNIT_SIZE) so the backend can round metadata sizes to the meta range's natural granularity instead of the next power of two. The backend gains a meta_size_round helper that pads to MIN_META_ALIGN, stepping up to MIN_CHUNK_SIZE for requests that would bypass the small range to the parent LargeArenaRange. All four metadata-allocation sites (alloc_meta_data x2, alloc_chunk, dealloc_chunk) use this helper; alloc and dealloc must agree on the rounding so the meta range's dealloc_range assertions hold. * The previous bits::next_pow2 rounding was a leftover from the buddy era. With a ClientMetaDataProvider whose per-slab storage is non-pow2 (e.g. allocation bitmap + small fixed header), the pow2 rounding doubled the metadata overhead; meta_size_round eliminates that. * FixedRangeConfig's inline Authmap gains amplify_from_address (the new SmallArenaRange path needs it). A new test func/client_meta_nonpow2 installs a ClientMetaDataProvider whose per-slab storage is non-pow2 and exercises alloc/dealloc round-tripping across several sizeclasses; any disagreement between alloc-side and dealloc-side rounding would trip the meta range's dealloc_range assertions. SmallBuddyRange.h is now orphaned but stays in tree until the cleanup commit removes it. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/snmalloc/backend/backend.h | 34 +++++-- src/snmalloc/backend/fixedglobalconfig.h | 9 +- src/snmalloc/backend/globalconfig.h | 4 +- src/snmalloc/backend/meta_protected_range.h | 13 ++- src/snmalloc/backend/standard_range.h | 11 ++- .../client_meta_nonpow2.cc | 93 +++++++++++++++++++ src/test/func/domestication/domestication.cc | 2 +- 7 files changed, 150 insertions(+), 16 deletions(-) create mode 100644 src/test/func/client_meta_nonpow2/client_meta_nonpow2.cc diff --git a/src/snmalloc/backend/backend.h b/src/snmalloc/backend/backend.h index 2fcdf2a57..5f8a0aca6 100644 --- a/src/snmalloc/backend/backend.h +++ b/src/snmalloc/backend/backend.h @@ -23,7 +23,27 @@ namespace snmalloc using Pal = PAL; using SlabMetadata = typename PagemapEntry::SlabMetadata; - public: + /** + * Round a metadata allocation size to a value the meta range can + * service. + * + * - Pads to `LocalState::MIN_META_ALIGN` so that the in-band small + * meta range (`SmallArenaRange`) accepts it. + * - If the result reaches `MIN_CHUNK_SIZE`, the request will bypass + * the small range to the parent `LargeArenaRange`, which requires + * `MIN_CHUNK_SIZE` alignment; step up to satisfy that. + * + * Alloc and dealloc sites MUST share this helper so a chunk's + * metadata is freed at the same size it was allocated. + */ + SNMALLOC_FAST_PATH static size_t meta_size_round(size_t size) + { + size_t r = bits::align_up(size, LocalState::MIN_META_ALIGN); + if (r >= MIN_CHUNK_SIZE) + r = bits::align_up(r, MIN_CHUNK_SIZE); + return r; + } + /** * Provide a block of meta-data with size and align. 
* @@ -46,7 +66,8 @@ namespace snmalloc if (local_state != nullptr) { - p = local_state->get_meta_range().alloc_range_with_leftover(size); + auto& meta_range = local_state->get_meta_range(); + p = meta_range.alloc_range(meta_size_round(size)); } else { @@ -54,7 +75,7 @@ namespace snmalloc GlobalMetaRange::ConcurrencySafe, "Global meta data range needs to be concurrency safe."); GlobalMetaRange global_state; - p = global_state.alloc_range(bits::next_pow2(size)); + p = global_state.alloc_range(meta_size_round(size)); } if (p == nullptr) @@ -106,7 +127,7 @@ namespace snmalloc // Calculate the extra bytes required to store the client meta-data. size_t extra_bytes = SlabMetadata::get_extra_bytes(sizeclass); - auto meta_size = bits::next_pow2(sizeof(SlabMetadata) + extra_bytes); + auto meta_size = meta_size_round(sizeof(SlabMetadata) + extra_bytes); #ifdef SNMALLOC_TRACING message<1024>( @@ -156,8 +177,7 @@ namespace snmalloc SNMALLOC_ASSERT(slab_index < (size_t{1} << OFFSET_BITS)); const uintptr_t ras_i = ras | (slab_index << SIZECLASS_BITS); typename Pagemap::Entry t_i(meta, ras_i); - Pagemap::set_metaentry( - address_cast(p) + chunk_offset, slab_size, t_i); + Pagemap::set_metaentry(address_cast(p) + chunk_offset, slab_size, t_i); } return {Aal::capptr_bound(p, size), meta}; @@ -207,7 +227,7 @@ namespace snmalloc // Calculate the extra bytes required to store the client meta-data. 
size_t extra_bytes = SlabMetadata::get_extra_bytes(sizeclass); - auto meta_size = bits::next_pow2(sizeof(SlabMetadata) + extra_bytes); + auto meta_size = meta_size_round(sizeof(SlabMetadata) + extra_bytes); local_state.get_meta_range().dealloc_range( capptr::Arena::unsafe_from(&slab_metadata), meta_size); diff --git a/src/snmalloc/backend/fixedglobalconfig.h b/src/snmalloc/backend/fixedglobalconfig.h index 5bd3b68b5..94c3c67f1 100644 --- a/src/snmalloc/backend/fixedglobalconfig.h +++ b/src/snmalloc/backend/fixedglobalconfig.h @@ -39,10 +39,17 @@ namespace snmalloc { return Aal::capptr_rebound(arena, c); } + + template + static SNMALLOC_FAST_PATH capptr::Arena + amplify_from_address(address_t a) + { + return pointer_offset(arena, a - address_cast(arena)); + } }; public: - using LocalState = StandardLocalState; + using LocalState = StandardLocalState; using GlobalPoolState = PoolState>; diff --git a/src/snmalloc/backend/globalconfig.h b/src/snmalloc/backend/globalconfig.h index 208210b65..9bdada06c 100644 --- a/src/snmalloc/backend/globalconfig.h +++ b/src/snmalloc/backend/globalconfig.h @@ -68,8 +68,8 @@ namespace snmalloc */ using LocalState = stl::conditional_t< mitigations(metadata_protection), - MetaProtectedRangeLocalState, - StandardLocalState>; + MetaProtectedRangeLocalState, + StandardLocalState>; /** * Use the default backend. 
diff --git a/src/snmalloc/backend/meta_protected_range.h b/src/snmalloc/backend/meta_protected_range.h index 9b4ca756b..76916e82f 100644 --- a/src/snmalloc/backend/meta_protected_range.h +++ b/src/snmalloc/backend/meta_protected_range.h @@ -24,6 +24,7 @@ namespace snmalloc template< typename PAL, typename Pagemap, + typename Authmap, typename Base, size_t MinSizeBits = MinBaseSizeBits()> struct MetaProtectedRangeLocalState : BaseLocalStateConstants @@ -104,13 +105,19 @@ namespace snmalloc LocalCacheSizeBits - SubRangeRatioBits, bits::BITS - 1, Pagemap>, - SmallBuddyRange>; + SmallArenaRange>; ObjectRange object_range; MetaRange meta_range; public: + /// Granularity of the local meta range. Backend rounds metadata + /// allocation sizes up to this; replaces pow2 rounding. + static constexpr size_t MIN_META_ALIGN = MetaRange::UNIT_SIZE; + static_assert( + bits::is_pow2(MIN_META_ALIGN), "MIN_META_ALIGN must be a power of two"); + using Stats = StatsCombiner; ObjectRange* get_object_range() @@ -124,9 +131,9 @@ namespace snmalloc } // Create global range that can service small meta-data requests. - // Don't want to add the SmallBuddyRange to the CentralMetaRange as that + // Don't want to add the SmallArenaRange to the CentralMetaRange as that // would require committing memory inside the main global lock. 
using GlobalMetaRange = - Pipe; + Pipe, GlobalRange>; }; } // namespace snmalloc diff --git a/src/snmalloc/backend/standard_range.h b/src/snmalloc/backend/standard_range.h index b67a386d9..98259dd9f 100644 --- a/src/snmalloc/backend/standard_range.h +++ b/src/snmalloc/backend/standard_range.h @@ -22,6 +22,7 @@ namespace snmalloc template< typename PAL, typename Pagemap, + typename Authmap, typename Base = EmptyRange<>, size_t MinSizeBits = MinBaseSizeBits()> struct StandardLocalState : BaseLocalStateConstants @@ -56,18 +57,24 @@ namespace snmalloc page_size_bits>>>; private: - using ObjectRange = Pipe; + using ObjectRange = Pipe>; ObjectRange object_range; public: + /// Granularity of the local meta range. Backend rounds metadata + /// allocation sizes up to this; replaces pow2 rounding. + static constexpr size_t MIN_META_ALIGN = ObjectRange::UNIT_SIZE; + static_assert( + bits::is_pow2(MIN_META_ALIGN), "MIN_META_ALIGN must be a power of two"); + // Expose a global range for the initial allocation of meta-data. using GlobalMetaRange = Pipe; /** * Where we turn for allocations of user chunks. * - * Reach over the SmallBuddyRange that's at the near end of the ObjectRange + * Reach over the SmallArenaRange that's at the near end of the ObjectRange * pipe, rather than having that range adapter dynamically branch to its * parent. */ diff --git a/src/test/func/client_meta_nonpow2/client_meta_nonpow2.cc b/src/test/func/client_meta_nonpow2/client_meta_nonpow2.cc new file mode 100644 index 000000000..70d6ab928 --- /dev/null +++ b/src/test/func/client_meta_nonpow2/client_meta_nonpow2.cc @@ -0,0 +1,93 @@ +/** + * Exercises the slab metadata allocation path with a ClientMetaDataProvider + * whose per-slab extra_bytes is non-power-of-two. + * + * The backend rounds slab metadata sizes to `MIN_META_ALIGN` (= the meta + * range's UNIT_SIZE) rather than the next power of two, so a non-pow2 + * client meta size actually occupies a non-pow2 slab metadata block. 
+ * This test gates the alloc/dealloc round-trip on that path: if + * `meta_size_round` is wrong, an inconsistent alloc/dealloc size would + * either trip an assertion in the meta range or leak. + */ + +#include "test/setup.h" + +#include +#include +#include +#include + +namespace snmalloc +{ + /** + * Per-slab client meta: `max_count + 7` bytes of storage. With + * `StorageType = uint8_t`, the resulting extra_bytes + * (= (required_count - 1) * 1) is non-power-of-two for typical + * sizeclass slab object counts. + */ + struct NonPow2ClientMetaDataProvider + { + using StorageType = uint8_t; + using DataRef = uint8_t&; + + static size_t required_count(size_t max_count) + { + return max_count + 7; + } + + static DataRef get(StorageType* base, size_t index) + { + return base[index]; + } + }; + + using Config = + snmalloc::StandardConfigClientMeta; +} // namespace snmalloc + +#define SNMALLOC_PROVIDE_OWN_CONFIG +#include + +int main() +{ +#if defined(SNMALLOC_ENABLE_GWP_ASAN_INTEGRATION) + // This test does not make sense in GWP-ASan mode. + return 0; +#else + // Spread allocations across several small sizeclasses to force a + // variety of slab metadata sizes; each combination of (slab object + // count, +7 bytes) produces a different non-pow2 extra_bytes. 
+ constexpr size_t sizes[] = {16, 48, 96, 192, 512, 1024}; + std::vector> ptrs; + + for (size_t round = 0; round < 5; round++) + { + for (size_t s : sizes) + { + for (size_t i = 0; i < 200; i++) + { + auto p = snmalloc::libc::malloc(s); + auto& meta = snmalloc::get_client_meta_data(p); + uint8_t tag = static_cast((round * 31 + s + i) & 0xff); + meta = tag; + memset(p, tag, s); + ptrs.emplace_back(p, tag); + } + } + } + + for (auto [p, tag] : ptrs) + { + auto& meta = snmalloc::get_client_meta_data(p); + if (meta != tag) + { + std::cout << "Meta mismatch: expected " << int(tag) << " got " + << int(meta) << std::endl; + abort(); + } + snmalloc::libc::free(p); + } + + return 0; +#endif +} diff --git a/src/test/func/domestication/domestication.cc b/src/test/func/domestication/domestication.cc index 1c2eb9fef..63b8b380d 100644 --- a/src/test/func/domestication/domestication.cc +++ b/src/test/func/domestication/domestication.cc @@ -39,7 +39,7 @@ namespace snmalloc PagemapRegisterRange, PagemapRegisterRange>; - using LocalState = StandardLocalState; + using LocalState = StandardLocalState; using GlobalPoolState = PoolState>; From f8730745d7e011ee27faa2bb93893b7e992036dc Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Fri, 12 Jun 2026 13:38:55 +0100 Subject: [PATCH 14/15] Remove buddy allocators LargeBuddyRange, SmallBuddyRange and their shared buddy.h are no longer reachable now that SmallArenaRange owns the metadata path and LargeArenaRange owns the large-range path. Delete them (-848 lines) and clean up stale references in comments, README, AddressSpace.md, and the MIN_HEAP_SIZE_FOR_THREAD_LOCAL_BUDDY constant (renamed ..._CACHE since the per-thread cache it gates is no longer specifically a buddy). 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- docs/AddressSpace.md | 43 +- src/snmalloc/README.md | 2 +- src/snmalloc/backend/fixedglobalconfig.h | 2 +- src/snmalloc/backend/standard_range.h | 4 +- .../backend_helpers/backend_helpers.h | 3 - src/snmalloc/backend_helpers/buddy.h | 199 --------- .../backend_helpers/largebuddyrange.h | 397 ------------------ .../backend_helpers/smallbuddyrange.h | 252 ----------- .../backend_helpers/staticconditionalrange.h | 4 +- src/snmalloc/mitigations/allocconfig.h | 9 +- src/test/func/cheri/cheri.cc | 4 +- 11 files changed, 32 insertions(+), 887 deletions(-) delete mode 100644 src/snmalloc/backend_helpers/buddy.h delete mode 100644 src/snmalloc/backend_helpers/largebuddyrange.h delete mode 100644 src/snmalloc/backend_helpers/smallbuddyrange.h diff --git a/docs/AddressSpace.md b/docs/AddressSpace.md index 1e28491ee..030023513 100644 --- a/docs/AddressSpace.md +++ b/docs/AddressSpace.md @@ -26,14 +26,14 @@ For simplicity, we gloss over much of the "lazy initialization" that would actua Because the two exercise similar bits of machinery, we now track them in parallel in prose despite their sequential nature. 4. The `BackendAllocator` has a chain of "range" types that it uses to manage address space. - By default (and in the case we are considering), that chain begins with a per-thread "small buddy allocator range". + By default (and in the case we are considering), that chain begins with a per-thread *small arena range*. 1. For the metadata allocation, the size is (well) below `MIN_CHUNK_SIZE` and so this allocator, which by supposition is empty, attempts to `refill` itself from its parent. This results in a request for a `MIN_CHUNK_SIZE` chunk from the parent allocator. 2. For the chunk allocation, the size is `MIN_CHUNK_SIZE` or larger, so this allocator immediately forwards the request to its parent. -5. 
The next range allocator in the chain is a per-thread *large* buddy allocator that refills in 2 MiB granules. +5. The next range allocator in the chain is a per-thread `LargeArenaRange` that refills in 2 MiB granules. (2 MiB chosen because it is a typical superpage size.) At this point, both requests are for at least one and no more than a few times `MIN_CHUNK_SIZE` bytes. @@ -48,7 +48,7 @@ For simplicity, we gloss over much of the "lazy initialization" that would actua 8. The next entry in the chain is a `StatsRange` which serves to accumulate statistics. We ignore this stage and continue onwards. -9. The next entry in the chain is another *large* buddy allocator which refills at 16 MiB but can hold regions +9. The next entry in the chain is another `LargeArenaRange` which refills at 16 MiB but can hold regions of any size up to the entire address space. The first request triggers a `refill`, continuing along the chain as a 16 MiB request. (Recall that the second allocation will be handled at an earlier point on the chain.) @@ -61,15 +61,15 @@ For simplicity, we gloss over much of the "lazy initialization" that would actua 12. Having wound the chain onto our stack, we now unwind! The `PagemapRegisterRange` ensures that the Pagemap entries for allocations passing through it are mapped and returns the allocation unaltered. -13. The global large buddy allocator splits the 16 MiB refill into 8, 4, and 2 MiB regions it retains as well as returning the remaining 2 MiB back along the chain. +13. The global `LargeArenaRange` carves the request out of its 16 MiB refill and keeps the unused remainder as a single free block in its internal red-black trees of free ranges, returning the carved portion back along the chain. 14. The `StatsRange` makes its observations, the `GlobalRange` now unlocks the global component of the chain, and the `CommitRange` ensures that the allocation is mapped. 
Aside from these side effects, these propagate the allocation along the chain unaltered. -15. We now arrive back at the thread-local large buddy allocator, which takes its 2 MiB refill and breaks it down into powers of two down to the requested `MIN_CHUNK_SIZE`. - The second allocation (of the chunk), will either return or again break down one of these intermediate chunks. +15. We now arrive back at the thread-local `LargeArenaRange`, which takes its 2 MiB refill and carves out the requested chunk(s); the unused remainder stays in its free-range trees. + The second allocation (of the chunk) will either be satisfied from this leftover or trigger another carve. -16. For the first (metadata) allocation, the thread-local *small* allocator breaks the `MIN_CHUNK_SIZE` allocation down into powers of two down to `PAGEMAP_METADATA_STRUCT_SIZE` and returns one of that size. +16. For the first (metadata) allocation, the thread-local *small arena range* takes its `MIN_CHUNK_SIZE` refill, hands back a sub-chunk fragment large enough for `PAGEMAP_METADATA_STRUCT_SIZE`, and tracks the remainder as free sub-chunk space using tree nodes stored inside the free fragments themselves. The second allocation will have been forwarded and so is not additionally handled here. Exciting, no? @@ -98,26 +98,19 @@ For chunks owned by the *frontend* (`REMOTE_BACKEND_MARKER` not asserted), 2. A bit (`META_BOUNDARY_BIT`) that serves to limit chunk coalescing on platforms where that may not be possible, such as CHERI. -See `src/backend/metatypes.h` and `src/mem/metaslab.h`. +See `src/snmalloc/mem/metadata.h`. For chunks owned by a *backend* (`REMOTE_BACKEND_MARKER` asserted), there are again multiple possibilities. -For chunks owned by a *small buddy allocator*, the remainder of the `MetaEntry` is zero. +For chunks owned by a *small arena range* (`SmallArenaRange`), the remainder of the `MetaEntry` is zero. That is, it appears to have small sizeclass 0 and an implausible `RemoteAllocator*`. 
+The free-fragment tree itself is stored in-band, inside the free space of the chunk, rather than in the pagemap (see `InplaceRep` in `src/snmalloc/backend_helpers/inplacerep.h`). -For chunks owned by a *large buddy allocator*, the `MetaEntry` is instead a node in a red-black tree of all such chunks. -Its contents can be decoded as follows: +For chunks owned by a `LargeArenaRange`, the `MetaEntry` is instead a node in the red-black trees of free ranges. +A free block of *N* units consumes the `MetaEntry`s of its first *min(N, 3)* unit-aligned addresses; their words encode the bin-tree node (unit 0), the range-tree node (unit 1, for blocks of two or more units), and the large-chunk count (unit 2, for blocks of three or more units). +The pagemap reserves the low `MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT` bits of each word for the meta-entry layout itself; the tree-node encoding (left/right pointers, red bit, variant tag, large-size count) lives at or above that bit. -1. The `meta` field's `META_BOUNDARY_BIT` is preserved, with the same meaning as in the frontend case, above. - -2. `meta` (resp. `remote_and_sizeclass`) includes a pointer to the left (resp. right) *chunk* of address space. - (The corresponding child *node* in this tree is found by taking the *address* of this chunk and looking up the `MetaEntry` in the Pagemap. - This trick of pointing at the child's chunk rather than at the child `MetaEntry` is particularly useful on CHERI: - it allows us to capture the authority to the chunk without needing another pointer and costs just a shift and add.) - -3. The `meta` field's `LargeBuddyRep::RED_BIT` is used to carry the red/black color of this node. - -See `src/backend/largebuddyrange.h`. +See `PagemapRep` in `src/snmalloc/backend_helpers/largearenarange.h`. ### Encoding a MetaEntry @@ -131,18 +124,20 @@ The following cases apply: * has "small" sizeclass 0, which has size 0. * has no associated metadata structure. -2. 
The address is part of a free chunk in a backend's Large Buddy Allocator: +2. The address is part of a free chunk in a backend `LargeArenaRange`: The `MetaEntry`... * has `REMOTE_BACKEND_MARKER` asserted in `remote_and_sizeclass`. * has "small" sizeclass 0, which has size 0. - * the remainder of its `MetaEntry` structure will be a Large Buddy Allocator rbtree node. + * the remainder of its `MetaEntry` structure (and those of the next one or two unit-aligned `MetaEntry`s if the free block spans them) carries the `Arena`'s red-black-tree node encoding. * has no associated metadata structure. -3. The address is part of a free chunk inside a backend's Small Buddy Allocator: +3. The address is part of a free fragment inside a backend `SmallArenaRange`: Here, the `MetaEntry` is zero aside from the asserted `REMOTE_BACKEND_MARKER` bit, and so it... * has "small" sizeclass 0, which has size 0. * has no associated metadata structure. + The tree of free sub-chunk fragments for this chunk is stored inside the free fragments themselves (`InplaceRep`), not in the pagemap. + 4. The address is part of a live large allocation (spanning one or more 16KiB chunks): Here, the `MetaEntry`... * has `REMOTE_BACKEND_MARKER` clear in `remote_and_sizeclass`. diff --git a/src/snmalloc/README.md b/src/snmalloc/README.md index 2549320fb..f598f8171 100644 --- a/src/snmalloc/README.md +++ b/src/snmalloc/README.md @@ -20,7 +20,7 @@ These are arranged in a hierarchy such that each of the directories may include - `mem/` provides the core allocator abstractions. The code here is templated over a back-end, which defines a particular embedding of snmalloc. - `backend_helpers/` provides helper classes for use in defining a back end. - This includes data structures such as pagemap implementations (efficient maps from a chunk address to associated metadata) and buddy allocators for managing address-space ranges. 
+ This includes data structures such as pagemap implementations (efficient maps from a chunk address to associated metadata) and range allocators for managing address-space ranges. - `backend/` provides some example implementations for snmalloc embeddings that provide a global memory allocator for an address space. Users may ignore this entirely and use the types in `mem/` with a custom back end to expose an snmalloc instance with specific behaviour. Layers above this can be used with a custom configuration by defining `SNMALLOC_PROVIDE_OWN_CONFIG` and exporting a type as `snmalloc::Config` that defines the configuration. diff --git a/src/snmalloc/backend/fixedglobalconfig.h b/src/snmalloc/backend/fixedglobalconfig.h index 94c3c67f1..68b41f860 100644 --- a/src/snmalloc/backend/fixedglobalconfig.h +++ b/src/snmalloc/backend/fixedglobalconfig.h @@ -93,7 +93,7 @@ namespace snmalloc Pagemap::concretePagemap.init(base, length); // Make this a alloc_config constant. - if (length < MIN_HEAP_SIZE_FOR_THREAD_LOCAL_BUDDY) + if (length < MIN_HEAP_SIZE_FOR_THREAD_LOCAL_CACHE) { LocalState::set_small_heap(); } diff --git a/src/snmalloc/backend/standard_range.h b/src/snmalloc/backend/standard_range.h index 98259dd9f..f46e6085d 100644 --- a/src/snmalloc/backend/standard_range.h +++ b/src/snmalloc/backend/standard_range.h @@ -46,8 +46,8 @@ namespace snmalloc bits::next_pow2_bits_const(PAL::page_size); public: - // Source for object allocations and metadata - // Use buddy allocators to cache locally. + // Source for object allocations and metadata; thread-local cache + // for chunk-sized ranges. using LargeObjectRange = Pipe< Stats, StaticConditionalRange - class Buddy - { - static_assert(MAX_SIZE_BITS > MIN_SIZE_BITS); - - struct Entry - { - typename Rep::Contents cache[3]; - RBTree tree{}; - }; - - stl::Array entries{}; - // All RBtrees at or above this index should be empty. 
- size_t empty_at_or_above{0}; - - size_t to_index(size_t size) - { - SNMALLOC_ASSERT(size != 0); - SNMALLOC_ASSERT(bits::is_pow2(size)); - auto log = snmalloc::bits::next_pow2_bits(size); - SNMALLOC_ASSERT_MSG( - log >= MIN_SIZE_BITS, "Size too big: {} log {}.", size, log); - SNMALLOC_ASSERT_MSG( - log < MAX_SIZE_BITS, "Size too small: {} log {}.", size, log); - - return log - MIN_SIZE_BITS; - } - - void validate_block(typename Rep::Contents addr, size_t size) - { - SNMALLOC_ASSERT(bits::is_pow2(size)); - SNMALLOC_ASSERT(addr == Rep::align_down(addr, size)); - UNUSED(addr, size); - } - - void invariant() - { -#ifndef NDEBUG - for (size_t i = empty_at_or_above; i < entries.size(); i++) - { - SNMALLOC_ASSERT(entries[i].tree.is_empty()); - // TODO check cache is empty - } -#endif - } - - bool remove_buddy(typename Rep::Contents addr, size_t size) - { - auto idx = to_index(size); - - // Empty at this range. - if (idx >= empty_at_or_above) - return false; - - auto buddy = Rep::buddy(addr, size); - - // Check local cache first - for (auto& e : entries[idx].cache) - { - if (Rep::equal(buddy, e)) - { - if (!Rep::can_consolidate(addr, size)) - return false; - - e = entries[idx].tree.remove_min(); - return true; - } - } - - auto path = entries[idx].tree.get_root_path(); - bool contains_buddy = entries[idx].tree.find(path, buddy); - - if (!contains_buddy) - return false; - - // Only check if we can consolidate after we know the buddy is in - // the buddy allocator. This is required to prevent possible segfaults - // from looking at the buddies meta-data, which we only know exists - // once we have found it in the red-black tree. - if (!Rep::can_consolidate(addr, size)) - return false; - - entries[idx].tree.remove_path(path); - return true; - } - - public: - constexpr Buddy() = default; - - /** - * Add a block to the buddy allocator. - * - * Blocks needs to be power of two size and aligned to the same power of - * two. - * - * Returns null, if the block is successfully added. 
Otherwise, returns the - * consolidated block that is MAX_SIZE_BITS big, and hence too large for - * this allocator. - */ - typename Rep::Contents add_block(typename Rep::Contents addr, size_t size) - { - validate_block(addr, size); - - if (remove_buddy(addr, size)) - { - // Add to next level cache - size *= 2; - addr = Rep::align_down(addr, size); - if (size == bits::one_at_bit(MAX_SIZE_BITS)) - { - // Invariant should be checked on all non-tail return paths. - // Holds trivially here with current design. - invariant(); - // Too big for this buddy allocator. - return addr; - } - return add_block(addr, size); - } - - auto idx = to_index(size); - empty_at_or_above = bits::max(empty_at_or_above, idx + 1); - - for (auto& e : entries[idx].cache) - { - if (Rep::equal(Rep::null, e)) - { - e = addr; - return Rep::null; - } - } - - auto path = entries[idx].tree.get_root_path(); - entries[idx].tree.find(path, addr); - entries[idx].tree.insert_path(path, addr); - invariant(); - return Rep::null; - } - - /** - * Removes a block of size from the buddy allocator. - * - * Return Rep::null if this cannot be satisfied. 
- */ - typename Rep::Contents remove_block(size_t size) - { - invariant(); - auto idx = to_index(size); - if (idx >= empty_at_or_above) - return Rep::null; - - auto addr = entries[idx].tree.remove_min(); - for (auto& e : entries[idx].cache) - { - if (Rep::equal(Rep::null, addr) || Rep::compare(e, addr)) - { - addr = stl::exchange(e, addr); - } - } - - if (addr != Rep::null) - { - validate_block(addr, size); - return addr; - } - - if (size * 2 == bits::one_at_bit(MAX_SIZE_BITS)) - // Too big for this buddy allocator - return Rep::null; - - auto bigger = remove_block(size * 2); - if (bigger == Rep::null) - { - empty_at_or_above = idx; - invariant(); - return Rep::null; - } - - auto second = Rep::offset(bigger, size); - - // Split large block - add_block(second, size); - return bigger; - } - }; -} // namespace snmalloc diff --git a/src/snmalloc/backend_helpers/largebuddyrange.h b/src/snmalloc/backend_helpers/largebuddyrange.h deleted file mode 100644 index 3eb5f5c21..000000000 --- a/src/snmalloc/backend_helpers/largebuddyrange.h +++ /dev/null @@ -1,397 +0,0 @@ -#pragma once - -#include "../ds/ds.h" -#include "../mem/mem.h" -#include "buddy.h" -#include "empty_range.h" -#include "range_helpers.h" - -namespace snmalloc -{ - /** - * Class for using the pagemap entries for the buddy allocator. - */ - template - class BuddyChunkRep - { - public: - /* - * The values we store in our rbtree are the addresses of (combined spans - * of) chunks of the address space; as such, bits in (MIN_CHUNK_SIZE - 1) - * are unused and so the RED_BIT is packed therein. However, in practice, - * these are not "just any" uintptr_t-s, but specifically the uintptr_t-s - * inside the Pagemap's BackendAllocator::Entry structures. - * - * The BackendAllocator::Entry provides us with helpers that guarantee that - * we use only the bits that we are allowed to. 
- * @{ - */ - using Handle = MetaEntryBase::BackendStateWordRef; - using Contents = uintptr_t; - ///@} - - /** - * The bit that we will use to mark an entry as red. - * This has constraints in two directions, it must not be one of the - * reserved bits from the perspective of the meta entry and it must not be - * a bit that is a valid part of the address of a chunk. - * @{ - */ - static constexpr address_t RED_BIT = address_t(1) - << MetaEntryBase::BACKEND_LAYOUT_FIRST_FREE_BIT; - - static_assert(RED_BIT < MIN_CHUNK_SIZE); - static_assert(MetaEntryBase::is_backend_allowed_value( - MetaEntryBase::Word::One, RED_BIT)); - static_assert(MetaEntryBase::is_backend_allowed_value( - MetaEntryBase::Word::Two, RED_BIT)); - ///@} - - /// The value of a null node, as returned by `get` - static constexpr Contents null = 0; - /// The value of a null node, as stored in a `uintptr_t`. - static constexpr Contents root = 0; - - /** - * Set the value. Preserve the red/black colour. - */ - static void set(Handle ptr, Contents r) - { - ptr = r | (static_cast(ptr.get()) & RED_BIT); - } - - /** - * Returns the value, stripping out the red/black colour. - */ - static Contents get(const Handle ptr) - { - return ptr.get() & ~RED_BIT; - } - - /** - * Returns a pointer to the tree node for the specified address. - */ - static Handle ref(bool direction, Contents k) - { - // Special case for accessing the null entry. We want to make sure - // that this is never modified by the back end, so we make it point to - // a constant entry and use the MMU to trap even in release modes. - // The mask passed to the handle is irrelevant: the null entry is - // never written (any attempt would trap), and on read its underlying - // value is zero so `get()` returns zero regardless of the mask. 
- static const Contents null_entry = 0; - if (SNMALLOC_UNLIKELY(address_cast(k) == 0)) - { - return {const_cast(&null_entry), 0}; - } - auto& entry = Pagemap::template get_metaentry_mut(address_cast(k)); - if (direction) - return entry.get_backend_word(Pagemap::Entry::Word::One); - - return entry.get_backend_word(Pagemap::Entry::Word::Two); - } - - static bool is_red(Contents k) - { - return (ref(true, k).get() & RED_BIT) == RED_BIT; - } - - static void set_red(Contents k, bool new_is_red) - { - if (new_is_red != is_red(k)) - { - auto v = ref(true, k); - v = v.get() ^ RED_BIT; - } - SNMALLOC_ASSERT(is_red(k) == new_is_red); - } - - static Contents offset(Contents k, size_t size) - { - return k + size; - } - - static Contents buddy(Contents k, size_t size) - { - return k ^ size; - } - - static Contents align_down(Contents k, size_t size) - { - return k & ~(size - 1); - } - - static bool compare(Contents k1, Contents k2) - { - return k1 > k2; - } - - static bool equal(Contents k1, Contents k2) - { - return k1 == k2; - } - - static uintptr_t printable(Contents k) - { - return k; - } - - /** - * Convert the pointer wrapper into something that the snmalloc debug - * printing code can print. - */ - static address_t printable(Handle k) - { - return k.printable_address(); - } - - /** - * Returns the name for use in debugging traces. Not used in normal builds - * (release or debug), only when tracing is enabled. - */ - static const char* name() - { - return "BuddyChunkRep"; - } - - static bool can_consolidate(Contents k, size_t size) - { - // Need to know both entries exist in the pagemap. - // This must only be called if that has already been - // ascertained. - // The buddy could be in a part of the pagemap that has - // not been registered and thus could segfault on access. 
- auto larger = bits::max(k, buddy(k, size)); - auto& entry = - Pagemap::template get_metaentry_mut(address_cast(larger)); - return !entry.is_boundary(); - } - }; - - /** - * Used to represent a consolidating range of memory. Uses a buddy allocator - * to consolidate adjacent blocks. - * - * ParentRange - Represents the range to get memory from to fill this range. - * - * REFILL_SIZE_BITS - Maximum size of a refill, may ask for less during warm - * up phase. - * - * MAX_SIZE_BITS - Maximum size that this range will store. - * - * Pagemap - How to access the pagemap, which is used to store the red black - * tree nodes for the buddy allocators. - * - * MIN_REFILL_SIZE_BITS - The minimum size that the ParentRange can be asked - * for - */ - template< - size_t REFILL_SIZE_BITS, - size_t MAX_SIZE_BITS, - SNMALLOC_CONCEPT(IsWritablePagemap) Pagemap, - size_t MIN_REFILL_SIZE_BITS = 0> - class LargeBuddyRange - { - static_assert( - REFILL_SIZE_BITS <= MAX_SIZE_BITS, "REFILL_SIZE_BITS > MAX_SIZE_BITS"); - static_assert( - MIN_REFILL_SIZE_BITS <= REFILL_SIZE_BITS, - "MIN_REFILL_SIZE_BITS > REFILL_SIZE_BITS"); - - /** - * Maximum size of a refill - */ - static constexpr size_t REFILL_SIZE = bits::one_at_bit(REFILL_SIZE_BITS); - - /** - * Minimum size of a refill - */ - static constexpr size_t MIN_REFILL_SIZE = - bits::one_at_bit(MIN_REFILL_SIZE_BITS); - - public: - template> - class Type : public ContainsParent - { - using ContainsParent::parent; - - /** - * The size of memory requested so far. - * - * This is used to determine the refill size. - */ - size_t requested_total = 0; - - /** - * Buddy allocator used to represent this range of memory. - */ - Buddy, MIN_CHUNK_BITS, MAX_SIZE_BITS> buddy_large; - - /** - * The parent might not support deallocation if this buddy allocator - * covers the whole range. Uses template insanity to make this work. 
- */ - template - stl::enable_if_t - parent_dealloc_range(capptr::Arena base, size_t size) - { - static_assert( - MAX_SIZE_BITS != (bits::BITS - 1), "Don't set SFINAE parameter"); - parent.dealloc_range(base, size); - } - - void dealloc_overflow(capptr::Arena overflow) - { - if constexpr (MAX_SIZE_BITS != (bits::BITS - 1)) - { - if (overflow != nullptr) - { - parent.dealloc_range(overflow, bits::one_at_bit(MAX_SIZE_BITS)); - } - } - else - { - if (overflow != nullptr) - abort(); - } - } - - /** - * Add a range of memory to the address space. - * Divides blocks into power of two sizes with natural alignment - */ - void add_range(capptr::Arena base, size_t length) - { - range_to_pow_2_blocks( - base, length, [this](capptr::Arena base, size_t align, bool) { - auto overflow = - capptr::Arena::unsafe_from(reinterpret_cast( - buddy_large.add_block(base.unsafe_uintptr(), align))); - - dealloc_overflow(overflow); - }); - } - - capptr::Arena refill(size_t size) - { - if (ParentRange::Aligned) - { - // Use amount currently requested to determine refill size. - // This will gradually increase the usage of the parent range. - // So small examples can grow local caches slowly, and larger - // examples will grow them by the refill size. - // - // The heuristic is designed to allocate the following sequence for - // 16KiB requests 16KiB, 16KiB, 32Kib, 64KiB, ..., REFILL_SIZE/2, - // REFILL_SIZE, REFILL_SIZE, ... Hence if this if they are coming from - // a contiguous aligned range, then they could be consolidated. This - // depends on the ParentRange behaviour. 
- size_t refill_size = bits::min(REFILL_SIZE, requested_total); - refill_size = bits::max(refill_size, MIN_REFILL_SIZE); - refill_size = bits::max(refill_size, size); - refill_size = bits::next_pow2(refill_size); - - auto refill_range = parent.alloc_range(refill_size); - if (refill_range != nullptr) - { - requested_total += refill_size; - add_range(pointer_offset(refill_range, size), refill_size - size); - } - return refill_range; - } - - // Note the unaligned parent path does not use - // requested_total in the heuristic for the initial size - // this is because the request needs to introduce alignment. - // Currently the unaligned variant is not used as a local cache. - // So the gradual growing of refill_size is not needed. - - // Need to overallocate to get the alignment right. - bool overflow = false; - size_t needed_size = bits::umul(size, 2, overflow); - if (overflow) - { - return nullptr; - } - - auto refill_size = bits::max(needed_size, REFILL_SIZE); - while (needed_size <= refill_size) - { - auto refill = parent.alloc_range(refill_size); - - if (refill != nullptr) - { - requested_total += refill_size; - add_range(refill, refill_size); - - SNMALLOC_ASSERT(refill_size < bits::one_at_bit(MAX_SIZE_BITS)); - static_assert( - (REFILL_SIZE < bits::one_at_bit(MAX_SIZE_BITS)) || - ParentRange::Aligned, - "Required to prevent overflow."); - - return alloc_range(size); - } - - refill_size >>= 1; - } - - return nullptr; - } - - public: - static constexpr bool Aligned = true; - - static constexpr bool ConcurrencySafe = false; - - /* The large buddy allocator always deals in Arena-bounded pointers. 
*/ - using ChunkBounds = capptr::bounds::Arena; - static_assert( - stl::is_same_v); - - constexpr Type() = default; - - capptr::Arena alloc_range(size_t size) - { - SNMALLOC_ASSERT(size >= MIN_CHUNK_SIZE); - SNMALLOC_ASSERT(bits::is_pow2(size)); - - if (size >= bits::mask_bits(MAX_SIZE_BITS)) - { - if (ParentRange::Aligned) - return parent.alloc_range(size); - - return nullptr; - } - - auto result = capptr::Arena::unsafe_from( - reinterpret_cast(buddy_large.remove_block(size))); - - if (result != nullptr) - return result; - - return refill(size); - } - - void dealloc_range(capptr::Arena base, size_t size) - { - SNMALLOC_ASSERT(size >= MIN_CHUNK_SIZE); - SNMALLOC_ASSERT(bits::is_pow2(size)); - - if constexpr (MAX_SIZE_BITS != (bits::BITS - 1)) - { - if (size >= bits::mask_bits(MAX_SIZE_BITS)) - { - parent_dealloc_range(base, size); - return; - } - } - - auto overflow = - capptr::Arena::unsafe_from(reinterpret_cast( - buddy_large.add_block(base.unsafe_uintptr(), size))); - dealloc_overflow(overflow); - } - }; - }; -} // namespace snmalloc diff --git a/src/snmalloc/backend_helpers/smallbuddyrange.h b/src/snmalloc/backend_helpers/smallbuddyrange.h deleted file mode 100644 index 6f8400e83..000000000 --- a/src/snmalloc/backend_helpers/smallbuddyrange.h +++ /dev/null @@ -1,252 +0,0 @@ -#pragma once - -#include "../pal/pal.h" -#include "empty_range.h" -#include "range_helpers.h" - -namespace snmalloc -{ - /** - * struct for representing the redblack nodes - * directly inside the meta data. - */ - template - struct FreeChunk - { - CapPtr left; - CapPtr right; - }; - - /** - * Class for using the allocations own space to store in the RBTree. 
- */ - template - class BuddyInplaceRep - { - public: - using Handle = CapPtr, bounds>*; - using Contents = CapPtr, bounds>; - - static constexpr Contents null = nullptr; - static constexpr Contents root = nullptr; - - static constexpr address_t MASK = 1; - - static void set(Handle ptr, Contents r) - { - SNMALLOC_ASSERT((address_cast(r) & MASK) == 0); - if (r == nullptr) - *ptr = CapPtr, bounds>::unsafe_from( - reinterpret_cast*>((*ptr).unsafe_uintptr() & MASK)); - else - // Preserve lower bit. - *ptr = pointer_offset(r, (address_cast(*ptr) & MASK)) - .template as_static>(); - } - - static Contents get(Handle ptr) - { - return pointer_align_down<2, FreeChunk>((*ptr).as_void()); - } - - static Handle ref(bool direction, Contents r) - { - if (direction) - return &r->left; - - return &r->right; - } - - static bool is_red(Contents k) - { - if (k == nullptr) - return false; - return (address_cast(*ref(false, k)) & MASK) == MASK; - } - - static void set_red(Contents k, bool new_is_red) - { - if (new_is_red != is_red(k)) - { - auto r = ref(false, k); - auto old_addr = pointer_align_down<2, FreeChunk>(r->as_void()); - - if (new_is_red) - { - if (old_addr == nullptr) - *r = CapPtr, bounds>::unsafe_from( - reinterpret_cast*>(MASK)); - else - *r = pointer_offset(old_addr, MASK) - .template as_static>(); - } - else - { - *r = old_addr; - } - SNMALLOC_ASSERT(is_red(k) == new_is_red); - } - } - - static Contents offset(Contents k, size_t size) - { - return pointer_offset(k, size).template as_static>(); - } - - static Contents buddy(Contents k, size_t size) - { - // This is just doing xor size, but with what API - // exists on capptr. 
- auto base = pointer_align_down>(k.as_void(), size * 2); - auto offset = (address_cast(k) & size) ^ size; - return pointer_offset(base, offset) - .template as_static>(); - } - - static Contents align_down(Contents k, size_t size) - { - return pointer_align_down>(k.as_void(), size); - } - - static bool compare(Contents k1, Contents k2) - { - return address_cast(k1) > address_cast(k2); - } - - static bool equal(Contents k1, Contents k2) - { - return address_cast(k1) == address_cast(k2); - } - - static address_t printable(Contents k) - { - return address_cast(k); - } - - /** - * Return the holder in some format suitable for printing by snmalloc's - * debug log mechanism. Used only when used in tracing mode, not normal - * debug or release builds. Raw pointers are printable already, so this is - * the identity function. - */ - static Handle printable(Handle k) - { - return k; - } - - /** - * Return a name for use in tracing mode. Unused in any other context. - */ - static const char* name() - { - return "BuddyInplaceRep"; - } - - static bool can_consolidate(Contents k, size_t size) - { - UNUSED(k, size); - return true; - } - }; - - struct SmallBuddyRange - { - template> - class Type : public ContainsParent - { - public: - using ChunkBounds = typename ParentRange::ChunkBounds; - - private: - using ContainsParent::parent; - - static constexpr size_t MIN_BITS = - bits::next_pow2_bits_const(sizeof(FreeChunk)); - - Buddy, MIN_BITS, MIN_CHUNK_BITS> buddy_small; - - /** - * Add a range of memory to the address space. 
- * Divides blocks into power of two sizes with natural alignment - */ - void add_range(CapPtr base, size_t length) - { - range_to_pow_2_blocks( - base, - length, - [this](CapPtr base, size_t align, bool) { - if (align < MIN_CHUNK_SIZE) - { - CapPtr overflow = - buddy_small - .add_block( - base.template as_reinterpret>(), - align) - .template as_reinterpret(); - if (overflow != nullptr) - parent.dealloc_range( - overflow, bits::one_at_bit(MIN_CHUNK_BITS)); - } - else - { - parent.dealloc_range(base, align); - } - }); - } - - CapPtr refill(size_t size) - { - auto refill = parent.alloc_range(MIN_CHUNK_SIZE); - - if (refill != nullptr) - add_range(pointer_offset(refill, size), MIN_CHUNK_SIZE - size); - - return refill; - } - - public: - static constexpr bool Aligned = true; - static_assert(ParentRange::Aligned, "ParentRange must be aligned"); - - static constexpr bool ConcurrencySafe = false; - - constexpr Type() = default; - - CapPtr alloc_range(size_t size) - { - if (size >= MIN_CHUNK_SIZE) - return parent.alloc_range(size); - - auto result = buddy_small.remove_block(size); - if (result != nullptr) - { - result->left = nullptr; - result->right = nullptr; - return result.template as_reinterpret(); - } - return refill(size); - } - - CapPtr alloc_range_with_leftover(size_t size) - { - auto rsize = bits::next_pow2(size); - - auto result = alloc_range(rsize); - - if (result == nullptr) - return nullptr; - - auto remnant = pointer_offset(result, size); - - add_range(remnant, rsize - size); - - return result.template as_reinterpret(); - } - - void dealloc_range(CapPtr base, size_t size) - { - add_range(base, size); - } - }; - }; -} // namespace snmalloc diff --git a/src/snmalloc/backend_helpers/staticconditionalrange.h b/src/snmalloc/backend_helpers/staticconditionalrange.h index 682c2f1fb..f5d46441b 100644 --- a/src/snmalloc/backend_helpers/staticconditionalrange.h +++ b/src/snmalloc/backend_helpers/staticconditionalrange.h @@ -10,8 +10,8 @@ namespace snmalloc { // This 
is a range that can bypass the OptionalRange if it is disabled. // Disabling is global, and not local. - // This is used to allow disabling thread local buddy allocators when the - // initial fixed size heap is small. + // This is used to allow disabling the thread-local cache range when + // the initial fixed-size heap is small. // // The range builds a more complex parent // Pipe diff --git a/src/snmalloc/mitigations/allocconfig.h b/src/snmalloc/mitigations/allocconfig.h index 3f326a570..3626e613a 100644 --- a/src/snmalloc/mitigations/allocconfig.h +++ b/src/snmalloc/mitigations/allocconfig.h @@ -94,9 +94,10 @@ namespace snmalloc #endif ; - // Used to configure when the backend should use thread local buddies. - // This only basically is used to disable some buddy allocators on small - // fixed heap scenarios like OpenEnclave. - static constexpr size_t MIN_HEAP_SIZE_FOR_THREAD_LOCAL_BUDDY = + // Used to configure when the backend should use the thread-local + // range cache. Disabled below this heap size for small fixed-heap + // scenarios like OpenEnclave, where the per-thread cache would + // dominate the heap. + static constexpr size_t MIN_HEAP_SIZE_FOR_THREAD_LOCAL_CACHE = bits::one_at_bit(27); } // namespace snmalloc diff --git a/src/test/func/cheri/cheri.cc b/src/test/func/cheri/cheri.cc index 7e2318e11..424a2eae2 100644 --- a/src/test/func/cheri/cheri.cc +++ b/src/test/func/cheri/cheri.cc @@ -58,8 +58,8 @@ int main() } /* - * This large object is sized to end up in our alloc's local buddy allocators - * when it's released. + * This large object is sized to end up in our alloc's thread-local + * cache range when it's released. 
*/ message("Grab large object"); ptraddr_t alarge; From 3941d18f9fbefbbf1d5bb2ca40cfbeae1d7eaccd Mon Sep 17 00:00:00 2001 From: Matthew Parkinson Date: Fri, 12 Jun 2026 16:36:49 +0100 Subject: [PATCH 15/15] Add docs/Arena.md A docs/ companion to AddressSpace.md describing the Arena design: * The two-tree structure (one bin tree per non-empty bin + one range tree for coalescing) and how lookup/serve/insert/coalesce flow through it. * The mechanism for building positive serve masks (matching the in-tree code). * The two Rep variants Arena ships with: PagemapRep behind LargeArenaRange for whole-chunk allocations, InplaceRep behind SmallArenaRange for sub-chunk metadata. Also add PLAN.md to .gitignore so contributors can keep a local planning document without committing it. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .gitignore | 1 + docs/Arena.md | 152 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 153 insertions(+) create mode 100644 docs/Arena.md diff --git a/.gitignore b/.gitignore index 122a68c2f..93f844c22 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,4 @@ # rust target /target +PLAN.md diff --git a/docs/Arena.md b/docs/Arena.md new file mode 100644 index 000000000..287e788d3 --- /dev/null +++ b/docs/Arena.md @@ -0,0 +1,152 @@ +# The Arena: A Bitmap-Indexed Coalescing Range + +`Arena` is snmalloc's address-space range that stores free blocks at their +**natural** size — no power-of-two rounding — and serves any request from the +full snmalloc size-class sequence. It sits in the per-thread range pipeline +underneath the slab caches and replaces the historical buddy-based ranges. + +This document is the conceptual introduction. For where `Arena` plugs into +the wider range chain, see [`AddressSpace.md`](AddressSpace.md). + +## The problem + +A buddy allocator only stores power-of-two blocks. A request for 5 chunks +must be served from an 8-chunk buddy block, wasting 3 chunks. 
We wanted a +range that + +* stores blocks at their actual size, +* uses snmalloc's full `(exponent, mantissa)` size-class sequence at the + range level, and +* still answers "find a block that can serve this request" in O(1). + +## The core idea: search upward, mask out exceptions + +Free blocks are binned by the *set of size classes they can serve* — the +**servable set**. To allocate, you walk a per-arena non-empty-bins bitmap +upward through the bins; any larger block can be carved down. This almost +works perfectly. The exception is alignment: some bins hold blocks whose +address alignment is too poor to serve certain smaller, *more* aligned size +classes. Those bins must be excluded from the search for those requests. + +The implementation builds the per-request filter *positively* as a **serve +mask** — bit `k` set means bin `k` can serve this request — and the lookup +is `find_first_set(bitmap & serve_mask, start_word)`. The serve mask +depends only on the requested size class, not on the block, so it is +precomputed at compile time. + +(The original sketch of this design used the equivalent inverse framing of +a "skip mask" with `bitmap & ~skip_mask`; see `arenabins.h` for the +in-tree explanation of why positive is preferred.) + +## Why the exceptions exist + +snmalloc's size classes follow `S = 2^e + m · 2^(e−B)`, where `B` is the +mantissa-bit width (`INTERMEDIATE_BITS`, 2 in production). Each size class +has a natural alignment `align(S) = S & -S`. + +A size class with high alignment needs padding to reach an aligned address +within a block. A block of a *larger* size class with *lower* alignment may +not have room for that padding. Concretely: a block of size 5 at address 1 +can serve size 5 (alignment 1) but cannot serve size 4 (alignment 4) — +there is not enough space after padding to the first 4-aligned address. + +Same size block, different address, different servable set. This is why +distinct bins per servable-set are needed. 
+ +## Bin count grows slowly in B + +At each exponent, the distinct servable sets are enumerated exhaustively: + +| B | Mantissas/exponent | Bins/exponent | Max mask bits | +|---|-------------------:|--------------:|--------------:| +| 1 | 2 | 2 | 0 | +| 2 | 4 | 5 | 1 | +| 3 | 8 | 13 | 4 | +| 4 | 16 | 34 | 11 | + +Most requests need no exceptions at all. Only size classes whose alignment +exceeds the expected alignment for their position in the sequence have any +bits to mask. The whole structure is constant-folded into a few small tables. + +## The two-tree structure + +A bitmap alone is not enough — when a bin is non-empty, the arena still has +to *retrieve* and *coalesce* blocks. Each `Arena` therefore maintains: + +* **One red-black tree per non-empty bin** (the "bin trees"), keyed by + block address, giving O(log n) selection within a bin. The non-empty-bins + bitmap is the index over these trees. + +* **One red-black tree of all free blocks** (the "range tree"), keyed by + address, used to find a block's left/right neighbours for coalescing on + free. + +On allocation: bitmap lookup → choose the bin → pop a block from its +bin tree → `carve` returns pre-pad / aligned request / post-pad → pre and +post (if any) re-enter the arena via the bin and range trees. + +On free: range tree lookup → coalesce with neighbours if their tags allow +→ insert the resulting (possibly merged) block. + +## Two variants over the same Arena + +`Arena` is parameterised by a **Rep** (representation) that decides where +the per-block tree-node state lives. Two reps ship today: + +* **`PagemapRep`** — node state lives in the pagemap entry that already + covers the block. Used by **`LargeArenaRange`**, which manages whole + chunks and larger. Node access is a pagemap lookup; no in-band space is + consumed. + +* **`InplaceRep`** — node state lives *in the free block itself*, in the + first units. 
Used by **`SmallArenaRange`**, which manages sub-chunk + metadata fragments where no pagemap entry exists for the fragment. The + layout packs the bin tree pointers, the range tree pointers, and (for + blocks ≥ 3 units) a large-size word into the leading units of the free + block. Unit size is `next_pow2(2 · sizeof(CapPtr))` — 16 B without + CHERI, 32 B with pure-capability CHERI/Morello — large enough to hold + the two pointers a free block must store. + +Both reps drive the same bin / range tree logic in `arena.h`; the bin +classifier and bitmap in `arenabins.h` are shared. + +## Why this matters for metadata + +Slab metadata typically wants a pow2 client structure (e.g. a 128 B +bitmap) plus a fixed ~32 B header. A buddy-based small range rounds +`160 B → 256 B` (96 B wasted per slab). `SmallArenaRange` rounds to a unit +multiple (`MIN_META_ALIGN`), so the same allocation costs ~160 B. Across +many slabs and large heaps this is real memory. + +## Concrete example (B = 2, in-production) + +At exponent `e = 2` the size classes are 4, 5, 6, 7, and there are 5 bins, +each labeled by the set of sizes it can serve at this exponent: + + Bin 0: serves {4} + Bin 1: serves {5} + Bin 2: serves {4, 5} + Bin 3: serves {4, 5, 6} + Bin 4: serves {4, 5, 6, 7} + +The per-request serve masks (within this exponent — higher exponents +always serve, so their bits are set): + + Request for 7: serve bins {4} + Request for 6: serve bins {3, 4} + Request for 5: serve bins {1, 2, 3, 4} + Request for 4: serve bins {0, 2, 3, 4} — bin 1 holds only {5} blocks + +Only the size-4 request has an exception: bin 1 must not be picked. All +other requests get the simple "everything at or above" mask. + +## Where to look in the code + +* `src/snmalloc/backend_helpers/arenabins.h` — bin classification, serve + masks, the non-empty-bins bitmap, the `carve` primitive. +* `src/snmalloc/backend_helpers/arena.h` — bin-tree-per-bin + range-tree + structure, allocation and free / coalesce paths. 
+* `src/snmalloc/backend_helpers/largearenarange.h` — `Arena<PagemapRep>` + for whole-chunk allocations. +* `src/snmalloc/backend_helpers/smallarenarange.h`, + `inplacerep.h` — `Arena<InplaceRep>` for sub-chunk metadata.