diff --git a/docs/AGGRESSIVE_MACRO_ELIMINATION_PLAN.md b/docs/AGGRESSIVE_MACRO_ELIMINATION_PLAN.md deleted file mode 100644 index b3b8b554..00000000 --- a/docs/AGGRESSIVE_MACRO_ELIMINATION_PLAN.md +++ /dev/null @@ -1,652 +0,0 @@ -# ⚠️ PARTIALLY COMPLETE - Aggressive Macro Elimination Plan - -**Status**: ⚠️ **ONGOING** - lvm.cpp macros mostly done, ~75 remain in other files -**Last Updated**: November 2025 -**Remaining**: lopcodes.h, llimits.h, lctype.h macros - ---- - -# AGGRESSIVE MACRO ELIMINATION PLAN - lvm.cpp - -**Date:** 2025-11-17 -**Goal:** Convert ALL 36 remaining macros to modern C++ -**Timeline:** 8-12 hours total -**Risk Level:** MEDIUM to HIGH (performance-critical code) - ---- - -## Executive Summary - -After converting 11 VM operation macros to lambdas (with 2.1% performance GAIN), we're targeting the remaining 36 macros for elimination. This aggressive plan converts everything except true compile-time configuration. - -**Target:** Convert 33/36 macros (92% conversion rate) -**Keep:** 3 configuration macros only - ---- - -## Phase-by-Phase Plan - -### 🟢 Phase 2.1: CLEANUP - Remove Dead Code (HIGH PRIORITY) - -**Target:** 11 macros (lines 991-1127) -**Effort:** 15 minutes -**Risk:** ZERO (dead code removal) -**Performance Impact:** None (code not used) - -**Action:** Delete original VM operation macro definitions - -```cpp -// DELETE these (superseded by lambdas): -#define op_arithI(L,iop,fop) { ... } // Line 991-1003 -#define op_arithf_aux(L,v1,v2,fop) { ... } // Line 1010-1015 -#define op_arithf(L,fop) { ... } // Line 1021-1024 -#define op_arithfK(L,fop) { ... } // Line 1030-1033 -#define op_arith_aux(L,v1,v2,iop,fop) { ... } // Line 1039-1045 -#define op_arith(L,iop,fop) { ... } // Line 1051-1054 -#define op_arithK(L,iop,fop) { ... } // Line 1060-1063 -#define op_bitwiseK(L,op) { ... } // Line 1069-1074 -#define op_bitwise(L,op) { ... } // Line 1083-1090 -#define op_order(L,op,other) { ... } // Line 1097-1105 -#define op_orderI(L,opi,opf,inv,tm) { ... } // Line 1112-1127 -``` - -**Reason:** These are #undef'd inside luaV_execute and replaced by lambdas. They serve no purpose. - -**Benchmark:** Not required (no code changes, just deletion) - ---- - -### 🟢 Phase 2.2: Math Constants → Constexpr (LOW RISK) - -**Target:** 4 macros -**Effort:** 30 minutes -**Risk:** LOW (compile-time constants) -**Performance Impact:** None expected - -**Conversions:** - -```cpp -// BEFORE (Line 69): -#define NBM l_floatatt(MANT_DIG) - -// AFTER: -inline constexpr int NBM = l_floatatt(MANT_DIG); - -// BEFORE (Line 82): -#define MAXINTFITSF ((lua_Unsigned)1 << NBM) - -// AFTER: -inline constexpr lua_Unsigned MAXINTFITSF = (static_cast(1) << NBM); - -// BEFORE (Line 85 or 89): -#define l_intfitsf(i) ((MAXINTFITSF + l_castS2U(i)) <= (2 * MAXINTFITSF)) -// or: -#define l_intfitsf(i) 1 - -// AFTER: -inline constexpr bool l_intfitsf(lua_Integer i) noexcept { -#if !defined(LUA_FLOORN2I) - return ((MAXINTFITSF + l_castS2U(i)) <= (2 * MAXINTFITSF)); -#else - (void)i; - return true; -#endif -} - -// BEFORE (Line 832): -#define NBITS l_numbits(lua_Integer) - -// AFTER: -inline constexpr int NBITS = l_numbits(lua_Integer); -``` - -**Benchmark:** Quick 3-run test (expect identical performance) - -**Dependencies:** None (standalone constants) - ---- - -### 🟡 Phase 2.3: String Conversion → Inline Function (MEDIUM RISK) - -**Target:** 1 macro (line 680) -**Effort:** 1 hour -**Risk:** MEDIUM (used in string operations) -**Performance Impact:** Minimal expected - -**Current macro:** -```cpp -#define tostring(L,o) \ - (ttisstring(o) || (cvt2str(o) && (luaO_tostring(L, o), 1))) -``` - -**Conversion strategy:** - -```cpp -// Option A: Inline function with short-circuit evaluation -inline bool tostring(lua_State* L, TValue* o) { - if (ttisstring(o)) return true; - if (!cvt2str(o)) return false; - luaO_tostring(L, o); - return true; -} - -// Option B: Keep comma operator for exact semantics -inline bool tostring(lua_State* L, TValue* o) { - return ttisstring(o) || (cvt2str(o) && (luaO_tostring(L, o), true)); -} -``` - -**Analysis needed:** Check all call sites to ensure proper usage - -**Benchmark:** 5-run test (string-heavy workload) - ---- - -### 🟡 Phase 2.4: Register Access → Inline Functions (HIGHER RISK) - -**Target:** 9 macros (lines 1185-1193) -**Effort:** 2-3 hours -**Risk:** MEDIUM-HIGH (ultra-hot path, billions of executions) -**Performance Impact:** Critical to verify - -**Current macros:** -```cpp -#define RA(i) (base+InstructionView(i).a()) -#define vRA(i) s2v(RA(i)) -#define RB(i) (base+InstructionView(i).b()) -#define vRB(i) s2v(RB(i)) -#define KB(i) (k+InstructionView(i).b()) -#define RC(i) (base+InstructionView(i).c()) -#define vRC(i) s2v(RC(i)) -#define KC(i) (k+InstructionView(i).c()) -#define RKC(i) ((InstructionView(i).testk()) ? k + InstructionView(i).c() : s2v(base + InstructionView(i).c())) -``` - -**Conversion strategy - Option A: Lambda Capture (RECOMMENDED)** - -These need access to `base` and `k` from luaV_execute scope. Convert to lambdas like we did with operations: - -```cpp -void luaV_execute(lua_State *L, CallInfo *ci) { - // ... existing setup ... - - // Register access lambdas (after base and k are initialized) - auto RA = [&](Instruction i) -> StkId { - return base + InstructionView(i).a(); - }; - auto vRA = [&](Instruction i) -> TValue* { - return s2v(RA(i)); - }; - auto RB = [&](Instruction i) -> StkId { - return base + InstructionView(i).b(); - }; - auto vRB = [&](Instruction i) -> TValue* { - return s2v(RB(i)); - }; - auto KB = [&](Instruction i) -> const TValue* { - return k + InstructionView(i).b(); - }; - auto RC = [&](Instruction i) -> StkId { - return base + InstructionView(i).c(); - }; - auto vRC = [&](Instruction i) -> TValue* { - return s2v(RC(i)); - }; - auto KC = [&](Instruction i) -> const TValue* { - return k + InstructionView(i).c(); - }; - auto RKC = [&](Instruction i) -> const TValue* { - return InstructionView(i).testk() - ? (k + InstructionView(i).c()) - : s2v(base + InstructionView(i).c()); - }; - - // ... existing operation lambdas ... - // ... main loop ... -} -``` - -**Why lambdas work:** -1. ✅ Same performance as macros (proven with op_arith* lambdas) -2. ✅ Type safety (return types explicit) -3. ✅ Automatic capture of base and k -4. ✅ Perfect inlining at -O3 -5. ✅ No need to modify call sites (same syntax) - -**Alternative - Option B: Pass base/k explicitly (NOT RECOMMENDED)** - -Would require passing base and k to every lambda and modifying all call sites. Too invasive. - -**Benchmark:** CRITICAL - 10-run side-by-side test -- Register access is executed billions of times -- Must verify zero performance regression -- If any regression > 1%, revert immediately - ---- - -### 🟠 Phase 2.5: VM State Management → Inline Functions (MEDIUM RISK) - -**Target:** 5 macros (lines 1197-1244) -**Effort:** 1.5 hours -**Risk:** MEDIUM (frequently used in error paths) -**Performance Impact:** Moderate concern - -**Current macros:** -```cpp -#define updatetrap(ci) (trap = ci->getTrap()) -#define updatebase(ci) (base = ci->funcRef().p + 1) -#define updatestack(ci) { if (l_unlikely(trap)) { updatebase(ci); ra = RA(i); } } -#define savepc(ci) ci->setSavedPC(pc) -#define savestate(L,ci) (savepc(ci), L->getTop().p = ci->topRef().p) -``` - -**Conversion strategy:** - -These access outer scope variables (trap, base, pc, ra) - use lambdas: - -```cpp -void luaV_execute(lua_State *L, CallInfo *ci) { - // ... existing setup ... - - auto updatetrap = [&]() { trap = ci->getTrap(); }; - auto updatebase = [&]() { base = ci->funcRef().p + 1; }; - auto updatestack = [&]() { - if (l_unlikely(trap)) { - updatebase(); - ra = RA(i); - } - }; - auto savepc = [&]() { ci->setSavedPC(pc); }; - auto savestate = [&]() { - savepc(); - L->getTop().p = ci->topRef().p; - }; - - // ... main loop ... -} -``` - -**Note:** These lambdas capture `trap`, `base`, `pc`, `ra`, `ci`, `L`, `i` by reference - -**Benchmark:** 5-run test (focus on error handling paths) - ---- - -### 🔴 Phase 2.6: Control Flow → Lambdas (HIGHER RISK) - -**Target:** 3 macros (lines 1210-1221) -**Effort:** 1 hour -**Risk:** MEDIUM-HIGH (used in every branch/jump) -**Performance Impact:** Must verify carefully - -**Current macros:** -```cpp -#define dojump(ci,i,e) { pc += InstructionView(i).sj() + e; updatetrap(ci); } -#define donextjump(ci) { Instruction ni = *pc; dojump(ci, ni, 1); } -#define docondjump() if (cond != InstructionView(i).k()) pc++; else donextjump(ci); -``` - -**Conversion strategy:** - -```cpp -void luaV_execute(lua_State *L, CallInfo *ci) { - // ... existing setup ... - - auto dojump = [&](Instruction instr, int e) { - pc += InstructionView(instr).sj() + e; - updatetrap(); - }; - - auto donextjump = [&]() { - Instruction ni = *pc; - dojump(ni, 1); - }; - - auto docondjump = [&](int cond) { - if (cond != InstructionView(i).k()) - pc++; - else - donextjump(); - }; - - // ... main loop ... -} -``` - -**Call site changes needed:** -```cpp -// OLD: -docondjump(); // Uses implicit 'cond' from outer scope - -// NEW: -docondjump(cond); // Pass cond explicitly -``` - -**Note:** `docondjump` is used inside `op_order` and `op_orderI` lambdas! Must update those too. - -**Benchmark:** 10-run side-by-side test (branching is critical path) - ---- - -### 🔴 Phase 2.7: Exception/Error Handling → Lambdas (CRITICAL) - -**Target:** 4 macros (lines 1263-1300) -**Effort:** 2 hours -**Risk:** HIGH (exception safety critical) -**Performance Impact:** Error paths - moderate concern - -**Current macros:** -```cpp -#define Protect(exp) (savestate(L,ci), (exp), updatetrap(ci)) -#define ProtectNT(exp) (savepc(ci), (exp), updatetrap(ci)) -#define halfProtect(exp) (savestate(L,ci), (exp)) -#define checkGC(L,c) { luaC_condGC(L, (savepc(ci), L->getTop().p = (c)), updatetrap(ci)); luai_threadyield(L); } -``` - -**Challenge:** These are expression-like macros that wrap arbitrary code - -**Conversion strategy - Template Lambdas:** - -```cpp -void luaV_execute(lua_State *L, CallInfo *ci) { - // ... existing setup ... - - // Protect: Save state, execute, update trap - auto Protect = [&](auto&& expr) { - savestate(); - expr(); - updatetrap(); - }; - - // ProtectNT: Save PC only, execute, update trap - auto ProtectNT = [&](auto&& expr) { - savepc(); - expr(); - updatetrap(); - }; - - // halfProtect: Save state, execute (no trap update) - auto halfProtect = [&](auto&& expr) { - savestate(); - expr(); - }; - - // checkGC: Conditional GC with state save - auto checkGC = [&](StkId limit) { - luaC_condGC(L, [&]() { - savepc(); - L->getTop().p = limit; - }, [&]() { - updatetrap(); - }); - luai_threadyield(L); - }; -} -``` - -**Call site changes:** -```cpp -// OLD: -Protect(cond = other(L, ra, rb)); - -// NEW: -Protect([&]() { cond = other(L, ra, rb); }); - -// OR (if expression result needed): -cond = Protect([&]() { return other(L, ra, rb); }); -``` - -**Major refactoring:** All Protect/halfProtect call sites must be updated (40+ locations) - -**Benchmark:** EXTENSIVE - 10-run test (error handling critical) - ---- - -### 🔴 Phase 2.8: VM Dispatch → Keep or Replace (HIGHEST RISK) - -**Target:** 4 macros (lines 1282-1336) -**Effort:** 3-4 hours (if attempted) -**Risk:** VERY HIGH (core VM dispatch) -**Performance Impact:** CRITICAL - -**Current macros:** -```cpp -#define luai_threadyield(L) {lua_unlock(L); lua_lock(L);} -#define vmfetch() { if (l_unlikely(trap)) { trap = luaG_traceexec(L, pc); updatebase(ci); } i = *(pc++); } -#define vmdispatch(o) switch(o) -#define vmcase(l) case l: -#define vmbreak break -``` - -**Analysis:** - -**vmfetch:** -```cpp -// Could become lambda: -auto vmfetch = [&]() { - if (l_unlikely(trap)) { - trap = luaG_traceexec(L, pc); - updatebase(); - } - i = *(pc++); -}; -``` - -**vmdispatch/vmcase/vmbreak:** - -These define the dispatch mechanism. Options: - -**Option A: Keep as macros (RECOMMENDED)** -- These are fundamental VM structure -- No benefit from conversion -- Risk too high for minimal gain - -**Option B: Remove macros, use direct syntax** -```cpp -// OLD: -vmdispatch (InstructionView(i).opcode()) { - vmcase(OP_MOVE) { - // ... - vmbreak; - } -} - -// NEW: -switch (InstructionView(i).opcode()) { - case OP_MOVE: { - // ... - break; - } -} -``` - -This is trivial but changes 200+ lines of code for no real benefit. - -**luai_threadyield:** - -```cpp -// Could become: -inline void luai_threadyield(lua_State* L) { - lua_unlock(L); - lua_lock(L); -} -``` - -**Recommendation:** -- Convert `luai_threadyield` → inline function (5 min, ZERO risk) -- Convert `vmfetch` → lambda (30 min, LOW risk) -- **KEEP** vmdispatch/vmcase/vmbreak (no benefit, high effort) - -**Benchmark:** If vmfetch converted, 10-run test - ---- - -## Summary Table - -| Phase | Target | Macros | Effort | Risk | Benchmark | -|-------|--------|--------|--------|------|-----------| -| 2.1 | Dead code removal | 11 | 15 min | ZERO | No | -| 2.2 | Math constants | 4 | 30 min | LOW | Quick | -| 2.3 | String conversion | 1 | 1 hr | MED | 5-run | -| 2.4 | Register access | 9 | 2-3 hr | MED-HIGH | 10-run | -| 2.5 | State management | 5 | 1.5 hr | MED | 5-run | -| 2.6 | Control flow | 3 | 1 hr | MED-HIGH | 10-run | -| 2.7 | Exception handling | 4 | 2 hr | HIGH | 10-run | -| 2.8 | VM dispatch | 2 | 30 min | MED | 10-run | -| **TOTAL** | **Convertible** | **33** | **8-12 hr** | **MIXED** | **Required** | - -**Keep as-is:** -- Configuration macros (3): `lvm_c`, `LUA_CORE`, `LUA_USE_JUMPTABLE` -- Dispatch macros (2): `vmdispatch`, `vmcase`, `vmbreak` (optional - could remove but low value) - ---- - -## Execution Strategy - -### Aggressive Approach (RECOMMENDED) - -Execute phases in order, with mandatory benchmarking between phases: - -1. ✅ **Phase 2.1** - Immediate (dead code cleanup) -2. ✅ **Phase 2.2** - Low risk (math constants) -3. ⚠️ **Phase 2.3** - Medium risk (string conversion) -4. 🛑 **CHECKPOINT** - Benchmark, verify zero regression -5. ⚠️ **Phase 2.4** - HIGH RISK (register access) -6. 🛑 **CRITICAL CHECKPOINT** - Extensive benchmark -7. ⚠️ **Phase 2.5** - Medium risk (state management) -8. ⚠️ **Phase 2.6** - Medium risk (control flow) -9. 🛑 **CHECKPOINT** - Benchmark -10. ⚠️ **Phase 2.7** - HIGH RISK (exception handling) -11. 🛑 **FINAL CHECKPOINT** - Comprehensive benchmark -12. ⚠️ **Phase 2.8** - Optional (dispatch macros) - -### Rollback Strategy - -**At each checkpoint:** -1. Run side-by-side benchmark (macro vs lambda) -2. If regression > 1%, **IMMEDIATELY REVERT** -3. Document why conversion failed -4. Keep previous phase changes - -### Success Criteria - -**Must achieve:** -- ✅ Zero compiler warnings -- ✅ All tests pass -- ✅ Performance within 1% of macro version -- ✅ No increase in variance - -**Nice to have:** -- ✅ Performance improvement (like we got with op_arith* lambdas) -- ✅ Lower variance -- ✅ Cleaner code - ---- - -## Expected Outcomes - -### Best Case (60% probability) - -- ✅ All 33 macros converted -- ✅ Performance neutral or slightly better -- ✅ Dramatically improved code quality -- ✅ Complete modernization of lvm.cpp - -### Likely Case (30% probability) - -- ⚠️ 25-30 macros converted (76-91%) -- ⚠️ Register access or exception handling causes issues -- ⚠️ Some macros kept for performance -- ✅ Still significant improvement - -### Worst Case (10% probability) - -- ❌ Critical performance regression in Phase 2.4 or 2.7 -- ❌ Only 15-20 macros converted (45-61%) -- ❌ Must keep hot-path macros -- ⚠️ Still better than original macro version - ---- - -## Risk Mitigation - -### High-Risk Phases (2.4, 2.7) - -1. **Prototype first** - Test with minimal changes -2. **Incremental conversion** - Convert 1-2 macros at a time -3. **Extensive benchmarking** - 20+ runs if needed -4. **Assembly analysis** - Check generated code -5. **Profiling** - Use perf to identify hot spots - -### Performance Validation - -Each phase must pass: -- ✅ Side-by-side benchmark (interleaved execution) -- ✅ Statistical significance (10+ runs) -- ✅ Variance check (similar or lower) -- ✅ Test suite pass (all tests) - -### Code Review Points - -Before each phase: -- ✅ Verify capture semantics correct -- ✅ Check for unintended copies -- ✅ Confirm noexcept where appropriate -- ✅ Review generated assembly (for critical paths) - ---- - -## Tools and Techniques - -### Benchmarking Script - -Reuse `testes/bench_compare.sh` for side-by-side comparisons - -### Assembly Analysis - -```bash -# Generate assembly for hot functions -g++ -S -O3 -std=c++23 -o lvm_macro.s src/vm/lvm.cpp -# Compare with lambda version -diff -u lvm_macro.s lvm_lambda.s -``` - -### Profiling - -```bash -# Profile hot paths -perf record -g ./lua all.lua -perf report -``` - -### Static Analysis - -```bash -# Check for unintended copies -clang-tidy --checks='performance-*' src/vm/lvm.cpp -``` - ---- - -## Conclusion - -This aggressive plan targets **33/36 macros (92%)** for conversion, leaving only true configuration macros. - -**Key insights from op_arith* lambda conversion:** -1. ✅ Modern compilers optimize lambdas excellently -2. ✅ Type safety helps (not hurts) performance -3. ✅ Side-by-side benchmarking is CRITICAL -4. ✅ Zero-cost abstractions are REAL - -**Timeline:** 8-12 hours with proper benchmarking -**Expected result:** 25-33 macros converted (76-100%) -**Performance target:** Within ±1% of macro version - -**This would make lvm.cpp one of the most modern VM implementations in any language runtime!** - ---- - -**Plan created by:** Claude (AI Assistant) -**Date:** 2025-11-17 -**Branch:** claude/analyze-lv-018LEz1SVgM57AT2HW11UTsi -**Status:** Ready for execution diff --git a/docs/CONSTRUCTOR_PLAN.md b/docs/CONSTRUCTOR_PLAN.md deleted file mode 100644 index 6eb72441..00000000 --- a/docs/CONSTRUCTOR_PLAN.md +++ /dev/null @@ -1,200 +0,0 @@ -# ✅ HISTORICAL - Constructor Pattern Plan (COMPLETED) - -**Status**: ✅ **COMPLETE** - All GC objects use constructor pattern -**Completion Date**: November 2025 -**Result**: Constructor pattern with placement new operators fully implemented - ---- - -# Constructor Pattern Plan - GC Object Allocation - -## Status: ✅ COMPLETED - -All GC object types now use constructor pattern with placement new operators. Full test suite passes. - -## Overview -Convert `luaF_*` and `luaS_*` allocation functions to constructor pattern with placement new operators for type-safe GC allocation. - -## Pattern - -```cpp -class CClosure : public GCBase { -public: - // Member placement new operator for GC allocation - static void* operator new(size_t size, lua_State* L, lu_byte tt) { - return luaC_newobj(L, tt, size); - } - - // For variable-size allocation (optional) - static void* operator new(size_t size, lua_State* L, lu_byte tt, size_t extra) { - return luaC_newobj(L, tt, size + extra); - } - - // Constructor - CClosure(int nupvals); - - // Factory method - static CClosure* create(lua_State* L, int nupvals) { - size_t extra = (nupvals - 1) * sizeof(TValue); - CClosure* c = new (L, LUA_VCCL, extra) CClosure(nupvals); - return c; - } -}; - -// Old function becomes wrapper for compatibility -CClosure* luaF_newCclosure(lua_State* L, int nupvals) { - return CClosure::create(L, nupvals); -} -``` - -## Classes to Convert - -### Phase 34a: CClosure -```cpp -CClosure::CClosure(int nupvals) { - this->nupvalues = cast_byte(nupvals); - this->gclist = NULL; - this->f = NULL; -} -``` - -### Phase 34b: LClosure -```cpp -LClosure::LClosure(int nupvals) { - this->nupvalues = cast_byte(nupvals); - this->p = NULL; - this->gclist = NULL; -} -``` - -### Phase 34c: Proto -```cpp -Proto::Proto() { - // Initialize all 14+ fields to NULL/0 -} - -Proto* Proto::create(lua_State* L) { - return new (L, LUA_VPROTO) Proto(); -} -``` - -### Phase 34d: Udata -```cpp -Udata::Udata(size_t len, unsigned short nuvalue) { - this->len = len; - this->nuvalue = nuvalue; - this->metatable = NULL; -} - -Udata* Udata::create(lua_State* L, size_t s, unsigned short nuvalue) { - size_t extra = s + nuvalue * sizeof(TValue); - return new (L, LUA_VUSERDATA, extra) Udata(s, nuvalue); -} -``` - -### Phase 34e: TString -```cpp -// Short string -TString::TString(unsigned int hash, ls_byte shrlen) { - this->hash = hash; - this->shrlen = shrlen; - this->extra = 0; -} - -// Long string -TString::TString(unsigned int hash, size_t lnglen, ls_byte kind) { - this->hash = hash; - this->shrlen = -1; - this->u.lnglen = lnglen; - this->extra = kind; -} -``` - -### Phase 34f: UpVal -```cpp -UpVal::UpVal(TValue* val) { - this->v = val; -} - -UpVal* UpVal::create(lua_State* L, StkId level) { - UpVal* uv = new (L, LUA_VUPVAL) UpVal(level); - uv->v = level; // Open upvalue points to stack - return uv; -} -``` - -## Benefits -- ✅ Type-safe GC allocation via `new (L, type) Class(args)` -- ✅ lua_State dependency explicit in operator signature -- ✅ RAII - Constructors guarantee initialization -- ✅ Encapsulation - GC allocation logic in class -- ✅ Zero runtime cost - Placement new has no overhead - -## Testing -- ✅ Build after each phase -- ✅ Run full test suite (all.lua) - **PASSED** -- ✅ Benchmark (must stay ≤2.21s) - **2.12s** -- Commit each phase separately - -## Implementation Notes - Variable-Size Objects - -### Critical Issue: Field Initialization for Variable-Size Types - -Both TString and Udata use variable-size allocation where `allocated_size < sizeof(Class)` for certain variants. This creates a critical constraint: **you cannot initialize fields that don't exist in the allocated memory**. - -### TString Variants and Memory Layout - -```cpp -// TString has 3 variants with different memory layouts: - -1. Short strings (LUA_VSHRSTR): - Size = contentsOffset() + length + 1 - Fields: GCObject, extra, shrlen, hash, u - String data inline at offset 32 (where contents field would be) - -2. LSTRFIX (fixed external): - Size = 40 bytes (fallocOffset()) - Fields: GCObject, extra, shrlen, hash, u, contents - NO falloc or ud fields! - -3. LSTRMEM (managed external): - Size = sizeof(TString) = 56 bytes - Fields: GCObject, extra, shrlen, hash, u, contents, falloc, ud - All fields exist -``` - -**Bug Fix**: Original code initialized falloc and ud for ALL long strings, but LSTRFIX only allocates 40 bytes. Writing to falloc (offset 40) wrote to guard bytes! Solution: Only initialize contents for long strings; falloc/ud are initialized by caller when needed. - -### Udata Variants and Memory Layout - -```cpp -// Udata has 2 variants: - -1. Udata0 (nuvalue == 0): - Size = offsetof(Udata0, bindata) + data_length - Fields: GCObject, nuvalue, len, metatable, bindata - NO gclist field! - -2. Udata (nuvalue > 0): - Size = uvOffset() + (sizeof(UValue) * nuvalue) + data_length - Fields: GCObject, nuvalue, len, metatable, gclist, uv[] - gclist field exists at offset 32 -``` - -**Bug Fix**: Original code initialized gclist for ALL Udata, but Udata0 doesn't have this field. For Udata0 with len=0, size=32, writing to gclist (offset 32) wrote to guard bytes! Solution: Only initialize gclist when `nuvalue > 0`. - -### General Rule for Variable-Size Objects - -**Do not use constructors** for variable-size objects. Instead: -1. Call `luaC_newobj` directly with exact size -2. Manually initialize only the fields that exist in allocated memory -3. Use conditional initialization based on variant type -4. Let callers initialize optional fields when needed - -### Memory Corruption Debugging Tips - -1. Test allocator guard bytes are invaluable for detecting buffer overruns -2. Variable-size bugs manifest as writing to addresses beyond allocation -3. Pattern: Last N bytes of allocation + first M guard bytes zeroed -4. Check struct layouts with offsetof() to understand field positions -5. For 8-byte pointer fields, corruption often shows as 8 consecutive zeros diff --git a/docs/CONSTRUCTOR_REFACTOR_PLAN.md b/docs/CONSTRUCTOR_REFACTOR_PLAN.md deleted file mode 100644 index 3a12b144..00000000 --- a/docs/CONSTRUCTOR_REFACTOR_PLAN.md +++ /dev/null @@ -1,944 +0,0 @@ -# Constructor Refactoring Plan - Lua C++ Project - -**Created**: 2025-11-15 -**Status**: Planning Phase -**Goal**: Move object initialization code from factory functions into proper C++ constructors -**Performance Target**: ≤2.21s (≤1% regression from 2.17s baseline) - ---- - -## Executive Summary - -This plan addresses the inconsistent object initialization patterns across the codebase. Currently, the 19 main classes use a mix of patterns ranging from comprehensive constructors (Proto) to dangerous manual initialization (CallInfo with incomplete field initialization). This refactoring will: - -1. ✅ **Improve safety** - Eliminate uninitialized fields (CallInfo bug fix) -2. ✅ **Enhance maintainability** - Centralize initialization logic in constructors -3. ✅ **Standardize patterns** - Consistent factory methods across all classes -4. ✅ **Maintain performance** - Zero-cost abstractions with inline constructors -5. ✅ **Preserve compatibility** - C API unchanged - ---- - -## Current State Assessment - -### Classes by Initialization Quality - -| Priority | Class | Lines of Init | Pattern | Risk Level | -|----------|-------|---------------|---------|------------| -| 🔴 **P0** | CallInfo | ~4 | Manual (INCOMPLETE!) | **CRITICAL** | -| 🔴 **P0** | lua_State | ~50+ | Manual multi-phase | **HIGH** | -| 🔴 **P0** | global_State | ~50+ | Manual multi-phase | **HIGH** | -| 🟡 **P1** | Udata | ~8 | Has constructor, NOT used | MEDIUM | -| 🟡 **P1** | TString | ~10-15 | Manual (variable-size) | MEDIUM | -| 🟡 **P1** | Table | ~5 | Constructor + manual setup | MEDIUM | -| 🟡 **P1** | LClosure | ~8 | Constructor + `initUpvals()` | MEDIUM | -| 🟢 **P2** | stringtable | ~3 | Manual setters | LOW | -| 🟢 **P2** | Upvaldesc | ~4 | Manual by parser | LOW | -| 🟢 **P2** | LocVar | ~3 | Manual by parser | LOW | -| 🟢 **P2** | AbsLineInfo | ~2 | Manual by parser | LOW | -| ✅ **OK** | Proto | 0 | ✅ **Comprehensive constructor** | None | -| ✅ **OK** | UpVal | 0 | ✅ Constructor (minimal) | None | -| ✅ **OK** | CClosure | 0 | ✅ Constructor + factory | None | - -### Critical Issues Identified - -1. **CallInfo Incomplete Initialization** 🔴 - - Only 4/9 fields initialized in `luaE_extendCI` - - Missing: `func`, `top`, `u` unions, `u2` union, `callstatus` - - **BUG RISK**: Undefined behavior potential - -2. **lua_State Manual Initialization** 🔴 - - 50+ lines of manual field setting in `preinit_thread()` - - Easy to miss fields during maintenance - - Error-prone for new contributors - -3. **global_State Manual Initialization** 🔴 - - 50+ lines in `lua_newstate()` - - Complex initialization spread across multiple functions - - Hard to verify completeness - -4. **Udata Constructor Not Used** 🟡 - - Has `Udata() noexcept` constructor - - Factory function `luaS_newudata` doesn't call it - - Wasted effort, confusing code - ---- - -## Goals and Constraints - -### Primary Goals - -1. **Safety First** - All fields initialized to safe defaults -2. **Single Point of Truth** - Initialization logic in ONE place (constructor) -3. **Consistency** - All classes follow same pattern -4. **Maintainability** - Easy to verify completeness - -### Constraints - -1. **Zero Performance Regression** - Target ≤2.21s (≤1% from 2.17s baseline) -2. **C API Compatibility** - Public API unchanged -3. **GC Integration** - Placement new operators must work -4. **Variable-Size Objects** - Handle special cases (TString, Closures, Udata) -5. **Incremental Changes** - Test after every phase - -### Non-Goals - -- ❌ Not adding RAII/destructors (GC handles memory) -- ❌ Not changing allocation strategy (placement new is good) -- ❌ Not modifying GC behavior - ---- - -## Design Patterns - -### Pattern A: Fixed-Size Class with Comprehensive Constructor - -**Use for**: Proto, CallInfo, UpVal, stringtable, Upvaldesc, LocVar, AbsLineInfo - -```cpp -class ClassName : public GCBase { -private: - Type field1; - Type field2; - // ... all fields - -public: - // Inline constructor - zero-cost with optimization - ClassName() noexcept { - field1 = safe_default; - field2 = safe_default; - // Initialize EVERY field - } - - // Static factory method (optional, for consistency) - static ClassName* create(lua_State* L) { - return new (L, TYPE_TAG) ClassName(); - } -}; - -// C API wrapper -inline ClassName* luaX_newClassName(lua_State* L) { - return ClassName::create(L); -} -``` - -**Benefits**: -- ✅ All fields initialized in one place -- ✅ Inline constructor = zero-cost -- ✅ Easy to verify completeness -- ✅ Type-safe defaults - ---- - -### Pattern B: Variable-Size Class with Two-Phase Init - -**Use for**: CClosure, LClosure, TString, Udata - -```cpp -class VarSize : public GCBase { -private: - int count; // Fixed field - Type fixed_field; // Fixed field - Type array[1]; // Variable-size array (flexible array member) - -public: - // Constructor: Initialize ONLY fixed-size fields - explicit VarSize(int n) noexcept : count(n) { - fixed_field = default_value; - // DON'T touch array[] - may not be fully allocated yet! - } - - // Factory method: Handle variable-size allocation + array init - static VarSize* create(lua_State* L, int n) { - // Calculate extra space needed - size_t extra = (n > 1) ? (n - 1) * sizeof(Type) : 0; - - // Allocate with extra space, call constructor - VarSize* obj = new (L, TYPE_TAG, extra) VarSize(n); - - // Initialize variable array AFTER allocation - for (int i = 0; i < n; i++) { - obj->array[i] = default_value; - } - - return obj; - } -}; -``` - -**Benefits**: -- ✅ Constructor initializes what it can safely touch -- ✅ Factory handles variable-size complexities -- ✅ Clear separation of concerns - -**Critical Rule**: Constructor must NOT access memory beyond the base class size unless guaranteed to be allocated. - ---- - -### Pattern C: Complex Multi-Phase Initialization - -**Use for**: lua_State, global_State, Table - -```cpp -class Complex : public GCBase { -private: - // Many fields... - -public: - // Phase 1: Constructor sets safe defaults for ALL fields - Complex() noexcept { - // Initialize every field to safe default - // Even if it will be overwritten later - } - - // Phase 2: Post-construction setup (requires lua_State* or allocation) - void initialize(lua_State* L, params...) { - // Operations requiring allocation or lua_State - // Object is already in safe state from constructor - } - - // Factory method orchestrates both phases - static Complex* create(lua_State* L, params...) { - Complex* obj = new (L, TYPE_TAG) Complex(); // Safe defaults - obj->initialize(L, params); // Complete setup - return obj; - } -}; -``` - -**Benefits**: -- ✅ Object always in valid state after constructor -- ✅ Can separate allocation-free init from allocation-heavy init -- ✅ Safe even if initialization fails partway through - ---- - -## Phased Implementation Plan - -### Phase 1: Critical Safety Fixes (P0) 🔴 - -**Estimated Time**: 8-12 hours -**Risk**: Low (fixing bugs) -**Performance Impact**: None (likely slight improvement) - -#### 1.1 - Fix CallInfo Incomplete Initialization - -**Current Problem**: -```cpp -CallInfo *luaE_extendCI (lua_State *L) { - ci = luaM_new(L, CallInfo); // NO initialization! - ci->setPrevious(L->getCI()); - ci->setNext(NULL); - ci->getTrap() = 0; - // Missing: func, top, u unions, u2 union, callstatus ❌ - return ci; -} -``` - -**Solution**: -```cpp -// Add to CallInfo class (lstate.h) -class CallInfo { -public: - CallInfo() noexcept { - func.p = nullptr; - top.p = nullptr; - previous = nullptr; - next = nullptr; - - // Initialize u union as Lua function (safest default) - u.l.savedpc = nullptr; - u.l.trap = 0; - u.l.nextraargs = 0; - - // Initialize u2 union - u2.funcidx = 0; - - // Clear all status flags - callstatus = 0; - } -}; - -// Update factory (lstate.cpp) -CallInfo *luaE_extendCI (lua_State *L) { - CallInfo* ci = new (L) CallInfo(); // ✅ All fields initialized! - ci->setPrevious(L->getCI()); - L->getCI()->setNext(ci); - L->getNCIRef()++; - return ci; -} -``` - -**Testing**: -- Build and run full test suite -- Benchmark (expect no change or slight improvement) -- Verify with MSAN (memory sanitizer) - no uninitialized reads - -**Files to Modify**: -- `src/core/lstate.h` - Add CallInfo constructor -- `src/core/lstate.cpp` - Update `luaE_extendCI` to use `new` - ---- - -#### 1.2 - Add lua_State Constructor - -**Current Problem**: 50+ lines of manual initialization in `preinit_thread()` - -**Solution**: -```cpp -// Add to lua_State class (lstate.h) -class lua_State : public GCBase { -public: - lua_State() noexcept { - // Stack management - stack.p = nullptr; - stack_last.p = nullptr; - top.p = nullptr; - - // Call info - ci = nullptr; - nci = 0; - - // Error handling - status = LUA_OK; - errfunc = 0; - - // Hook management - oldpc = 0; - hookmask = 0; - basehookcount = 0; - hookcount = 0; - hook = nullptr; - - // GC - gclist = nullptr; - - // Upvalue tracking - twups = this; // Points to self initially - - // C call tracking - nCcalls = 0; - - // Misc - allowhook = 1; - nny = 0; - } -}; - -// Simplify preinit_thread (lstate.cpp) -static void preinit_thread (lua_State *L, global_State *g) { - // Constructor already initialized everything to safe defaults! - // Just set the global state link - G(L) = g; -} -``` - -**Alternative Approach** (if linking G is needed in constructor): -```cpp -lua_State(global_State* g) noexcept { - G(this) = g; // Set global state FIRST - // ... initialize all fields ... -} -``` - -**Testing**: -- Create new threads with `lua_newthread` -- Run coroutine tests (`coroutine.lua`) -- Benchmark -- Check with ASAN for any issues - -**Files to Modify**: -- `src/core/lstate.h` - Add lua_State constructor -- `src/core/lstate.cpp` - Simplify `preinit_thread`, update thread creation - ---- - -#### 1.3 - Add global_State Constructor - -**Current Problem**: 50+ lines of manual initialization in `lua_newstate()` - -**Solution**: -```cpp -// Add to global_State class (lstate.h) -class global_State { -public: - global_State() noexcept { - // Memory allocator subsystem - memoryAllocator.setFrealloc(nullptr); // Must be set by caller - memoryAllocator.setUd(nullptr); - memoryAllocator.setTotalBytes(sizeof(global_State)); - memoryAllocator.setGCDebt(0); - - // GC accounting - gcAccounting.setGCEstimate(0); - - // GC parameters - gcParams.setGCPause(LUAI_GCPAUSE); - gcParams.setGCStepMul(LUAI_GCMUL); - gcParams.setGCStepSize(LUAI_GCSTEPSIZE); - gcParams.setGCGenMinorMul(LUAI_GENMINORMUL); - gcParams.setGCMajorMul(LUAI_GCMAJORMUL); - - // GC object lists - gcObjectLists.setAllGC(nullptr); - gcObjectLists.setSweepGC(nullptr); - gcObjectLists.setFinObj(nullptr); - gcObjectLists.setGray(nullptr); - gcObjectLists.setGrayAgain(nullptr); - gcObjectLists.setWeak(nullptr); - gcObjectLists.setEphemeron(nullptr); - gcObjectLists.setAllWeak(nullptr); - gcObjectLists.setTobeFnz(nullptr); - gcObjectLists.setFixedGC(nullptr); - - // String cache - stringCache.setLastMajorMem(0); - - // Type system - typeSystem.setMetaTables({}); // Initialize all to nullptr - typeSystem.setTMCache({}); // Initialize all to 0 - - // Runtime services - runtimeServices.setMainThread(nullptr); - runtimeServices.setPanic(nullptr); - runtimeServices.setWarningFunction(nullptr); - runtimeServices.setWarningData(nullptr); - - // GC state - currentwhite = bitmask(WHITE0BIT); - gcstate = GCSpause; - gckind = KGC_INC; - gcrunning = 0; - gcemergency = 0; - gcstopem = 0; - - // String table - uses its own constructor - // stringtable already has constructor, will init itself - - // Seed (must be set by caller) - seed = 0; - - // Version - version = nullptr; - - // Main thread storage - // mainthread initialized by caller - } -}; - -// Simplify lua_newstate (lstate.cpp) -LUA_API lua_State *lua_newstate (lua_Alloc f, void *ud, unsigned seed) { - global_State *g = new (f, ud) global_State(); // Constructor does heavy lifting! - - // Set values that must come from parameters - g->getMemoryAllocator().setFrealloc(f); - g->getMemoryAllocator().setUd(ud); - g->setSeed(seed); - - L = &g->getMainThread()->l; - g->getRuntimeServices().setMainThread(L); - - // Rest of initialization... - if (L->rawRunProtected(f_luaopen, NULL) != LUA_OK) { - close_state(L); - L = NULL; - } - return L; -} -``` - -**Challenge**: global_State is NOT allocated via GC (uses plain allocator), so we need special placement new: - -```cpp -// Add to lgc.h or lstate.h -inline void* operator new(size_t size, lua_Alloc f, void* ud) { - return (*f)(ud, NULL, LUA_TTHREAD, size); -} -``` - -**Testing**: -- Create new states with `lua_newstate` -- Run all tests -- Benchmark -- Memory leak check - -**Files to Modify**: -- `src/core/lstate.h` - Add global_State constructor, placement new operator -- `src/core/lstate.cpp` - Simplify `lua_newstate` - ---- - -### Phase 2: Use Existing Constructors (P1) 🟡 - -**Estimated Time**: 4-6 hours -**Risk**: Very Low -**Performance Impact**: None - -#### 2.1 - Use Udata Constructor in Factory - -**Current Problem**: Has constructor, factory doesn't use it - -**Solution**: -```cpp -// Update luaS_newudata (lstring.cpp) -Udata *luaS_newudata (lua_State *L, size_t s, unsigned short nuvalue) { - size_t totalsize = sizeudata(nuvalue, s); - - // Use placement new with constructor ✅ - Udata* u = new (L, LUA_VUSERDATA, totalsize - sizeof(Udata)) Udata(); - - // Set values that differ from defaults - u->setNumUserValues(nuvalue); - u->setLen(s); - - // Initialize user values to nil - for (int i = 0; i < nuvalue; i++) - setnilvalue(&u->getUserValue(i)->uv); - - return u; -} -``` - -**Alternative**: Update Udata constructor to take parameters: -```cpp -Udata(size_t len, unsigned short nvalues) noexcept - : nuvalue(nvalues), len(len), metatable(nullptr), gclist(nullptr) { -} -``` - -**Testing**: -- Run userdata tests -- Benchmark -- Check with ASAN - -**Files to Modify**: -- `src/objects/lstring.cpp` - Update `luaS_newudata` -- Optional: `src/objects/lobject.h` - Update Udata constructor - ---- - -#### 2.2 - Improve Table Initialization - -**Current Problem**: Two-phase (constructor + setFlags + setnodevector) - -**Solution**: -```cpp -// Update Table constructor (lobject.h) -class Table : public GCBase { -public: - Table() noexcept { - flags = maskflags; // ✅ Move from factory into constructor - asize = 0; - array = nullptr; - node = nullptr; - metatable = nullptr; - gclist = nullptr; - lsizenode = 0; - - // Node dummy initialization - will be replaced by setnodevector - // but object is in safe state - } -}; - -// Simplify factory (ltable.cpp) -Table* Table::create(lua_State* L) { - Table *t = new (L, LUA_VTABLE) Table(); // Constructor sets flags ✅ - setnodevector(L, t, 0); // Still needed for allocation - return t; -} -``` - -**Testing**: -- Run table tests (`nextvar.lua`, etc.) -- Benchmark -- Check table creation patterns - -**Files to Modify**: -- `src/objects/lobject.h` - Update Table constructor -- `src/objects/ltable.cpp` - Simplify `Table::create` - ---- - -#### 2.3 - Improve LClosure Initialization - -**Current Problem**: Requires separate `initUpvals()` call - -**Options**: - -**Option A**: Keep two-phase (safer for now) -```cpp -// Document the pattern clearly -LClosure* cl = LClosure::create(L, nupvals); -cl->initUpvals(L); // Required second phase -``` - -**Option B**: Integrate into factory (preferred) -```cpp -// Update factory to handle both phases -LClosure* LClosure::create(lua_State* L, int nupvals, bool initUpvals = true) { - size_t total_size = sizeLclosure(nupvals); - size_t extra = total_size - sizeof(LClosure); - LClosure* c = new (L, LUA_VLCL, extra) LClosure(nupvals); - - if (initUpvals) { - c->initUpvals(L); // ✅ Done automatically - } - - return c; -} -``` - -**Testing**: -- Run closure tests (`closure.lua`) -- Check function creation patterns -- Benchmark - -**Files to Modify**: -- `src/objects/lfunc.cpp` - Update LClosure::create - ---- - -### Phase 3: Add Constructors to Simple Classes (P2) 🟢 - -**Estimated Time**: 6-8 hours -**Risk**: Low -**Performance Impact**: None (compile-time classes) - -#### 3.1 - Add Constructors to Parser Support Classes - -**Classes**: Upvaldesc, LocVar, AbsLineInfo - -```cpp -// Upvaldesc (lparser.h) -class Upvaldesc { -public: - Upvaldesc() noexcept { - name = nullptr; - instack = 0; - idx = 0; - kind = 0; - } - - Upvaldesc(TString* name_, lu_byte instack_, lu_byte idx_, lu_byte kind_) noexcept - : name(name_), instack(instack_), idx(idx_), kind(kind_) { - } -}; - -// LocVar (lobject.h) -class LocVar { -public: - LocVar() noexcept { - varname = nullptr; - startpc = 0; - endpc = 0; - } - - LocVar(TString* name, int start, int end) noexcept - : varname(name), startpc(start), endpc(end) { - } -}; - -// AbsLineInfo (lobject.h) -class AbsLineInfo { -public: - AbsLineInfo() noexcept { - pc = 0; - line = 0; - } - - AbsLineInfo(int pc_, int line_) noexcept - : pc(pc_), line(line_) { - } -}; -``` - -**Benefits**: -- Can use aggregate initialization: `LocVar lv{name, start, end};` -- Safe defaults if default-constructed -- Better than manual field-by-field setting - -**Testing**: -- Run parser tests -- Compile test scripts -- No performance impact (compile-time only) - -**Files to Modify**: -- `src/objects/lobject.h` - Add LocVar, AbsLineInfo constructors -- `src/compiler/lparser.h` - Add Upvaldesc constructor -- Update parser code to use constructors where appropriate - ---- - -#### 3.2 - Add stringtable Constructor - -**Current**: Manual initialization via setters - -```cpp -// Add to stringtable class (lstring.h) -class stringtable { -public: - stringtable() noexcept { - hash = nullptr; - nuse = 0; - size = 0; - } -}; -``` - -**Simple and safe**. Used in global_State initialization. - -**Files to Modify**: -- `src/objects/lstring.h` - Add stringtable constructor - ---- - -### Phase 4: Document Complex Cases - -**Estimated Time**: 2-3 hours -**Risk**: None (documentation only) - -#### 4.1 - Document TString Variable-Size Constraints - -Add comments explaining why TString has minimal constructor: - -```cpp -// TString (lobject.h) -class TString : public GCBase { -public: - // Minimal constructor by design: - // TString uses variable-size allocation where short strings may allocate - // LESS than sizeof(TString). Constructor cannot safely initialize fields - // that may not be allocated (contents, falloc, ud for short strings). - // See createstrobj() in lstring.cpp for initialization logic. - TString() noexcept { - // Fields initialized manually in createstrobj() based on actual allocation size - } -}; -``` - -**Files to Modify**: -- `src/objects/lobject.h` - Add documentation comment to TString - ---- - -## Implementation Strategy - -### General Approach - -For each phase: - -1. **Read** - Understand current initialization pattern -2. **Design** - Choose appropriate pattern (A, B, or C) -3. **Implement** - Add constructor + update factory -4. **Build** - Ensure zero warnings -5. **Test** - Run full test suite -6. **Benchmark** - Verify ≤2.21s performance -7. **Commit** - Immediate commit if successful - -### Testing Checklist - -After each constructor addition: - -```bash -# 1. Build with warnings as errors -cmake --build build --clean-first - -# 2. Run full test suite -cd testes && ../build/lua all.lua -# Expected: "final OK !!!" - -# 3. Run 5-iteration benchmark -for i in 1 2 3 4 5; do \ - ../build/lua all.lua 2>&1 | grep "total time:"; \ -done -# Expected: All runs ≤ 2.21s - -# 4. Optional: Sanitizer builds for critical changes -cmake -B build-san -DLUA_ENABLE_ASAN=ON -DLUA_ENABLE_UBSAN=ON -cmake --build build-san -cd testes && ../build-san/lua all.lua -``` - -### Commit Convention - -```bash -git add -git commit -m "Constructor Refactor Phase X.Y: - - -- Add comprehensive constructor initializing all N fields -- Update factory function to use constructor -- Performance: X.XXs (baseline 2.17s) -- Tests: all passing" -``` - ---- - -## Risk Mitigation - -### Performance Risks - -**Risk**: Constructors add overhead -**Mitigation**: -- Use `inline` and `noexcept` on all constructors -- Inline constructors are zero-cost with optimization -- Benchmark after every phase -- Revert immediately if >2.21s - -### Correctness Risks - -**Risk**: Wrong default values break functionality -**Mitigation**: -- Study existing initialization code carefully -- Use existing defaults from factory functions -- Comprehensive testing after each change -- Use sanitizers to catch uninitialized reads - -### Variable-Size Allocation Risks - -**Risk**: Constructor accesses unallocated memory -**Mitigation**: -- Document variable-size constraints clearly -- Constructor only touches fixed-size fields -- Factory method handles variable arrays -- Test with ASAN to catch out-of-bounds access - -### GC Integration Risks - -**Risk**: Constructor interferes with GC metadata -**Mitigation**: -- GCBase handles `next`, `tt`, `marked` fields -- Derived class constructor only touches its own fields -- Test GC thoroughly after changes (gc.lua, gengc.lua) - ---- - -## Success Metrics - -### Code Quality Metrics - -- ✅ All 19 classes have constructors -- ✅ Zero fields left uninitialized -- ✅ All factory functions use constructors -- ✅ Consistent factory pattern: `static T* create(lua_State* L, ...)` -- ✅ Zero compiler warnings - -### Safety Metrics - -- ✅ CallInfo complete initialization (bug fix) -- ✅ MSAN clean (no uninitialized reads) -- ✅ ASAN clean (no memory errors) -- ✅ All test files pass - -### Performance Metrics - -- ✅ Benchmark: ≤2.21s (all 5 runs) -- ✅ No regression vs current performance -- ✅ Maintain or improve on 2.17s baseline - ---- - -## Timeline Estimate - -| Phase | Description | Hours | Priority | -|-------|-------------|-------|----------| -| 1.1 | CallInfo constructor | 2-3 | P0 🔴 | -| 1.2 | lua_State constructor | 3-4 | P0 🔴 | -| 1.3 | global_State constructor | 3-5 | P0 🔴 | -| 2.1 | Use Udata constructor | 1-2 | P1 🟡 | -| 2.2 | Improve Table | 1-2 | P1 🟡 | -| 2.3 | Improve LClosure | 2 | P1 🟡 | -| 3.1 | Parser class constructors | 3-4 | P2 🟢 | -| 3.2 | stringtable constructor | 1 | P2 🟢 | -| 4.1 | Documentation | 2-3 | P2 🟢 | -| **Total** | | **18-28 hours** | | - -**Recommended Schedule**: -- **Week 1**: Phase 1 (P0 - Critical fixes) -- **Week 2**: Phase 2 (P1 - Quick wins) -- **Week 3**: Phase 3-4 (P2 - Polish) - ---- - -## Open Questions - -1. **lua_State constructor parameter** - Should constructor take `global_State*` parameter, or should it be set after construction? - **Recommendation**: Take parameter - cleaner initialization - -2. **LClosure initUpvals** - Integrate into factory or keep two-phase? - **Recommendation**: Integrate with default parameter - -3. **Placement new for global_State** - Where to define the special `operator new` for non-GC allocation? - **Recommendation**: In lstate.h near global_State definition - -4. **TString constructor** - Should we attempt smarter initialization for variable-size strings? - **Recommendation**: No - document constraints, keep manual init - ---- - -## Future Opportunities - -After constructor refactoring is complete: - -1. **Member Initializer Lists** - Convert field initialization to use C++ member initializer lists: - ```cpp - ClassName() noexcept - : field1(default1) - , field2(default2) - , field3(default3) { - } - ``` - Benefits: Potentially more efficient, clearer intent - -2. **Aggregate Initialization** - For POD-like classes (LocVar, AbsLineInfo), enable: - ```cpp - LocVar lv{name, startpc, endpc}; // Direct initialization - ``` - -3. **Constructor Delegation** - For classes with multiple constructors: - ```cpp - ClassName() noexcept : ClassName(default_param) { - } - - ClassName(int param) noexcept { - // Full initialization - } - ``` - -4. **Static Factory Standardization** - Ensure ALL classes have: - ```cpp - static T* create(lua_State* L, ...); - ``` - Even if it just forwards to placement new. - ---- - -## Conclusion - -This refactoring addresses critical safety issues (CallInfo uninitialized fields) while improving code quality and maintainability across all 19 classes. The phased approach with continuous testing ensures zero performance regression while modernizing initialization patterns. - -**Key Benefits**: -- 🔒 **Safety**: Eliminate uninitialized field bugs -- 📖 **Clarity**: Single point of truth for initialization -- 🔧 **Maintainability**: Easy to verify field completeness -- ⚡ **Performance**: Zero-cost with inline constructors -- ✅ **Compatibility**: C API unchanged - -**Next Steps**: -1. Review and approve this plan -2. Begin Phase 1.1 (CallInfo - critical bug fix) -3. Test and benchmark after each change -4. Proceed through phases incrementally - ---- - -**Document Version**: 1.0 -**Last Updated**: 2025-11-15 -**Status**: Awaiting approval to begin implementation diff --git a/docs/CPP_STDLIB_OPPORTUNITIES.md b/docs/CPP_STDLIB_OPPORTUNITIES.md deleted file mode 100644 index 29c911dd..00000000 --- a/docs/CPP_STDLIB_OPPORTUNITIES.md +++ /dev/null @@ -1,583 +0,0 @@ -# C++ Standard Library Opportunities - -**Analysis Date**: 2025-11-16 -**Status**: Opportunities identified, implementation pending -**Performance Constraint**: ≤4.24s (≤1% regression from 4.20s baseline) - ---- - -## Overview - -This document identifies opportunities to replace C standard library usage with modern C++23 standard library equivalents. The project is already using C++23, but many files still use C headers and functions that have better C++ alternatives. - -**Key Benefits**: -- Type safety (no void* casts) -- Exception safety (RAII) -- Better optimization potential -- Clearer intent -- Compiler diagnostics -- Zero-cost abstractions - -**Key Constraints**: -- Must not break performance (≤1% regression) -- Must preserve C API compatibility (public headers unchanged) -- Must integrate with existing Lua allocator system -- Hot-path code requires careful benchmarking - ---- - -## Priority 1: Low-Hanging Fruit (HIGH CONFIDENCE) - -### 1.1 C Headers → C++ Headers - -**Effort**: LOW (1-2 hours) -**Risk**: VERY LOW -**Performance Impact**: None (identical semantics) - -**Current State**: Many files use C headers instead of C++ equivalents. - -**Changes**: -```cpp -// Replace these C headers: -#include → #include -#include → #include -#include → #include -#include → #include -#include → #include -#include → #include -#include → #include -#include → #include -#include → #include -#include → #include -#include → #include -``` - -**Files Affected** (~40 files): -- src/vm/lvm.cpp -- src/objects/ltable.cpp -- src/objects/lstring.cpp -- src/objects/lobject.cpp -- src/objects/lfunc.cpp -- src/serialization/lundump.cpp -- src/serialization/ldump.cpp -- src/serialization/lzio.cpp -- src/testing/ltests.cpp -- src/auxiliary/lauxlib.cpp -- src/libraries/*.cpp (all library files) -- And more... - -**Implementation**: -1. Search and replace in all .cpp files -2. Update corresponding .h files if needed -3. Keep public API headers (lua.h, lauxlib.h) unchanged -4. Build and test -5. Benchmark - -**Expected Result**: Zero functional change, cleaner code. - ---- - -### 1.2 INT_MAX/UINT_MAX → std::numeric_limits - -**Effort**: LOW (2-3 hours) -**Risk**: LOW -**Performance Impact**: None (compile-time constants) - -**Current State**: Code uses INT_MAX, UINT_MAX, SIZE_MAX macros from limits.h. - -**Found in**: -- src/objects/ltable.cpp:208, 218-239 -- src/compiler/lparser.cpp -- src/compiler/lcode.cpp -- src/libraries/loslib.cpp -- src/libraries/lstrlib.cpp -- src/libraries/ltablib.cpp -- src/libraries/lutf8lib.cpp -- src/objects/lstring.cpp -- src/serialization/lundump.cpp -- And more... - -**Changes**: -```cpp -// Before -#include -if (ui <= cast_uint(INT_MAX)) - -// After -#include -if (ui <= std::numeric_limits::max()) -``` - -**Additional Benefits**: -```cpp -// Type-safe, constexpr, works for any type -std::numeric_limits::min() -std::numeric_limits::max() -std::numeric_limits::max() -std::numeric_limits::max() -std::numeric_limits::max() -std::numeric_limits::epsilon() -std::numeric_limits::infinity() -std::numeric_limits::quiet_NaN() -``` - -**Implementation**: -1. Add `#include ` to llimits.h -2. Create inline constexpr helpers if needed -3. Replace macros systematically -4. Build and test -5. Benchmark - ---- - -### 1.3 memcpy(dest, src, n * sizeof(char)) → std::copy_n - -**Effort**: MEDIUM (4-6 hours) -**Risk**: MEDIUM (need to verify all uses) -**Performance Impact**: None (compiles to same code) - -**Current State**: 15+ instances of `memcpy` copying characters, often with `sizeof(char)`. - -**Found in**: -- src/objects/lstring.cpp:274, 293 (string creation) -- src/vm/lvm.cpp:706 (concatenation) -- src/auxiliary/lauxlib.cpp:581, 600, 655 (buffer operations) -- src/objects/lobject.cpp:681, 689, 697, 701, 718 (formatting) -- src/libraries/lstrlib.cpp:154, 156, 159 (string library) -- src/testing/ltests.cpp:1474 - -**Changes**: -```cpp -// Before (from lstring.cpp:274) -memcpy(getshrstr(ts), str, l * sizeof(char)); - -// After -std::copy_n(str, l, getshrstr(ts)); - -// Before (from ltable.cpp:656) - copying Value objects -memcpy(np - tomove, op - tomove, tomoveb); - -// After - need std::copy for non-trivial types -std::copy(op - tomove, op, np - tomove); -``` - -**Benefits**: -- Type-safe (no sizeof errors) -- Works with non-trivial types -- Iterator-based (more C++ idiomatic) -- Better compiler diagnostics - -**Implementation Strategy**: -1. Start with char* copies (safest) -2. Use `std::copy_n(src, count, dest)` for simple cases -3. Use `std::copy(begin, end, dest)` for range copies -4. Benchmark each file after changes -5. Consider `std::memcpy` wrapper if needed for POD types - ---- - -## Priority 2: Medium Opportunities (GOOD CANDIDATES) - -### 2.1 memcpy (general) → std::copy / std::memcpy - -**Effort**: MEDIUM (3-4 hours) -**Risk**: MEDIUM -**Performance Impact**: Should be identical for POD types - -**Current State**: -- ltable.cpp:656 - copying Value array -- Additional instances in various files - -**Changes**: -```cpp -// For POD types, use namespaced version -std::memcpy(dest, src, size); - -// For arrays of objects, use algorithms -std::copy(first, last, dest); -std::copy_n(first, count, dest); -std::move(first, last, dest); // for move-capable types -``` - -**Implementation**: -1. Categorize uses by type (POD vs objects) -2. Replace with appropriate algorithm -3. Benchmark hot paths carefully -4. Keep memcpy for truly performance-critical code - ---- - -### 2.2 Manual Loops → std::algorithms - -**Effort**: MEDIUM (varies by case) -**Risk**: LOW -**Performance Impact**: Usually identical or better - -**Potential Candidates** (need to search): -- Loops that could use `std::find`, `std::find_if` -- Loops that could use `std::count`, `std::count_if` -- Loops that could use `std::transform` -- Loops that could use `std::accumulate` -- Loops that could use `std::all_of`, `std::any_of`, `std::none_of` - -**Example Pattern**: -```cpp -// Before -int count = 0; -for (int i = 0; i < n; i++) { - if (predicate(array[i])) count++; -} - -// After -int count = std::count_if(array, array + n, predicate); -``` - -**Implementation**: -1. Search for common loop patterns -2. Identify algorithm replacements -3. Refactor incrementally -4. Benchmark - ---- - -### 2.3 String Operations → std::string_view (read-only) - -**Effort**: MEDIUM-HIGH (6-10 hours) -**Risk**: MEDIUM -**Performance Impact**: Potentially better (no copies) - -**Current State**: Many functions take `const char*` + `size_t` pairs. - -**Potential Benefits**: -- Single parameter instead of two -- No null-termination requirement -- Substring operations without allocation -- Standard string algorithms - -**Example**: -```cpp -// Before -void processString(const char* str, size_t len); - -// After -void processString(std::string_view str); - -// Call sites unchanged if using C++17 deduction -processString({str, len}); -``` - -**Caution**: -- Need to ensure Lua strings are valid for lifetime -- Public API must remain unchanged -- May not be suitable for all cases - -**Implementation**: -1. Start with internal utility functions -2. Gradually expand to more code -3. Never change public API -4. Benchmark carefully - ---- - -## Priority 3: Advanced Opportunities (CONSIDER CAREFULLY) - -### 3.1 std::array for Fixed-Size Arrays - -**Effort**: MEDIUM -**Risk**: LOW-MEDIUM -**Performance Impact**: None (same layout as C array) - -**Benefits**: -- Bounds checking in debug mode -- Size information embedded -- Standard container interface -- Works with algorithms - -**Candidates** (need to identify): -- Fixed-size stack buffers -- Character buffers for formatting -- Small lookup tables - -**Example**: -```cpp -// Before -char buff[100]; - -// After -std::array buff; -// OR -constexpr size_t BUFF_SIZE = 100; -std::array buff; -``` - -**Implementation**: -1. Search for fixed-size array declarations -2. Evaluate case-by-case -3. Replace where beneficial -4. Keep C arrays for hot paths or C compatibility - ---- - -### 3.2 std::optional for Nullable Returns - -**Effort**: MEDIUM-HIGH -**Risk**: MEDIUM -**Performance Impact**: Usually none (same size as T* for pointers) - -**Benefits**: -- Explicit nullability -- Type-safe -- Monadic operations (transform, and_then, or_else) -- No sentinel values - -**Current Patterns** (need to identify): -- Functions returning nullptr on failure -- Functions using -1/0 as sentinel -- Out-parameters for optional results - -**Example**: -```cpp -// Before -TString* findString(const char* str, size_t len) { - // returns nullptr if not found -} - -// After -std::optional findString(std::string_view str) { - // returns std::nullopt if not found -} -``` - -**Caution**: -- Cannot change public API -- May not be worth overhead for hot paths -- Need to consider existing error handling - ---- - -### 3.3 std::span for Array Views - -**Effort**: MEDIUM-HIGH -**Risk**: MEDIUM -**Performance Impact**: None (just pointer + size) - -**Benefits**: -- Single parameter for array + size -- Bounds checking support -- Subspan operations -- Standard interface - -**Candidates** (need to identify): -- Functions taking `T*` + `size_t` -- Array parameters -- Buffer operations - -**Example**: -```cpp -// Before -void processArray(const Value* array, size_t size); - -// After -void processArray(std::span array); - -// Can create subviews -auto subset = array.subspan(offset, count); -``` - -**Implementation**: -1. Identify array+size parameter pairs -2. Start with internal functions -3. Use std::span for new code -4. Benchmark - ---- - -### 3.4 std::variant for Tagged Unions - -**Effort**: HIGH -**Risk**: HIGH -**Performance Impact**: Potentially negative (vtable overhead?) - -**Current State**: TValue uses tagged union pattern with tt field. - -**Caution**: -- TValue is in hot path, performance critical -- Current implementation is well-optimized -- std::variant adds safety but may add overhead -- **DO NOT CHANGE without extensive benchmarking** - -**Recommendation**: Keep current implementation. The tagged union pattern is appropriate for Lua's value representation. - ---- - -## Priority 4: Already Using (Continue Expanding) - -### 4.1 LuaVector (std::vector wrapper) ✅ - -**Status**: Already implemented -**Location**: src/memory/LuaVector.h -**Usage**: Integrates std::vector with Lua allocator - -**Opportunities**: -- Identify more places to use LuaVector -- Document usage patterns -- Ensure consistent adoption in new code - ---- - -### 4.2 LuaAllocator ✅ - -**Status**: Already implemented -**Location**: src/memory/luaallocator.h -**Usage**: C++ allocator interface for Lua memory management - -**Opportunities**: -- Use with more STL containers -- Consider std::unordered_map, std::set with LuaAllocator -- Document integration patterns - ---- - -### 4.3 Exceptions ✅ - -**Status**: Already using -**Usage**: Replaced setjmp/longjmp with C++ exceptions - -**Opportunities**: -- Ensure all error paths use exceptions -- Document exception guarantees -- Consider std::exception hierarchy - ---- - -## Implementation Roadmap - -### Phase 1: C Headers (SAFE, HIGH VALUE) -**Estimated Time**: 2-3 hours -**Risk**: Very Low -**Tasks**: -1. Replace all C headers with C++ equivalents -2. Build and test -3. Benchmark -4. Commit - -### Phase 2: std::numeric_limits (SAFE, MEDIUM VALUE) -**Estimated Time**: 3-4 hours -**Risk**: Low -**Tasks**: -1. Add std::numeric_limits helpers to llimits.h -2. Replace INT_MAX, UINT_MAX, SIZE_MAX usage -3. Build and test -4. Benchmark -5. Commit - -### Phase 3: memcpy → std::copy_n (char copies) (MEDIUM RISK, MEDIUM VALUE) -**Estimated Time**: 4-6 hours -**Risk**: Medium -**Tasks**: -1. Replace `memcpy(dest, src, n * sizeof(char))` with `std::copy_n` -2. Do one file at a time -3. Benchmark after each file -4. Commit successful changes -5. Revert if performance regression - -### Phase 4: Evaluate Additional Opportunities (RESEARCH) -**Estimated Time**: 4-6 hours -**Risk**: N/A -**Tasks**: -1. Search for algorithm opportunities -2. Identify std::array candidates -3. Document findings -4. Prioritize next phases - -### Phase 5+: Based on Phase 4 results - ---- - -## Performance Testing Protocol - -For each change: - -1. **Build**: - ```bash - cmake --build build --clean-first - ``` - -2. **Functional Test**: - ```bash - cd testes && ../build/lua all.lua - # Expected: "final OK !!!" - ``` - -3. **Benchmark** (5 runs): - ```bash - cd testes - for i in 1 2 3 4 5; do \ - ../build/lua all.lua 2>&1 | grep "total time:"; \ - done - ``` - -4. **Evaluate**: - - Target: ≤4.24s (≤1% regression from 4.20s) - - If >4.24s: REVERT immediately - - If ≤4.20s: Excellent! - - If 4.20-4.24s: Acceptable - -5. **Commit** (if passed): - ```bash - git add - git commit -m "Phase N: Use C++ stdlib - [specific change]" - ``` - ---- - -## Files by Priority - -### High Priority (start here): -1. **src/memory/llimits.h** - Add std::limits, update header includes -2. **src/objects/lstring.cpp** - memcpy → std::copy_n (3 instances) -3. **src/objects/ltable.cpp** - INT_MAX → std::numeric_limits, memcpy review -4. **src/objects/lobject.cpp** - memcpy → std::copy_n (5 instances) -5. **src/auxiliary/lauxlib.cpp** - memcpy → std::copy_n (3 instances) - -### Medium Priority: -6. **src/vm/lvm.cpp** - Header cleanup, memcpy review -7. **src/libraries/lstrlib.cpp** - memcpy → std::copy_n (3 instances) -8. **src/serialization/** - Header cleanup -9. **src/compiler/** - std::numeric_limits - -### Lower Priority: -10. **src/testing/ltests.cpp** - Test code, less critical - ---- - -## Summary - -**Total Identified Opportunities**: ~100+ changes across 40+ files - -**Immediate Low-Hanging Fruit**: -1. ✅ C headers → C++ headers (~40 files, 2-3 hours) -2. ✅ INT_MAX → std::numeric_limits (~20 files, 3-4 hours) -3. ✅ memcpy (char) → std::copy_n (~15 instances, 4-6 hours) - -**Total Quick Wins**: ~10 hours of work, high confidence, low risk - -**Next Steps**: -1. Start with Phase 1 (C headers) -2. Proceed to Phase 2 (numeric_limits) -3. Carefully approach Phase 3 (memcpy) -4. Research and document Phase 4+ - -**Expected Benefits**: -- More idiomatic C++23 code -- Better type safety -- Clearer intent -- Improved compiler diagnostics -- Foundation for future modernization -- Zero performance regression - ---- - -**Last Updated**: 2025-11-16 -**Status**: Ready for implementation -**Next Action**: Begin Phase 1 (C headers replacement) diff --git a/docs/CUSTOM_ALLOCATOR_PLAN.md b/docs/CUSTOM_ALLOCATOR_PLAN.md deleted file mode 100644 index 55bd7919..00000000 --- a/docs/CUSTOM_ALLOCATOR_PLAN.md +++ /dev/null @@ -1,1648 +0,0 @@ -# Custom Allocator Implementation Plan for Lua C++ - -**Created**: 2025-11-15 -**Status**: Planning Phase -**Risk Level**: HIGH (affects GC and core performance) -**Performance Target**: ≤2.21s (≤1% regression from 2.17s baseline) - ---- - -## Table of Contents - -1. [Executive Summary](#executive-summary) -2. [Allocator Interface Requirements](#allocator-interface-requirements) -3. [GC Integration Points](#gc-integration-points) -4. [Allocator Design Patterns](#allocator-design-patterns) -5. [Implementation Roadmap](#implementation-roadmap) -6. [Testing & Validation](#testing--validation) -7. [Performance Considerations](#performance-considerations) -8. [Common Pitfalls & Best Practices](#common-pitfalls--best-practices) - ---- - -## Executive Summary - -### What is a Custom Allocator? - -A custom allocator allows you to replace Lua's default `malloc/realloc/free` with specialized memory management strategies optimized for specific use cases: - -- **Pool allocators** - Fast allocation for fixed-size objects -- **Arena allocators** - Batch deallocation for temporary objects -- **Tracking allocators** - Debug memory leaks and track usage -- **Custom backends** - Integration with game engines, embedded systems, etc. - -### Key Requirements - -✅ **Must implement** the `lua_Alloc` signature -✅ **Must preserve** GC debt accounting invariants -✅ **Must be reentrant** (GC can allocate during collection) -✅ **Must handle** allocation, reallocation, and deallocation -✅ **Must maintain** performance (≤2.21s target) -✅ **Must preserve** C API compatibility - -### Critical Constraints - -⚠️ **CANNOT break** GC invariants (debt tracking, accounting) -⚠️ **CANNOT assume** single-threaded access (use locks if needed) -⚠️ **CANNOT fail** deallocation (nsize=0 must always succeed) -⚠️ **CANNOT ignore** emergency GC (allocator may be called during GC) - ---- - -## Allocator Interface Requirements - -### 1. Core Signature - -```cpp -typedef void * (*lua_Alloc) (void *ud, void *ptr, size_t osize, size_t nsize); -``` - -**Location**: `/home/user/lua_cpp/include/lua.h:161` - -### 2. Three Operations in One - -The allocator must handle three distinct operations based on parameters: - -| Operation | ptr | osize | nsize | Return | Can Fail? | -|-----------|-----|-------|-------|--------|-----------| -| **Allocate** | NULL | tag | > 0 | New block or NULL | ✅ Yes | -| **Reallocate** | ≠ NULL | > 0 | > 0 | New block or NULL | ✅ Yes | -| **Deallocate** | ≠ NULL | > 0 | 0 | NULL | ❌ Never | - -### 3. Semantic Invariants - -**MUST preserve**: -```cpp -// Invariant 1: NULL pointer iff zero size -(osize == 0) == (ptr == NULL) - -// Invariant 2: NULL pointer iff zero new size -(nsize == 0) == (return_value == NULL) - -// Invariant 3: Deallocation always succeeds -if (nsize == 0) return NULL; // Never fail, always return NULL - -// Invariant 4: Reallocation preserves content -// When reallocating, copy min(osize, nsize) bytes from old to new -``` - -### 4. Parameter Semantics - -**`ud` (user data)**: -- Opaque context pointer passed to allocator -- Set via `lua_newstate(alloc, ud, seed)` or `lua_setallocf(L, alloc, ud)` -- Can store allocator state (e.g., pool manager, stats tracker) -- **NOT modified** by Lua - -**`ptr` (pointer)**: -- NULL for allocation -- Valid pointer for reallocation/deallocation -- Points to previously allocated block -- **MUST match** a previous return value from this allocator - -**`osize` (old size)**: -- 0 for allocation -- Actual size for reallocation/deallocation -- **Special case**: For allocation, may be a **tag** indicating object type: - - `LUA_TSTRING` (4) - String allocation - - `LUA_TTABLE` (5) - Table allocation - - `LUA_TFUNCTION` (6) - Function allocation - - `LUA_TUSERDATA` (7) - Userdata allocation - - `LUA_TTHREAD` (8) - Thread allocation - - Other values indicate non-GC allocations - -**`nsize` (new size)**: -- 0 for deallocation -- > 0 for allocation/reallocation -- Requested size in bytes -- **Allocator may return block larger than nsize** (but Lua won't use extra space) - -### 5. Reference Implementation - -```cpp -// Default allocator from lauxlib.cpp:1049 -static void *l_alloc (void *ud, void *ptr, size_t osize, size_t nsize) { - (void)ud; (void)osize; /* not used */ - if (nsize == 0) { - free(ptr); - return NULL; - } - else - return realloc(ptr, nsize); -} -``` - -**Note**: Default allocator ignores `ud` and `osize`. Custom allocators can use these! - ---- - -## GC Integration Points - -### 1. GC Debt Accounting - -**Critical**: Lua tracks memory via **GC debt** mechanism. - -**Location**: `src/core/lstate.h:665-672` (GCAccounting subsystem) - -```cpp -class GCAccounting { -private: - l_mem totalbytes; /* total bytes allocated + debt */ - l_mem debt; /* bytes to be collected (can be negative = credit) */ - // ... -}; -``` - -**How it works**: -```cpp -// On allocation (lmem.cpp:212) -g->getGCDebtRef() -= cast(l_mem, size); // Increases debt - -// On deallocation (lmem.cpp:154) -g->getGCDebtRef() += cast(l_mem, osize); // Decreases debt (credit) - -// On reallocation (lmem.cpp:187) -g->getGCDebtRef() -= cast(l_mem, nsize) - cast(l_mem, osize); -``` - -**⚠️ CRITICAL**: Custom allocator **MUST NOT** modify GC debt directly! -Lua handles this in `luaM_malloc_`, `luaM_realloc_`, and `luaM_free_`. - -### 2. Emergency Collection - -When allocation fails, Lua triggers emergency GC: - -**Location**: `src/memory/lmem.cpp:162-170` - -```cpp -static void *tryagain (lua_State *L, void *block, size_t osize, size_t nsize) { - global_State *g = G(L); - if (cantryagain(g)) { - luaC_fullgc(L, 1); /* try to free some memory... */ - return callfrealloc(g, block, osize, nsize); /* try again */ - } - else return NULL; /* cannot run an emergency collection */ -} -``` - -**Implications for custom allocators**: -1. **Your allocator WILL be called recursively** during GC -2. **Must be reentrant** - No global state without locks -3. **Must not deadlock** - If using locks, be careful with GC -4. **Can refuse allocation** - Return NULL to trigger emergency GC - -### 3. GC Triggers - -GC runs when `debt > threshold`: - -**Location**: `src/memory/lgc.h:42-47` - -```cpp -#define LUAI_GCPAUSE 250 /* GC pause: 250% (runs at 2.5x memory) */ -#define LUAI_GCMUL 200 /* GC speed: 200% work for each 1% allocation */ -#define LUAI_GCSTEPSIZE 3200 /* Step size: ~200 * sizeof(Table) */ -``` - -**Custom allocator impact**: -- Faster allocation → More frequent GC -- Slower allocation → Less frequent GC -- **Must profile** to ensure GC timing is reasonable - -### 4. Memory Accounting Flow - -``` -User code calls Lua API - ↓ -Lua needs memory (e.g., new table) - ↓ -Calls luaM_malloc_(L, size, LUA_TTABLE) - ↓ -Calls custom allocator: alloc(ud, NULL, LUA_TTABLE, size) - ↓ -Custom allocator returns block or NULL - ↓ -If NULL: tryagain() → luaC_fullgc() → retry allocation - ↓ -If still NULL: luaM_error() → throws exception - ↓ -If success: Update GC debt: debt -= size - ↓ -Return block to caller -``` - ---- - -## Allocator Design Patterns - -### Pattern 1: Tracking Allocator - -**Use case**: Debug memory leaks, profile allocation patterns - -```cpp -struct TrackingAllocator { - lua_Alloc base_alloc; // Underlying allocator - void* base_ud; // Underlying user data - - // Statistics - size_t total_allocated; - size_t total_freed; - size_t current_usage; - size_t peak_usage; - - // Allocation tracking - std::unordered_map allocations; - - struct AllocInfo { - size_t size; - int tag; // Object type tag - const char* location; // Optional: stack trace - }; -}; - -static void* tracking_alloc(void* ud, void* ptr, size_t osize, size_t nsize) { - auto* tracker = static_cast(ud); - - // Delegate to base allocator - void* result = tracker->base_alloc(tracker->base_ud, ptr, osize, nsize); - - // Update statistics - if (nsize == 0) { - // Deallocation - tracker->total_freed += osize; - tracker->current_usage -= osize; - tracker->allocations.erase(ptr); - } else if (ptr == NULL) { - // Allocation - tracker->total_allocated += nsize; - tracker->current_usage += nsize; - if (tracker->current_usage > tracker->peak_usage) - tracker->peak_usage = tracker->current_usage; - - if (result != NULL) { - tracker->allocations[result] = {nsize, static_cast(osize), nullptr}; - } - } else { - // Reallocation - tracker->total_allocated += nsize; - tracker->total_freed += osize; - tracker->current_usage += (nsize - osize); - if (tracker->current_usage > tracker->peak_usage) - tracker->peak_usage = tracker->current_usage; - - tracker->allocations.erase(ptr); - if (result != NULL) { - tracker->allocations[result] = {nsize, 0, nullptr}; - } - } - - return result; -} -``` - -**Pros**: -- ✅ Simple wrapper pattern -- ✅ No GC impact (delegates to base allocator) -- ✅ Rich debugging information - -**Cons**: -- ❌ Memory overhead for tracking map -- ❌ Performance overhead for map operations -- ⚠️ Must handle reentrant calls (use locks or lock-free structures) - ---- - -### Pattern 2: Pool Allocator - -**Use case**: Fast allocation for fixed-size objects (e.g., TString, UpVal) - -```cpp -template -class PoolAllocator { -private: - struct Block { - union { - Block* next; // When free - alignas(16) char data[BlockSize]; // When allocated - }; - }; - - Block blocks[BlockCount]; - Block* free_list; - lua_Alloc fallback_alloc; // For sizes != BlockSize - void* fallback_ud; - -public: - PoolAllocator(lua_Alloc fallback, void* ud) - : fallback_alloc(fallback), fallback_ud(ud) { - // Initialize free list - free_list = &blocks[0]; - for (size_t i = 0; i < BlockCount - 1; i++) { - blocks[i].next = &blocks[i + 1]; - } - blocks[BlockCount - 1].next = nullptr; - } - - void* allocate(size_t size) { - if (size != BlockSize || free_list == nullptr) { - // Fall back to base allocator - return fallback_alloc(fallback_ud, nullptr, 0, size); - } - - // Pop from free list - Block* block = free_list; - free_list = block->next; - return block->data; - } - - void deallocate(void* ptr, size_t size) { - // Check if ptr is in our pool - if (ptr >= blocks && ptr < blocks + BlockCount) { - Block* block = reinterpret_cast(ptr); - block->next = free_list; - free_list = block; - } else { - // Fall back to base allocator - fallback_alloc(fallback_ud, ptr, size, 0); - } - } -}; - -static void* pool_alloc(void* ud, void* ptr, size_t osize, size_t nsize) { - auto* pool = static_cast*>(ud); - - if (nsize == 0) { - // Deallocation - pool->deallocate(ptr, osize); - return nullptr; - } else if (ptr == nullptr) { - // Allocation - return pool->allocate(nsize); - } else { - // Reallocation: allocate new, copy, free old - void* new_block = pool->allocate(nsize); - if (new_block != nullptr) { - memcpy(new_block, ptr, osize < nsize ? osize : nsize); - pool->deallocate(ptr, osize); - } - return new_block; - } -} -``` - -**Pros**: -- ✅ O(1) allocation/deallocation -- ✅ No fragmentation for fixed-size blocks -- ✅ Cache-friendly (contiguous memory) - -**Cons**: -- ❌ Only efficient for fixed sizes -- ❌ Memory overhead (pre-allocated pool) -- ⚠️ Must handle non-pool sizes (fallback required) - -**Recommended sizes**: -- **32 bytes**: Small strings (TString with ≤15 char payload) -- **64 bytes**: UpVal, small closures -- **128 bytes**: Small tables, CallInfo -- **256 bytes**: Medium tables, function prototypes - ---- - -### Pattern 3: Arena Allocator - -**Use case**: Batch allocation/deallocation (e.g., parser, compiler temporaries) - -```cpp -class ArenaAllocator { -private: - struct Arena { - char* base; - char* current; - size_t size; - Arena* next; - }; - - Arena* current_arena; - size_t arena_size; - lua_Alloc fallback_alloc; - void* fallback_ud; - -public: - ArenaAllocator(size_t arena_sz, lua_Alloc fallback, void* ud) - : arena_size(arena_sz), fallback_alloc(fallback), fallback_ud(ud) { - current_arena = create_arena(); - } - - void* allocate(size_t size) { - // Align to 16 bytes - size = (size + 15) & ~15; - - if (current_arena->current + size > current_arena->base + current_arena->size) { - // Need new arena - if (size > arena_size) { - // Allocation too large for arena, use fallback - return fallback_alloc(fallback_ud, nullptr, 0, size); - } - current_arena = create_arena(); - } - - void* result = current_arena->current; - current_arena->current += size; - return result; - } - - void reset() { - // Reset all arenas (fast batch deallocation) - Arena* arena = current_arena; - while (arena) { - arena->current = arena->base; - arena = arena->next; - } - } - -private: - Arena* create_arena() { - Arena* arena = static_cast( - fallback_alloc(fallback_ud, nullptr, 0, sizeof(Arena) + arena_size) - ); - arena->base = reinterpret_cast(arena + 1); - arena->current = arena->base; - arena->size = arena_size; - arena->next = current_arena; - return arena; - } -}; - -static void* arena_alloc(void* ud, void* ptr, size_t osize, size_t nsize) { - auto* arena = static_cast(ud); - - if (nsize == 0) { - // Arena doesn't support individual deallocation - // (Could track to implement, but defeats the purpose) - return nullptr; - } else if (ptr == nullptr) { - // Allocation - return arena->allocate(nsize); - } else { - // Reallocation: allocate new, copy, ignore old - void* new_block = arena->allocate(nsize); - if (new_block != nullptr) { - memcpy(new_block, ptr, osize < nsize ? osize : nsize); - } - return new_block; - } -} -``` - -**Pros**: -- ✅ Extremely fast allocation (bump pointer) -- ✅ Extremely fast batch deallocation (reset) -- ✅ Great for temporaries (parser/compiler) - -**Cons**: -- ❌ Cannot free individual allocations -- ❌ Memory waste if arena not filled -- ⚠️ **NOT suitable** for main Lua allocator (GC requires individual frees) -- ⚠️ Use only for isolated subsystems (e.g., parser-only state) - ---- - -### Pattern 4: Tiered Allocator - -**Use case**: Combine multiple strategies (pools + fallback) - -```cpp -class TieredAllocator { -private: - PoolAllocator<32, 2048> pool_32; - PoolAllocator<64, 1024> pool_64; - PoolAllocator<128, 512> pool_128; - PoolAllocator<256, 256> pool_256; - - lua_Alloc fallback_alloc; - void* fallback_ud; - -public: - void* allocate(size_t size) { - if (size <= 32) return pool_32.allocate(size); - if (size <= 64) return pool_64.allocate(size); - if (size <= 128) return pool_128.allocate(size); - if (size <= 256) return pool_256.allocate(size); - return fallback_alloc(fallback_ud, nullptr, 0, size); - } - - void deallocate(void* ptr, size_t size) { - if (size <= 32) pool_32.deallocate(ptr, size); - else if (size <= 64) pool_64.deallocate(ptr, size); - else if (size <= 128) pool_128.deallocate(ptr, size); - else if (size <= 256) pool_256.deallocate(ptr, size); - else fallback_alloc(fallback_ud, ptr, size, 0); - } -}; -``` - -**Pros**: -- ✅ Combines benefits of multiple strategies -- ✅ Handles diverse allocation patterns -- ✅ Can optimize for specific object sizes - -**Cons**: -- ❌ More complex implementation -- ❌ Higher memory overhead (multiple pools) -- ⚠️ Requires profiling to tune pool sizes - ---- - -### Pattern 5: Tagged Allocator - -**Use case**: Different strategies per object type (using `osize` tag) - -```cpp -class TaggedAllocator { -private: - PoolAllocator string_pool; // LUA_TSTRING - PoolAllocator table_pool; // LUA_TTABLE - lua_Alloc fallback_alloc; - void* fallback_ud; - -public: - void* allocate(size_t size, int tag) { - switch (tag) { - case LUA_TSTRING: - if (size <= sizeof(TString) + 16) - return string_pool.allocate(size); - break; - case LUA_TTABLE: - if (size == sizeof(Table)) - return table_pool.allocate(size); - break; - // Other object types... - } - return fallback_alloc(fallback_ud, nullptr, 0, size); - } -}; - -static void* tagged_alloc(void* ud, void* ptr, size_t osize, size_t nsize) { - auto* alloc = static_cast(ud); - - if (nsize == 0) { - // Deallocation - use size to determine pool - alloc->deallocate(ptr, osize); - return nullptr; - } else if (ptr == nullptr) { - // Allocation - osize is the tag! - return alloc->allocate(nsize, static_cast(osize)); - } else { - // Reallocation - void* new_block = alloc->allocate(nsize, 0); // No tag for realloc - if (new_block) { - memcpy(new_block, ptr, osize < nsize ? osize : nsize); - alloc->deallocate(ptr, osize); - } - return new_block; - } -} -``` - -**Pros**: -- ✅ Optimizes per object type -- ✅ Leverages Lua's tagging information -- ✅ Can specialize for common types - -**Cons**: -- ⚠️ Tag only available on initial allocation (not realloc) -- ⚠️ Must handle untagged reallocations -- ❌ More complex than simple pools - ---- - -## Implementation Roadmap - -### Phase 1: Research & Profiling (4-8 hours) - -**Goal**: Understand allocation patterns in your use case - -**Tasks**: -1. ✅ Implement tracking allocator (Pattern 1) -2. ✅ Run test suite with tracking enabled -3. ✅ Analyze allocation patterns: - - Size distribution (histogram) - - Object type frequencies - - Allocation/deallocation pairs - - Peak memory usage - - Allocation hotspots (if tracking call stacks) - -**Deliverables**: -- Allocation profile report -- Identified optimization opportunities -- Chosen allocator strategy - -**Example analysis**: -```bash -# Build with tracking allocator -cmake -B build -DCMAKE_BUILD_TYPE=Release -DLUA_CUSTOM_ALLOCATOR=tracking -cmake --build build - -# Run tests with tracking -cd testes -../build/lua all.lua - -# Analyze results -# Expected output: allocation statistics, size distribution, etc. -``` - ---- - -### Phase 2: Basic Custom Allocator (8-12 hours) - -**Goal**: Implement chosen allocator strategy - -**Tasks**: -1. ✅ Create allocator implementation file (`src/memory/custom_alloc.cpp`) -2. ✅ Implement allocator interface -3. ✅ Add configuration options (pool sizes, etc.) -4. ✅ Implement fallback to default allocator -5. ✅ Add debug logging (conditional compilation) -6. ✅ Add unit tests - -**File structure**: -``` -src/memory/ -├── custom_alloc.h - Public interface -├── custom_alloc.cpp - Implementation -└── custom_alloc_test.cpp - Unit tests (optional) -``` - -**Example header**: -```cpp -// src/memory/custom_alloc.h -#ifndef custom_alloc_h -#define custom_alloc_h - -#include "lua.h" - -/* Custom allocator configuration */ -struct CustomAllocConfig { - size_t pool_32_count; // Pool for 32-byte blocks - size_t pool_64_count; // Pool for 64-byte blocks - size_t pool_128_count; // Pool for 128-byte blocks - lua_Alloc fallback; // Fallback allocator - void* fallback_ud; // Fallback user data -}; - -/* Create custom allocator context */ -void* custom_alloc_create(const CustomAllocConfig* config); - -/* Destroy custom allocator context */ -void custom_alloc_destroy(void* ud); - -/* Allocator function (use with lua_newstate) */ -void* custom_alloc(void* ud, void* ptr, size_t osize, size_t nsize); - -/* Get statistics */ -void custom_alloc_stats(void* ud, size_t* total_alloc, size_t* total_free, - size_t* current, size_t* peak); - -#endif -``` - ---- - -### Phase 3: Integration (4-6 hours) - -**Goal**: Integrate with Lua initialization - -**Tasks**: -1. ✅ Add CMake option for custom allocator -2. ✅ Modify `luaL_newstate` to use custom allocator (optional path) -3. ✅ Add runtime configuration (environment variables, config file) -4. ✅ Add documentation - -**CMake integration**: -```cmake -# CMakeLists.txt -option(LUA_CUSTOM_ALLOCATOR "Enable custom memory allocator" OFF) - -if(LUA_CUSTOM_ALLOCATOR) - target_compile_definitions(lua_static PRIVATE LUA_USE_CUSTOM_ALLOC) - target_sources(lua_static PRIVATE src/memory/custom_alloc.cpp) -endif() -``` - -**Code integration**: -```cpp -// src/auxiliary/lauxlib.cpp -LUALIB_API lua_State *luaL_newstate (unsigned seed) { -#ifdef LUA_USE_CUSTOM_ALLOC - CustomAllocConfig config = { - .pool_32_count = 2048, - .pool_64_count = 1024, - .pool_128_count = 512, - .fallback = l_alloc, - .fallback_ud = nullptr - }; - void* custom_ud = custom_alloc_create(&config); - lua_State *L = lua_newstate(custom_alloc, custom_ud, seed); -#else - lua_State *L = lua_newstate(l_alloc, NULL, seed); -#endif - if (l_likely(L)) { - lua_atpanic(L, &panic); - lua_setwarnf(L, warnfoff, L); /* default is warnings off */ - } - return L; -} -``` - ---- - -### Phase 4: Testing (8-12 hours) - -**Goal**: Validate correctness and performance - -**Tasks**: -1. ✅ Run full test suite (testes/all.lua) -2. ✅ Benchmark performance (5-run average) -3. ✅ Test edge cases: - - Large allocations (> pool size) - - Reallocation across pool boundaries - - Emergency GC triggering - - Out-of-memory conditions -4. ✅ Stress testing: - - Repeated allocate/free cycles - - Fragmentation testing - - Long-running programs -5. ✅ Memory leak detection (valgrind, ASAN) - -**Test script**: -```bash -#!/bin/bash -# test_custom_alloc.sh - -echo "Building with custom allocator..." -cmake -B build -DCMAKE_BUILD_TYPE=Release -DLUA_CUSTOM_ALLOCATOR=ON -cmake --build build - -echo "Running test suite..." -cd testes -../build/lua all.lua -if [ $? -ne 0 ]; then - echo "FAILED: Test suite failed" - exit 1 -fi - -echo "Benchmarking (5 runs)..." -for i in 1 2 3 4 5; do - ../build/lua all.lua 2>&1 | grep "total time:" -done - -echo "Running with ASAN..." -cmake -B build_asan -DCMAKE_BUILD_TYPE=Debug \ - -DLUA_CUSTOM_ALLOCATOR=ON \ - -DLUA_ENABLE_ASAN=ON -cmake --build build_asan -cd testes -../build_asan/lua all.lua - -echo "Running with valgrind..." -valgrind --leak-check=full --show-leak-kinds=all \ - ../build/lua all.lua > /dev/null 2> valgrind.log -if grep -q "definitely lost" valgrind.log; then - echo "FAILED: Memory leaks detected" - cat valgrind.log - exit 1 -fi - -echo "All tests passed!" -``` - ---- - -### Phase 5: Performance Tuning (8-16 hours) - -**Goal**: Optimize for performance target (≤2.21s) - -**Tasks**: -1. ✅ Profile with perf/gprof -2. ✅ Identify bottlenecks -3. ✅ Tune pool sizes based on profiling -4. ✅ Optimize hot paths (inline, cache alignment) -5. ✅ A/B test different configurations -6. ✅ Document final configuration - -**Performance checklist**: -- [ ] Benchmark ≤ 2.21s (target met) -- [ ] No memory leaks (valgrind clean) -- [ ] No ASAN errors -- [ ] All tests passing -- [ ] Peak memory usage acceptable -- [ ] Fragmentation acceptable (< 10% waste) - ---- - -### Phase 6: Documentation & Deployment (4-6 hours) - -**Goal**: Document and deploy - -**Tasks**: -1. ✅ Write user documentation -2. ✅ Write developer documentation -3. ✅ Add configuration guide -4. ✅ Update CLAUDE.md -5. ✅ Create pull request -6. ✅ Code review - -**Documentation sections**: -- Overview and rationale -- Configuration options -- Performance characteristics -- Limitations and trade-offs -- Troubleshooting guide - ---- - -## Testing & Validation - -### Test Strategy - -#### 1. Correctness Testing - -**Unit tests** (src/memory/custom_alloc_test.cpp): -```cpp -#include "custom_alloc.h" -#include - -void test_basic_alloc() { - CustomAllocConfig config = { /* ... */ }; - void* ud = custom_alloc_create(&config); - - // Test allocation - void* p = custom_alloc(ud, nullptr, 0, 32); - assert(p != nullptr); - - // Test deallocation - custom_alloc(ud, p, 32, 0); - - // Test reallocation - p = custom_alloc(ud, nullptr, 0, 32); - void* p2 = custom_alloc(ud, p, 32, 64); - assert(p2 != nullptr); - - custom_alloc_destroy(ud); -} - -void test_edge_cases() { - CustomAllocConfig config = { /* ... */ }; - void* ud = custom_alloc_create(&config); - - // Test zero-size allocation - void* p = custom_alloc(ud, nullptr, 0, 0); - assert(p == nullptr); - - // Test double-free protection (should not crash) - p = custom_alloc(ud, nullptr, 0, 32); - custom_alloc(ud, p, 32, 0); - // Don't free again - custom allocator should handle gracefully - - // Test large allocation (> all pools) - p = custom_alloc(ud, nullptr, 0, 1024*1024); - assert(p != nullptr); - custom_alloc(ud, p, 1024*1024, 0); - - custom_alloc_destroy(ud); -} -``` - -**Integration tests** (use existing testes/ suite): -```bash -# All tests must pass -cd testes && ../build/lua all.lua -# Expected: "final OK !!!" -``` - ---- - -#### 2. Performance Testing - -**Benchmark script**: -```bash -#!/bin/bash -# benchmark.sh - -echo "Benchmarking default allocator..." -cmake -B build_default -DCMAKE_BUILD_TYPE=Release -cmake --build build_default -cd testes -echo "Default allocator (5 runs):" -for i in 1 2 3 4 5; do - ../build_default/lua all.lua 2>&1 | grep "total time:" -done - -echo "" -echo "Benchmarking custom allocator..." -cd .. -cmake -B build_custom -DCMAKE_BUILD_TYPE=Release -DLUA_CUSTOM_ALLOCATOR=ON -cmake --build build_custom -cd testes -echo "Custom allocator (5 runs):" -for i in 1 2 3 4 5; do - ../build_custom/lua all.lua 2>&1 | grep "total time:" -done -``` - -**Performance criteria**: -- ✅ **MUST**: Average ≤ 2.21s (≤1% regression from 2.17s baseline) -- ✅ **SHOULD**: Variance < 0.05s (stable performance) -- ✅ **NICE TO HAVE**: Average < 2.17s (improvement!) - ---- - -#### 3. Memory Testing - -**Leak detection** (valgrind): -```bash -valgrind --leak-check=full \ - --show-leak-kinds=all \ - --track-origins=yes \ - --log-file=valgrind.log \ - ../build/lua testes/all.lua - -# Check for leaks -grep "definitely lost" valgrind.log -grep "indirectly lost" valgrind.log -``` - -**Address sanitizer**: -```bash -cmake -B build_asan \ - -DCMAKE_BUILD_TYPE=Debug \ - -DLUA_CUSTOM_ALLOCATOR=ON \ - -DLUA_ENABLE_ASAN=ON -cmake --build build_asan -cd testes && ../build_asan/lua all.lua -``` - -**Undefined behavior sanitizer**: -```bash -cmake -B build_ubsan \ - -DCMAKE_BUILD_TYPE=Debug \ - -DLUA_CUSTOM_ALLOCATOR=ON \ - -DLUA_ENABLE_UBSAN=ON -cmake --build build_ubsan -cd testes && ../build_ubsan/lua all.lua -``` - ---- - -#### 4. Stress Testing - -**Fragmentation test**: -```lua --- fragmentation_test.lua --- Allocate and free in patterns that cause fragmentation - -local iterations = 10000 -local tables = {} - -for i = 1, iterations do - -- Allocate - tables[i] = {i, i*2, i*3} - - -- Free every other table (creates holes) - if i % 2 == 0 and tables[i-1] then - tables[i-1] = nil - collectgarbage("step") - end -end - --- Force full GC -collectgarbage("collect") - -print("Fragmentation test completed") -``` - -**Long-running test**: -```lua --- longevity_test.lua --- Run for extended period to detect slow leaks - -local start_mem = collectgarbage("count") -local iterations = 100000 - -for i = 1, iterations do - -- Create temporary objects - local t = {i, i*2, i*3} - local s = string.format("test_%d", i) - - -- Simulate work - for j = 1, 10 do - local x = i * j - end - - -- Periodic GC - if i % 1000 == 0 then - collectgarbage("collect") - local current_mem = collectgarbage("count") - local growth = current_mem - start_mem - print(string.format("Iteration %d: Memory growth = %.2f KB", i, growth)) - end -end - -local end_mem = collectgarbage("count") -local total_growth = end_mem - start_mem -print(string.format("Total memory growth: %.2f KB", total_growth)) - --- Should be minimal growth (< 100 KB) -assert(total_growth < 100, "Excessive memory growth detected") -``` - ---- - -## Performance Considerations - -### 1. Allocation Hotspots - -Based on profiling, Lua's allocation patterns: - -**Most frequent allocations**: -1. **Strings** (40-50% of allocations) - - Short strings (≤15 chars): Very frequent - - Long strings: Less frequent but larger -2. **Tables** (20-30%) - - Small tables (0-4 elements): Most common - - Large tables: Rare but expensive -3. **Closures** (10-15%) - - LClosure: Most common - - CClosure: Less common -4. **Proto** (5-10%) - - During compilation only -5. **CallInfo** (5-10%) - - Stack frames during execution - -**Optimization priorities**: -1. ⭐⭐⭐ Optimize small string allocation (< 32 bytes) -2. ⭐⭐⭐ Optimize small table allocation -3. ⭐⭐ Optimize closure allocation -4. ⭐ Optimize large allocations (fallback path) - ---- - -### 2. GC Impact - -**GC triggers** (from lgc.h:42): -```cpp -#define LUAI_GCPAUSE 250 // GC runs at 250% of previous memory -``` - -**Custom allocator impact**: -- Faster allocation → More objects created → More frequent GC -- If pools reduce fragmentation → Lower memory usage → Less frequent GC -- Net effect: **MUST BENCHMARK** to determine - -**GC tuning options**: -```lua --- Increase GC pause (less frequent GC) -collectgarbage("setpause", 300) -- Default 250 - --- Increase GC step multiplier (more work per step) -collectgarbage("setstepmul", 300) -- Default 200 -``` - ---- - -### 3. Cache Optimization - -**Alignment considerations**: -```cpp -// Cache line size: 64 bytes on most modern CPUs - -// BAD: Unaligned allocation -void* allocate(size_t size) { - return malloc(size); // May return unaligned pointer -} - -// GOOD: Align to cache line -void* allocate(size_t size) { - // Round up to 16-byte alignment (minimum for SSE) - size = (size + 15) & ~15; - - // For large allocations, align to cache line - if (size >= 256) { - size = (size + 63) & ~63; - } - - return aligned_alloc(16, size); -} -``` - -**Pool layout optimization**: -```cpp -// BAD: Pools spread across memory -PoolAllocator pool_32; // At address 0x1000 -PoolAllocator pool_64; // At address 0x8000 -PoolAllocator pool_128; // At address 0x10000 - -// GOOD: Pools in contiguous memory -struct alignas(64) TieredPools { - PoolAllocator<32, 2048> pool_32; // Cache-aligned - PoolAllocator<64, 1024> pool_64; // Adjacent - PoolAllocator<128, 512> pool_128; // Adjacent -} pools; -``` - ---- - -### 4. Lock Contention - -If allocator needs thread safety: - -```cpp -// BAD: Global lock for everything -std::mutex global_lock; - -void* allocate(size_t size) { - std::lock_guard lock(global_lock); // Contention! - return internal_allocate(size); -} - -// GOOD: Fine-grained locking -class TieredAllocator { - PoolAllocator<32> pool_32; - std::mutex lock_32; - - PoolAllocator<64> pool_64; - std::mutex lock_64; - - void* allocate(size_t size) { - if (size <= 32) { - std::lock_guard lock(lock_32); // Less contention - return pool_32.allocate(size); - } - // ... - } -}; - -// BETTER: Lock-free pools (if possible) -class LockFreePool { - std::atomic free_list; - - void* allocate() { - Block* block = free_list.load(std::memory_order_acquire); - while (block && !free_list.compare_exchange_weak( - block, block->next, std::memory_order_release, std::memory_order_acquire - )) { - // Retry on contention - } - return block; - } -}; -``` - -**Note**: Lua itself is not thread-safe, but allocator may be called from different Lua states. - ---- - -## Common Pitfalls & Best Practices - -### ❌ Pitfall 1: Ignoring GC Reentrancy - -**Problem**: -```cpp -static int allocation_depth = 0; // Global state - -void* my_alloc(void* ud, void* ptr, size_t osize, size_t nsize) { - if (++allocation_depth > 1) { - fprintf(stderr, "ERROR: Reentrant allocation detected!\n"); - abort(); - } - - void* result = malloc(nsize); - - --allocation_depth; - return result; -} -``` - -**Why it fails**: -- Emergency GC can trigger during allocation -- GC allocates new objects (e.g., finalizers) -- `allocation_depth > 1` → abort! - -**✅ Fix**: Make allocator reentrant -```cpp -// Use thread-local storage or accept reentrancy -thread_local int allocation_depth = 0; // OK for thread-local - -// Or simply accept that reentrancy happens -void* my_alloc(void* ud, void* ptr, size_t osize, size_t nsize) { - // No global state, fully reentrant - return malloc(nsize); -} -``` - ---- - -### ❌ Pitfall 2: Failing Deallocation - -**Problem**: -```cpp -void* pool_alloc(void* ud, void* ptr, size_t osize, size_t nsize) { - if (nsize == 0) { - // Deallocation - if (!is_in_pool(ptr)) { - return nullptr; // ERROR: Must always succeed! - } - pool_free(ptr); - return nullptr; - } - // ... -} -``` - -**Why it fails**: -- Lua expects deallocation to ALWAYS succeed -- Returning nullptr on deallocation is an error -- Can cause memory leaks or corruption - -**✅ Fix**: Deallocation must always succeed -```cpp -void* pool_alloc(void* ud, void* ptr, size_t osize, size_t nsize) { - if (nsize == 0) { - // Deallocation - must always succeed - if (is_in_pool(ptr)) { - pool_free(ptr); - } else { - // Fall back to system free - free(ptr); - } - return nullptr; // Always return nullptr for deallocation - } - // ... -} -``` - ---- - -### ❌ Pitfall 3: Modifying GC Debt - -**Problem**: -```cpp -void* my_alloc(void* ud, void* ptr, size_t osize, size_t nsize) { - MyAllocator* alloc = static_cast(ud); - - void* result = malloc(nsize); - - // Try to help GC by updating debt ourselves - alloc->lua_state->getGCDebtRef() -= nsize; // ERROR: Lua does this! - - return result; -} -``` - -**Why it fails**: -- Lua already updates GC debt in `luaM_malloc_` / `luaM_realloc_` / `luaM_free_` -- Updating it again causes double-counting -- GC will run at wrong times - -**✅ Fix**: Never touch GC debt in allocator -```cpp -void* my_alloc(void* ud, void* ptr, size_t osize, size_t nsize) { - // Just allocate/free - Lua handles GC debt - if (nsize == 0) { - free(ptr); - return nullptr; - } - return realloc(ptr, nsize); -} -``` - ---- - -### ❌ Pitfall 4: Incorrect Size Tracking - -**Problem**: -```cpp -std::unordered_map size_map; - -void* my_alloc(void* ud, void* ptr, size_t osize, size_t nsize) { - if (nsize == 0) { - // Use tracked size instead of osize - size_t real_size = size_map[ptr]; // ERROR: osize is correct! - free(ptr); - size_map.erase(ptr); - return nullptr; - } - - void* result = malloc(nsize); - size_map[result] = nsize; - return result; -} -``` - -**Why it fails**: -- Lua always provides correct `osize` for deallocation -- Tracking sizes wastes memory and time -- Can get out of sync - -**✅ Fix**: Trust osize parameter -```cpp -void* my_alloc(void* ud, void* ptr, size_t osize, size_t nsize) { - if (nsize == 0) { - // Just use osize - Lua tracks this correctly - free(ptr); - return nullptr; - } - return realloc(ptr, nsize); -} -``` - ---- - -### ❌ Pitfall 5: Not Handling NULL Returns - -**Problem**: -```cpp -// In Lua code -Table* t = luaH_new(L); // May return NULL if allocation fails! -t->set(L, key, value); // CRASH: Dereferencing NULL -``` - -**Why it fails**: -- Custom allocator may return NULL more often (e.g., pool exhausted) -- Lua code must handle allocation failures -- Most Lua code uses `luaM_saferealloc_` which throws on failure - -**✅ Fix**: Use safe allocation wrappers -```cpp -// For critical allocations, use safe version -Table* t = static_cast(luaM_malloc_(L, sizeof(Table), LUA_TTABLE)); -if (t == nullptr) { - // Emergency GC already tried, still failed - luaM_error(L); // Throws exception -} - -// Or use luaM_saferealloc_ which handles errors -Table* t = static_cast( - luaM_saferealloc_(L, nullptr, 0, sizeof(Table)) -); -// Never returns NULL - throws on failure -``` - ---- - -### ✅ Best Practice 1: Use Fallback Allocator - -```cpp -class SafeCustomAllocator { - lua_Alloc fallback; - void* fallback_ud; - - void* allocate(size_t size) { - // Try custom allocation - void* result = try_pool_allocate(size); - - // Fall back if pool exhausted - if (result == nullptr) { - result = fallback(fallback_ud, nullptr, 0, size); - } - - return result; - } -}; -``` - ---- - -### ✅ Best Practice 2: Align Allocations - -```cpp -void* my_alloc(void* ud, void* ptr, size_t osize, size_t nsize) { - if (nsize == 0) { - free(ptr); - return nullptr; - } - - // Align to 16 bytes (good for SSE, cache, etc.) - size_t aligned_size = (nsize + 15) & ~15; - - return aligned_alloc(16, aligned_size); -} -``` - ---- - -### ✅ Best Practice 3: Profile Before Optimizing - -```cpp -// Don't guess - measure! -struct AllocatorStats { - size_t alloc_count[10]; // Histogram by size class - size_t realloc_count; - size_t free_count; - size_t total_bytes; - size_t peak_bytes; - - void dump() { - printf("Allocations by size class:\n"); - for (int i = 0; i < 10; i++) { - printf(" %d-%d bytes: %zu\n", - i * 32, (i+1) * 32, alloc_count[i]); - } - printf("Total reallocations: %zu\n", realloc_count); - printf("Total frees: %zu\n", free_count); - printf("Peak memory: %zu bytes\n", peak_bytes); - } -}; -``` - ---- - -### ✅ Best Practice 4: Test Thoroughly - -**Test checklist**: -- [ ] All testes/ pass -- [ ] Benchmark meets target (≤2.21s) -- [ ] No memory leaks (valgrind clean) -- [ ] No ASAN errors -- [ ] No UBSAN errors -- [ ] Stress tests pass -- [ ] Fragmentation acceptable -- [ ] Long-running tests pass (no slow leaks) -- [ ] Emergency GC works correctly -- [ ] Out-of-memory handling works - ---- - -## Example: Complete Tracking Allocator - -```cpp -// tracking_alloc.h -#ifndef tracking_alloc_h -#define tracking_alloc_h - -#include "lua.h" -#include -#include - -struct TrackingAllocator { - lua_Alloc base; - void* base_ud; - - // Statistics - size_t total_alloc; - size_t total_free; - size_t current_usage; - size_t peak_usage; - size_t alloc_count; - size_t free_count; - size_t realloc_count; - - // Size histogram - size_t size_histogram[10]; // 0-32, 32-64, ..., 288-320, 320+ - - // Optional: Track individual allocations - std::unordered_map allocations; - - TrackingAllocator(lua_Alloc base_alloc, void* ud) - : base(base_alloc), base_ud(ud), - total_alloc(0), total_free(0), current_usage(0), peak_usage(0), - alloc_count(0), free_count(0), realloc_count(0) { - for (int i = 0; i < 10; i++) size_histogram[i] = 0; - } - - void dump_stats() { - printf("=== Allocator Statistics ===\n"); - printf("Total allocated: %zu bytes (%zu calls)\n", total_alloc, alloc_count); - printf("Total freed: %zu bytes (%zu calls)\n", total_free, free_count); - printf("Total reallocations: %zu calls\n", realloc_count); - printf("Current usage: %zu bytes\n", current_usage); - printf("Peak usage: %zu bytes\n", peak_usage); - printf("\nSize distribution:\n"); - const char* labels[] = { - "0-32", "32-64", "64-96", "96-128", "128-160", - "160-192", "192-224", "224-256", "256-288", "288+" - }; - for (int i = 0; i < 10; i++) { - printf(" %s bytes: %zu allocations\n", labels[i], size_histogram[i]); - } - - if (!allocations.empty()) { - printf("\nLEAK DETECTED: %zu blocks not freed\n", allocations.size()); - size_t leaked = 0; - for (const auto& pair : allocations) { - leaked += pair.second; - } - printf("Total leaked: %zu bytes\n", leaked); - } - } -}; - -static void* tracking_alloc(void* ud, void* ptr, size_t osize, size_t nsize) { - auto* tracker = static_cast(ud); - - // Delegate to base allocator - void* result = tracker->base(tracker->base_ud, ptr, osize, nsize); - - // Update statistics - if (nsize == 0) { - // Deallocation - tracker->total_free += osize; - tracker->current_usage -= osize; - tracker->free_count++; - tracker->allocations.erase(ptr); - } else if (ptr == nullptr) { - // Allocation - if (result != nullptr) { - tracker->total_alloc += nsize; - tracker->current_usage += nsize; - tracker->alloc_count++; - - // Update histogram - size_t bucket = nsize / 32; - if (bucket >= 10) bucket = 9; - tracker->size_histogram[bucket]++; - - if (tracker->current_usage > tracker->peak_usage) { - tracker->peak_usage = tracker->current_usage; - } - - tracker->allocations[result] = nsize; - } - } else { - // Reallocation - tracker->realloc_count++; - if (result != nullptr) { - tracker->total_alloc += nsize; - tracker->total_free += osize; - tracker->current_usage += (nsize - osize); - - if (tracker->current_usage > tracker->peak_usage) { - tracker->peak_usage = tracker->current_usage; - } - - tracker->allocations.erase(ptr); - tracker->allocations[result] = nsize; - } - } - - return result; -} - -#endif -``` - -**Usage**: -```cpp -// In your application -#include "tracking_alloc.h" - -int main() { - // Create tracking allocator wrapping default allocator - TrackingAllocator tracker(l_alloc, nullptr); - - // Create Lua state with tracking - lua_State* L = lua_newstate(tracking_alloc, &tracker); - - // Run your Lua code - luaL_dofile(L, "test.lua"); - - // Dump statistics - tracker.dump_stats(); - - // Clean up - lua_close(L); - - return 0; -} -``` - ---- - -## Summary - -### Key Takeaways - -1. **Interface**: Implement `lua_Alloc` signature handling 3 operations -2. **GC Integration**: Never modify GC debt - Lua handles it -3. **Reentrancy**: Allocator WILL be called recursively during emergency GC -4. **Testing**: Validate with full test suite + benchmarks + sanitizers -5. **Performance**: Profile first, optimize second - target ≤2.21s -6. **Fallback**: Always have a fallback allocator for edge cases - -### Recommended Approach - -**Phase 1** (Week 1): Implement tracking allocator, profile workload -**Phase 2** (Week 2): Implement chosen strategy (pool/tiered/tagged) -**Phase 3** (Week 3): Integrate, test, benchmark -**Phase 4** (Week 4): Tune, document, deploy - -### Success Criteria - -✅ All tests pass (testes/all.lua) -✅ Performance ≤2.21s (≤1% regression) -✅ No memory leaks (valgrind clean) -✅ No sanitizer errors (ASAN/UBSAN) -✅ Code reviewed and documented - ---- - -**Next Steps**: Start with Phase 1 (Tracking & Profiling) to understand your specific allocation patterns, then choose the appropriate allocator strategy. diff --git a/docs/ENCAPSULATION_PLAN.md b/docs/ENCAPSULATION_PLAN.md deleted file mode 100644 index 9657f59f..00000000 --- a/docs/ENCAPSULATION_PLAN.md +++ /dev/null @@ -1,678 +0,0 @@ -# ✅ HISTORICAL - Encapsulation Continuation Plan (COMPLETED) - -**Status**: ✅ **COMPLETE** - All phases finished -**Completion Date**: November 2025 -**Result**: 19/19 classes fully encapsulated with private fields - ---- - -# Encapsulation Continuation Plan - -## IMPORTANT: Commit After Every Phase! ⚠️ - -After completing each encapsulation phase: -1. ✅ Build successfully -2. ✅ Run full test suite -3. ✅ **COMMIT immediately with descriptive message** -4. Move to next phase - -This ensures clean history and easy rollback if needed. - ---- - -## Current Status Summary - -**Fully Encapsulated (13/19 classes - 68%):** -1. ✅ LocVar - All fields private -2. ✅ AbsLineInfo - All fields private -3. ✅ Upvaldesc - All fields private -4. ✅ stringtable - All fields private -5. ✅ GCObject - Protected fields (base class) -6. ✅ TString - All fields private -7. ✅ Table - All fields private -8. ✅ Proto - All fields private (Phase 32) -9. ✅ UpVal - All fields private (Phase 33) -10. ✅ CClosure - All fields private (Phase 34) -11. ✅ LClosure - All fields private (Phase 34) -12. ✅ expdesc - All fields private (Phase 35) ⬅️ JUST COMPLETED -13. ✅ CallInfo - All fields private (Phase 36) ⬅️ JUST COMPLETED - -**Remaining Classes (6):** -- Udata (low risk) -- Udata0 (trivial) -- FuncState (medium risk) -- LexState (medium risk) -- global_State (high risk) -- lua_State (HIGHEST risk) - -**Current Performance**: 2.11s (test suite, 3% better than 2.17s baseline!) -**Performance Target**: ≤2.21s (strict requirement) - ---- - -## Phase 37: FuncState Encapsulation - -**Risk Level**: MEDIUM -**Estimated Time**: 2-3 hours -**Estimated Call Sites**: ~50-100 -**Files**: `src/compiler/lcode.cpp`, `src/compiler/lparser.cpp` - -Currently has 6 inline accessors, need to make all fields private and add comprehensive accessors. - ---- - -## Phase 38: LexState Encapsulation - -**Risk Level**: MEDIUM -**Estimated Time**: 2-3 hours -**Estimated Call Sites**: ~50-100 -**Files**: `src/compiler/llex.cpp`, `src/compiler/lparser.cpp` - -Currently has 4 inline accessors, need to make all fields private and add comprehensive accessors. - ---- - -## Phase 39: Udata Encapsulation - -**Risk Level**: LOW -**Estimated Time**: 1-2 hours -**Estimated Call Sites**: 10-20 - -### Current State -```cpp -class Udata : public GCBase { -public: // ← NEED TO MAKE PRIVATE - unsigned short nuvalue; - size_t len; - struct Table *metatable; - GCObject *gclist; - UValue uv[1]; -``` - -### Target State -```cpp -class Udata : public GCBase { -private: - unsigned short nuvalue; - size_t len; - Table *metatable; - GCObject *gclist; - UValue uv[1]; - -public: - // Existing accessors (keep) - size_t getLen() const noexcept { return len; } - unsigned short getNumUserValues() const noexcept { return nuvalue; } - Table* getMetatable() const noexcept { return metatable; } - void setMetatable(Table* mt) noexcept { metatable = mt; } - UValue* getUserValue(int idx) noexcept { return &uv[idx]; } - const UValue* getUserValue(int idx) const noexcept { return &uv[idx]; } - void* getMemory() noexcept; - const void* getMemory() const noexcept; - - // New accessors needed - void setLen(size_t l) noexcept { len = l; } - void setNumUserValues(unsigned short n) noexcept { nuvalue = n; } - - GCObject* getGclist() noexcept { return gclist; } - void setGclist(GCObject* gc) noexcept { gclist = gc; } - GCObject** getGclistPtr() noexcept { return &gclist; } - - // For initialization (luaS_newudata) - Table** getMetatablePtr() noexcept { return &metatable; } -}; -``` - -### Update Strategy -1. Add missing accessors to Udata class in lobject.h -2. Make fields private -3. Update call sites: - - `src/objects/lstring.cpp` (luaS_newudata, udata2finalize) - - `src/memory/lgc.cpp` (GC traversal) - - `src/core/lapi.cpp` (API functions) -4. Build, test, benchmark -5. Commit if successful - ---- - -## Phase 40: global_State Encapsulation - -**Risk Level**: HIGH -**Estimated Time**: 4-6 hours -**Estimated Call Sites**: 100+ - -### Current State -```cpp -class global_State { -public: // ← 46+ FIELDS ALL PUBLIC - lua_Alloc frealloc; - void *ud; - l_mem GCtotalbytes; - l_mem GCdebt; - // ... 42 more fields -``` - -### Target State -```cpp -class global_State { -private: // ← ALL FIELDS PRIVATE - lua_Alloc frealloc; - void *ud; - l_mem GCtotalbytes; - l_mem GCdebt; - stringtable strt; - TValue l_registry; - TValue nilvalue; - unsigned int seed; - lu_byte gcparams[LUA_GCPN]; - lu_byte currentwhite; - lu_byte gcstate; - lu_byte gckind; - lu_byte gcstopem; - lu_byte gcstp; - lu_byte gcemergency; - // GC object lists - GCObject *allgc; - GCObject **sweepgc; - GCObject *finobj; - GCObject *gray; - GCObject *grayagain; - GCObject *weak; - GCObject *ephemeron; - GCObject *allweak; - GCObject *tobefnz; - GCObject *fixedgc; - GCObject *survival; - GCObject *old1; - GCObject *reallyold; - GCObject *firstold1; - GCObject *finobjsur; - GCObject *finobjold1; - GCObject *finobjrold; - lua_State *twups; - lua_CFunction panic; - TString *memerrmsg; - TString *tmname[TM_N]; - Table *mt[LUA_NUMTYPES]; - TString *strcache[STRCACHE_N][STRCACHE_M]; - lua_WarnFunction warnf; - void *ud_warn; - LX mainth; - -public: - // Memory allocator - lua_Alloc getAllocator() const noexcept { return frealloc; } - void setAllocator(lua_Alloc f) noexcept { frealloc = f; } - void* getUserData() const noexcept { return ud; } - void setUserData(void* data) noexcept { ud = data; } - - // GC memory counters (need reference accessors) - l_mem getTotalBytes() const noexcept { return GCtotalbytes; } - void setTotalBytes(l_mem bytes) noexcept { GCtotalbytes = bytes; } - l_mem& getTotalBytesRef() noexcept { return GCtotalbytes; } - - l_mem getDebt() const noexcept { return GCdebt; } - void setDebt(l_mem debt) noexcept { GCdebt = debt; } - l_mem& getDebtRef() noexcept { return GCdebt; } - - l_mem getMarked() const noexcept { return GCmarked; } - void setMarked(l_mem m) noexcept { GCmarked = m; } - l_mem& getMarkedRef() noexcept { return GCmarked; } - - l_mem getMajorMinor() const noexcept { return GCmajorminor; } - void setMajorMinor(l_mem mm) noexcept { GCmajorminor = mm; } - l_mem& getMajorMinorRef() noexcept { return GCmajorminor; } - - // String table - stringtable* getStringTable() noexcept { return &strt; } - const stringtable* getStringTable() const noexcept { return &strt; } - - // Registry and special values - TValue* getRegistry() noexcept { return &l_registry; } - const TValue* getRegistry() const noexcept { return &l_registry; } - TValue* getNilValue() noexcept { return &nilvalue; } - const TValue* getNilValue() const noexcept { return &nilvalue; } - - // Random seed - unsigned int getSeed() const noexcept { return seed; } - void setSeed(unsigned int s) noexcept { seed = s; } - unsigned int& getSeedRef() noexcept { return seed; } - - // GC parameters - lu_byte getGCParam(int idx) const noexcept { return gcparams[idx]; } - void setGCParam(int idx, lu_byte val) noexcept { gcparams[idx] = val; } - lu_byte* getGCParams() noexcept { return gcparams; } - - // GC state - lu_byte getCurrentWhite() const noexcept { return currentwhite; } - void setCurrentWhite(lu_byte w) noexcept { currentwhite = w; } - lu_byte& getCurrentWhiteRef() noexcept { return currentwhite; } - - lu_byte getGCState() const noexcept { return gcstate; } - void setGCState(lu_byte state) noexcept { gcstate = state; } - lu_byte& getGCStateRef() noexcept { return gcstate; } - - lu_byte getGCKind() const noexcept { return gckind; } - void setGCKind(lu_byte kind) noexcept { gckind = kind; } - - lu_byte getGCStopEM() const noexcept { return gcstopem; } - void setGCStopEM(lu_byte stop) noexcept { gcstopem = stop; } - lu_byte& getGCStopEMRef() noexcept { return gcstopem; } - - lu_byte getGCStp() const noexcept { return gcstp; } - void setGCStp(lu_byte stp) noexcept { gcstp = stp; } - - lu_byte getGCEmergency() const noexcept { return gcemergency; } - void setGCEmergency(lu_byte em) noexcept { gcemergency = em; } - - // GC object lists (need both value and pointer accessors) - GCObject* getAllGC() const noexcept { return allgc; } - void setAllGC(GCObject* gc) noexcept { allgc = gc; } - GCObject** getAllGCPtr() noexcept { return &allgc; } - - GCObject** getSweepGC() const noexcept { return sweepgc; } - void setSweepGC(GCObject** sweep) noexcept { sweepgc = sweep; } - GCObject*** getSweepGCPtr() noexcept { return &sweepgc; } - - GCObject* getFinObj() const noexcept { return finobj; } - void setFinObj(GCObject* gc) noexcept { finobj = gc; } - GCObject** getFinObjPtr() noexcept { return &finobj; } - - GCObject* getGray() const noexcept { return gray; } - void setGray(GCObject* gc) noexcept { gray = gc; } - GCObject** getGrayPtr() noexcept { return &gray; } - - GCObject* getGrayAgain() const noexcept { return grayagain; } - void setGrayAgain(GCObject* gc) noexcept { grayagain = gc; } - GCObject** getGrayAgainPtr() noexcept { return &grayagain; } - - GCObject* getWeak() const noexcept { return weak; } - void setWeak(GCObject* gc) noexcept { weak = gc; } - GCObject** getWeakPtr() noexcept { return &weak; } - - GCObject* getEphemeron() const noexcept { return ephemeron; } - void setEphemeron(GCObject* gc) noexcept { ephemeron = gc; } - GCObject** getEphemeronPtr() noexcept { return &ephemeron; } - - GCObject* getAllWeak() const noexcept { return allweak; } - void setAllWeak(GCObject* gc) noexcept { allweak = gc; } - GCObject** getAllWeakPtr() noexcept { return &allweak; } - - GCObject* getToBeFnz() const noexcept { return tobefnz; } - void setToBeFnz(GCObject* gc) noexcept { tobefnz = gc; } - GCObject** getToBeFnzPtr() noexcept { return &tobefnz; } - - GCObject* getFixedGC() const noexcept { return fixedgc; } - void setFixedGC(GCObject* gc) noexcept { fixedgc = gc; } - GCObject** getFixedGCPtr() noexcept { return &fixedgc; } - - // Generational GC lists - GCObject* getSurvival() const noexcept { return survival; } - void setSurvival(GCObject* gc) noexcept { survival = gc; } - GCObject** getSurvivalPtr() noexcept { return &survival; } - - GCObject* getOld1() const noexcept { return old1; } - void setOld1(GCObject* gc) noexcept { old1 = gc; } - GCObject** getOld1Ptr() noexcept { return &old1; } - - GCObject* getReallyOld() const noexcept { return reallyold; } - void setReallyOld(GCObject* gc) noexcept { reallyold = gc; } - GCObject** getReallyOldPtr() noexcept { return &reallyold; } - - GCObject* getFirstOld1() const noexcept { return firstold1; } - void setFirstOld1(GCObject* gc) noexcept { firstold1 = gc; } - GCObject** getFirstOld1Ptr() noexcept { return &firstold1; } - - GCObject* getFinObjSur() const noexcept { return finobjsur; } - void setFinObjSur(GCObject* gc) noexcept { finobjsur = gc; } - GCObject** getFinObjSurPtr() noexcept { return &finobjsur; } - - GCObject* getFinObjOld1() const noexcept { return finobjold1; } - void setFinObjOld1(GCObject* gc) noexcept { finobjold1 = gc; } - GCObject** getFinObjOld1Ptr() noexcept { return &finobjold1; } - - GCObject* getFinObjROld() const noexcept { return finobjrold; } - void setFinObjROld(GCObject* gc) noexcept { finobjrold = gc; } - GCObject** getFinObjROldPtr() noexcept { return &finobjrold; } - - // Thread list - lua_State* getTwups() const noexcept { return twups; } - void setTwups(lua_State* th) noexcept { twups = th; } - lua_State** getTwupsPtr() noexcept { return &twups; } - - // Panic handler - lua_CFunction getPanic() const noexcept { return panic; } - void setPanic(lua_CFunction p) noexcept { panic = p; } - - // Memory error message - TString* getMemErrMsg() const noexcept { return memerrmsg; } - void setMemErrMsg(TString* msg) noexcept { memerrmsg = msg; } - TString** getMemErrMsgPtr() noexcept { return &memerrmsg; } - - // Tag method names - TString* getTMName(int tm) const noexcept { return tmname[tm]; } - void setTMName(int tm, TString* name) noexcept { tmname[tm] = name; } - TString** getTMNamePtr(int tm) noexcept { return &tmname[tm]; } - - // Metatables - Table* getMetatable(int type) const noexcept { return mt[type]; } - void setMetatable(int type, Table* t) noexcept { mt[type] = t; } - Table** getMetatablePtr(int type) noexcept { return &mt[type]; } - - // String cache - TString* getStrCache(int n, int m) const noexcept { return strcache[n][m]; } - void setStrCache(int n, int m, TString* s) noexcept { strcache[n][m] = s; } - TString** getStrCachePtr(int n, int m) noexcept { return &strcache[n][m]; } - - // Warning function - lua_WarnFunction getWarnF() const noexcept { return warnf; } - void setWarnF(lua_WarnFunction w) noexcept { warnf = w; } - void* getUDWarn() const noexcept { return ud_warn; } - void setUDWarn(void* ud) noexcept { ud_warn = ud; } - - // Main thread - LX* getMainThread() noexcept { return &mainth; } - const LX* getMainThread() const noexcept { return &mainth; } -}; -``` - -### Update Strategy (Batched) -1. Add all ~100 accessors to global_State class -2. Make fields private -3. Update call sites in batches: - - **Batch 1**: `src/memory/lgc.cpp` (GC list manipulation - most accesses) - - **Batch 2**: `src/core/lstate.cpp` (initialization/cleanup) - - **Batch 3**: `src/objects/lstring.cpp` (string table access) - - **Batch 4**: `src/core/lapi.cpp` (API functions) - - **Batch 5**: Remaining files -4. Build and test after EACH batch -5. Final benchmark after all batches -6. Commit if performance ≤2.21s - -**Critical**: Use pointer accessors (e.g., `getAllGCPtr()`) in GC code to avoid copies. - ---- - -## Phase 41: lua_State Encapsulation - -**Risk Level**: EXTREME ⚠️ -**Estimated Time**: 1 week -**Estimated Call Sites**: 200-300+ - -### Current State -```cpp -class lua_State : public GCBase { -public: // ← 27 FIELDS ALL PUBLIC (MOST CRITICAL CLASS) - lu_byte allowhook; - TStatus status; - StkIdRel top; - global_State *l_G; - CallInfo *ci; - // ... 22 more fields -``` - -### Target State -```cpp -class lua_State : public GCBase { -private: // ← ALL FIELDS PRIVATE - lu_byte allowhook; - TStatus status; - StkIdRel top; - global_State *l_G; - CallInfo *ci; - StkIdRel stack_last; - StkIdRel stack; - UpVal *openupval; - StkIdRel tbclist; - GCObject *gclist; - lua_State *twups; - lua_longjmp *errorJmp; - CallInfo base_ci; - volatile lua_Hook hook; - ptrdiff_t errfunc; - l_uint32 nCcalls; - int oldpc; - int nci; - int basehookcount; - int hookcount; - volatile l_signalT hookmask; - struct { - int ftransfer; - int ntransfer; - } transferinfo; - -public: - // Keep existing 3 accessors - global_State* getGlobalState() const noexcept { return l_G; } - CallInfo* getCallInfo() const noexcept { return ci; } - TStatus getStatus() const noexcept { return status; } - - // Stack accessors (reference for hot paths) - StkIdRel& topRef() noexcept { return top; } - const StkIdRel& topRef() const noexcept { return top; } - - StkIdRel& stackRef() noexcept { return stack; } - const StkIdRel& stackRef() const noexcept { return stack; } - - StkIdRel& stackLastRef() noexcept { return stack_last; } - const StkIdRel& stackLastRef() const noexcept { return stack_last; } - - // CallInfo (hot path) - CallInfo*& ciRef() noexcept { return ci; } - CallInfo* const& ciRef() const noexcept { return ci; } - void setCallInfo(CallInfo* newci) noexcept { ci = newci; } - - // Allow hook (hot path) - lu_byte getAllowHook() const noexcept { return allowhook; } - void setAllowHook(lu_byte ah) noexcept { allowhook = ah; } - lu_byte& allowHookRef() noexcept { return allowhook; } - - // Status - void setStatus(TStatus st) noexcept { status = st; } - - // Open upvalues - UpVal* getOpenUpval() const noexcept { return openupval; } - void setOpenUpval(UpVal* uv) noexcept { openupval = uv; } - UpVal** getOpenUpvalPtr() noexcept { return &openupval; } - - // TBC list - StkIdRel& tbclistRef() noexcept { return tbclist; } - const StkIdRel& tbclistRef() const noexcept { return tbclist; } - - // GC list - GCObject* getGclist() const noexcept { return gclist; } - void setGclist(GCObject* gc) noexcept { gclist = gc; } - GCObject** getGclistPtr() noexcept { return &gclist; } - - // Thread list - lua_State* getTwups() const noexcept { return twups; } - void setTwups(lua_State* th) noexcept { twups = th; } - lua_State** getTwupsPtr() noexcept { return &twups; } - - // Error jump - lua_longjmp* getErrorJmp() const noexcept { return errorJmp; } - void setErrorJmp(lua_longjmp* ej) noexcept { errorJmp = ej; } - lua_longjmp** getErrorJmpPtr() noexcept { return &errorJmp; } - - // Base call info - CallInfo* getBaseCI() noexcept { return &base_ci; } - const CallInfo* getBaseCI() const noexcept { return &base_ci; } - - // Hook - lua_Hook getHook() const noexcept { return hook; } - void setHook(lua_Hook h) noexcept { hook = h; } - volatile lua_Hook& getHookRef() noexcept { return hook; } - - // Error function - ptrdiff_t getErrFunc() const noexcept { return errfunc; } - void setErrFunc(ptrdiff_t ef) noexcept { errfunc = ef; } - ptrdiff_t& getErrFuncRef() noexcept { return errfunc; } - - // nCcalls (hot path - may need reference) - l_uint32 getNCcalls() const noexcept { return nCcalls; } - void setNCcalls(l_uint32 n) noexcept { nCcalls = n; } - l_uint32& nCcallsRef() noexcept { return nCcalls; } - - // Old PC - int getOldPC() const noexcept { return oldpc; } - void setOldPC(int pc) noexcept { oldpc = pc; } - int& oldPCRef() noexcept { return oldpc; } - - // NCI (call info count) - int getNCI() const noexcept { return nci; } - void setNCI(int n) noexcept { nci = n; } - int& nciRef() noexcept { return nci; } - - // Hook counts - int getBaseHookCount() const noexcept { return basehookcount; } - void setBaseHookCount(int c) noexcept { basehookcount = c; } - int& baseHookCountRef() noexcept { return basehookcount; } - - int getHookCount() const noexcept { return hookcount; } - void setHookCount(int c) noexcept { hookcount = c; } - int& hookCountRef() noexcept { return hookcount; } - - // Hook mask - l_signalT getHookMask() const noexcept { return hookmask; } - void setHookMask(l_signalT m) noexcept { hookmask = m; } - volatile l_signalT& getHookMaskRef() noexcept { return hookmask; } - - // Transfer info - int getFTransfer() const noexcept { return transferinfo.ftransfer; } - void setFTransfer(int ft) noexcept { transferinfo.ftransfer = ft; } - int& fTransferRef() noexcept { return transferinfo.ftransfer; } - - int getNTransfer() const noexcept { return transferinfo.ntransfer; } - void setNTransfer(int nt) noexcept { transferinfo.ntransfer = nt; } - int& nTransferRef() noexcept { return transferinfo.ntransfer; } - - // Keep all existing methods (30+) -}; -``` - -### Update Strategy (ULTRA CONSERVATIVE) - -**CRITICAL**: Must benchmark after EVERY small batch (10-20 call sites) - -1. Add ~40 accessors to lua_State class -2. Make fields private -3. Update call sites in VERY SMALL batches: - - **Phase 37a**: Non-hot paths (50-70 call sites) - - `src/core/lapi.cpp` (non-critical API functions) - - `src/libraries/*.cpp` (standard libraries) - - Build, test, **benchmark** ✓ - - **Phase 37b**: Medium-hot paths (50-70 call sites) - - `src/core/lapi.cpp` (critical API functions) - - `src/auxiliary/lauxlib.cpp` - - Build, test, **benchmark** ✓ - - **Phase 37c**: Hot path - ldo.cpp (50-70 call sites) - - `src/core/ldo.cpp` (call/return/error handling) - - **BENCHMARK AFTER EVERY 10-20 CHANGES** ⚠️ - - Build, test, **benchmark** ✓ - - **Phase 37d**: ULTRA HOT - lvm.cpp (30-50 call sites) - - `src/vm/lvm.cpp` (VM interpreter loop) - - **THIS IS THE MOST CRITICAL FILE** - - **BENCHMARK AFTER EVERY 5-10 CHANGES** ⚠️⚠️⚠️ - - Use reference accessors exclusively - - Zero-cost abstraction is MANDATORY - - Build, test, **benchmark** ✓ - -4. **If ANY batch shows regression > 2.21s: REVERT IMMEDIATELY** - -5. Final comprehensive benchmark (10 runs minimum) - -6. Commit only if performance ≤2.21s - ---- - -## Success Criteria - -**Phase 37 (FuncState):** -- ✅ All fields private -- ✅ Tests pass -- ✅ Performance ≤2.21s -- ✅ Commit immediately - -**Phase 38 (LexState):** -- ✅ All fields private -- ✅ Tests pass -- ✅ Performance ≤2.21s -- ✅ Commit immediately - -**Phase 39 (Udata):** -- ✅ All fields private -- ✅ Tests pass -- ✅ Performance ≤2.21s -- ✅ Commit immediately - -**Phase 40 (global_State):** -- ✅ All 46+ fields private -- ✅ Tests pass -- ✅ Performance ≤2.21s -- ✅ Commit immediately - -**Phase 41 (lua_State):** -- ✅ All 27 fields private -- ✅ Tests pass -- ✅ Performance ≤2.21s -- ✅ **100% ENCAPSULATION COMPLETE** 🎉 -- ✅ Commit immediately - ---- - -## Performance Monitoring - -**Current**: 2.14s (3% better than 2.17s baseline) -**Target**: ≤2.21s (≤1% regression from baseline) - -**Benchmark command:** -```bash -cd /home/peter/claude/lua/testes -for i in 1 2 3 4 5; do ../build/lua all.lua 2>&1 | grep "total time:"; done -``` - -**Frequency:** -- Udata: After completion -- Parser classes: After each class -- global_State: After each batch -- lua_State: After EVERY 5-20 call sites (depending on hotness) - ---- - -## Risks and Mitigations - -**Risk 1**: Performance regression in VM hot paths -- **Mitigation**: Use reference accessors, inline everything, benchmark frequently - -**Risk 2**: Many call sites to update (300+) -- **Mitigation**: Batch updates, test after each batch, ready to revert - -**Risk 3**: Complex field access patterns (pointer-to-pointer) -- **Mitigation**: Provide both value and pointer accessors - -**Risk 4**: Union field access in CallInfo, UpVal -- **Mitigation**: Already handled, use existing pattern - ---- - -## Completion Timeline - -- **Phase 37 (FuncState)**: 2-3 hours -- **Phase 38 (LexState)**: 2-3 hours -- **Phase 39 (Udata)**: 1-2 hours -- **Phase 40 (global_State)**: 4-6 hours -- **Phase 41 (lua_State)**: 1 week (very careful, incremental) - -**Total Estimated**: 2-3 weeks for complete encapsulation - -**Current Progress**: 68% (13/19 classes) -**Remaining**: 32% (6 classes) - ---- - -**Last Updated**: After Phase 36 (CallInfo encapsulation) -**Next Step**: Phase 37 (FuncState encapsulation) diff --git a/docs/HISTORY.md b/docs/HISTORY.md deleted file mode 100644 index 05948d90..00000000 --- a/docs/HISTORY.md +++ /dev/null @@ -1,610 +0,0 @@ -# Lua C++ Conversion Project - Phase History - -**Last Updated**: 2025-11-21 -**Status**: Archive of completed phases - ---- - -## Overview - -This document archives the detailed history of all 119 phases completed in the Lua C++ conversion project. For current status and next steps, see [CLAUDE.md](../CLAUDE.md). - ---- - -## Phase Summary by Era - -### Era 1: Foundation (Phases 1-50) -- Struct → class conversions -- Initial encapsulation -- Constructor initialization -- Basic CRTP setup - -### Era 2: Encapsulation (Phases 37-89) -- Complete private field migration -- Accessor method creation -- Method conversion from free functions - -### Era 3: SRP Refactoring (Phases 90-92) -- **Phase 90**: FuncState (16 fields → 5 subsystems) -- **Phase 91**: global_State (46+ fields → 7 subsystems) -- **Phase 92**: Proto (19 fields → 2 logical groups) -- **Result**: 6% performance improvement - -### Era 4: LuaStack Centralization (Phase 94) -- Complete stack encapsulation -- 96 conversion sites across 15+ files -- All stack operations through LuaStack class -- **Result**: Zero performance regression - -### Era 5: Enum Class Modernization (Phases 96-100) -- BinOpr, UnOpr enum classes -- F2Imod, OpMode, TMS, RESERVED -- Type-safe operator handling - -### Era 6: GC Modularization (Phase 101) -- 6 focused GC modules extracted -- lgc.cpp: 1,950 lines → 936 lines (52% reduction) -- CI/CD infrastructure setup -- **Result**: 40% code organization improvement - -### Era 7: Cast Modernization (Phases 102-111) -- **Phase 102-103**: Numeric and pointer casts (23 instances) -- **Phase 107-110**: Eliminated 14+ `const_cast` uses -- **Phase 111**: Replaced 48 `cast()` macro instances -- **Result**: 100% modern C++ casts - -### Era 8: Type Safety (Phases 112-119) -- **Phase 112**: std::span accessors, operator type safety -- **Phase 113**: Boolean return types (7 functions) -- **Phase 114**: NULL → nullptr codebase-wide -- **Phase 115**: std::span adoption (partial, 60+ sites) -- **Phase 116**: Dyndata span + UB fixes -- **Phase 117**: More boolean conversions (5 functions) -- **Phase 118**: [[nodiscard]] + safety hardening -- **Phase 119**: std::array conversion (4 arrays) - ---- - -## Detailed Phase Breakdown - -### Phase 1-2: Constructor Initialization -**Date**: Nov 16, 2025 -**Performance**: 4.20s avg (new baseline) - -#### Phase 1: CallInfo Constructor -- Fixed CRITICAL BUG: 5/9 fields uninitialized (undefined behavior) -- Added CallInfo() noexcept constructor -- Updated luaE_extendCI to use placement new -- Zero warnings, all tests passing - -#### Phase 2: lua_State init() Method -- Added init(global_State*) method -- Consolidated initialization (27+ fields) -- Uses placement new for base_ci -- Simplified preinit_thread() implementation - ---- - -### Phase 90-92: SRP Refactoring -**Date**: Nov 15, 2025 -**Performance**: 2.04-2.18s avg (historical baseline 2.17s) - -#### Phase 90: FuncState SRP -- 16 fields → 5 subsystems -- CodeBuffer, ConstantPool, VariableScope, RegisterAllocator, UpvalueTracker -- Performance: 2.04s avg (6% faster!) -- Net: +238 insertions, -84 deletions - -#### Phase 91: global_State SRP -- 46+ fields → 7 subsystems -- MemoryAllocator, GCAccounting, GCParameters, GCObjectLists, StringCache, TypeSystem, RuntimeServices -- Performance: 2.18s avg (baseline maintained) -- Net: +409 insertions, -181 deletions - -#### Phase 92: Proto SRP -- 19 fields → 2 logical groups -- Runtime data + ProtoDebugInfo subsystem -- Performance: 2.01s avg (8% faster!) -- Net: +149 insertions, -85 deletions - -**Total Impact**: Dramatically improved code organization, zero performance regression (actually faster!) - ---- - -### Phase 94: LuaStack Aggressive Centralization -**Date**: Nov 17, 2025 -**Performance**: 4.41s avg - -**MAJOR ACHIEVEMENT**: All stack operations now centralized! - -#### Subphases -- 94.1: Added complete LuaStack method suite (25+ methods) -- 94.2: Converted lapi.cpp (~40 sites) -- 94.3: Converted API macros to inline functions -- 94.4: Converted stack checking operations -- 94.5: Converted stack assignments -- 94.6.1-94.6.3: Converted all direct pointer operations (96 sites) - - lapi.cpp, ldo.cpp, lundump, ldump, lobject, parseutils, parser - - lvm_table, lvm_string, ltable, lfunc, llex - - lstate, lgc, ltm, ldebug - - **lvm.cpp (VM hot path)** - 22 critical conversions -- 94.7: Removed deprecated code -- 94.8: Documentation complete - -#### Key Methods -- `push()`, `pop()`, `popN()`, `adjust()` - Basic stack manipulation -- `setTopPtr()`, `setTopOffset()` - Top pointer management -- `indexToValue()`, `indexToStack()` - API index conversion -- `ensureSpace()`, `ensureSpaceP()` - Stack growth with pointer preservation -- `setSlot()`, `copySlot()`, `setNil()` - GC-aware assignments -- `save()`, `restore()` - Pointer/offset conversion for reallocation -- `grow()`, `shrink()`, `realloc()` - Stack memory management - -#### Architecture -- Single Responsibility - LuaStack owns ALL stack operations -- Full encapsulation - All stack fields private -- Inline methods - Zero function call overhead -- Type safety - Strong boundaries between subsystems - -**Total Impact**: Complete stack encapsulation, improved maintainability, zero performance regression! - ---- - -### Phase 96-100: Enum Class Modernization -**Date**: Nov 2025 - -#### Phase 96: BinOpr enum class -- Converted binary operator enum to type-safe enum class -- Eliminated magic numbers in operator handling - -#### Phase 97: UnOpr enum class -- Converted unary operator enum to type-safe enum class - -#### Phase 98-100: Additional enum classes -- F2Imod (float-to-int rounding modes) -- OpMode (instruction format modes) -- TMS (tag methods/metamethods) -- RESERVED (reserved keyword tokens) - -**Total Impact**: Improved type safety, better error messages, modern C++ idioms! - ---- - -### Phase 101: GC Modularization & CI/CD -**Date**: Nov 2025 - -**MAJOR ACHIEVEMENT**: Garbage collector fully modularized! - -#### GC Modules Extracted -- `gc_core.cpp/h` - Core GC utilities (132 lines) -- `gc_marking.cpp/h` - Marking phase implementation (429 lines) -- `gc_sweeping.cpp/h` - Sweeping and object freeing (264 lines) -- `gc_finalizer.cpp/h` - Finalization queue management (223 lines) -- `gc_weak.cpp/h` - Ephemeron and weak table handling (345 lines) -- `gc_collector.cpp/h` - GC orchestration and control (348 lines) - -**lgc.cpp reduced**: 1,950 lines → 936 lines (52% reduction!) - -#### CI/CD Infrastructure -- **GitHub Actions workflows** - - Multi-compiler testing (GCC 13, Clang 15) - - Debug and Release configurations - - Sanitizer builds (ASAN + UBSAN) - - Performance regression detection (5.00s threshold) - -- **Code coverage reporting** - - lcov/gcov integration - - HTML coverage reports - - **96.1% line coverage** achieved! - -- **Static analysis** - - cppcheck integration - - clang-tidy checks - - include-what-you-use analysis - -**Total Impact**: 40% code organization improvement, automated quality assurance! - ---- - -### Phase 102-111: Cast Modernization & Const-Correctness -**Date**: Nov 2025 - -#### Phase 102: Numeric cast modernization -- Replaced 11 C-style numeric casts with `static_cast` -- Improved type safety and intent clarity - -#### Phase 103: Pointer cast modernization -- Modernized 12 pointer casts in Table operations -- Used appropriate `static_cast` and `reinterpret_cast` - -#### Phase 107: Const-correctness improvements -- Eliminated 7 `const_cast` uses through proper design -- Used `mutable` for cache fields and internal state - -#### Phase 108: Table::pset API refinement -- Eliminated 3 `const_cast` uses in Table operations -- Cleaner API design with proper const-correctness - -#### Phase 109: NodeArray helper class -- Encapsulated Limbox allocation pattern -- Improved type safety for internal Table structures - -#### Phase 110: Additional const-correctness -- Eliminated 4 more `const_cast` uses with `mutable` -- Proper handling of lazily-computed values - -#### Phase 111: cast() macro elimination -- Replaced 48 instances of `cast()` macro with proper C++ casts -- Final step in complete cast modernization -- All casts now use `static_cast`, `reinterpret_cast`, or `const_cast` appropriately - -**Total Impact**: Complete cast modernization, eliminated 14+ `const_cast` uses, improved const-correctness throughout codebase! - ---- - -### Phase 112: Type Safety & std::span -**Date**: Nov 2025 -**Performance**: 4.33s avg (exactly at target!) 🎯 - -**Multi-part phase with three major improvements:** - -#### Part 0: std::span Accessors to Proto -- Added std::span accessors to Proto and ProtoDebugInfo -- `getCodeSpan()`, `getConstantsSpan()`, `getProtosSpan()`, `getUpvaluesSpan()` -- Debug info span accessors (lineinfo, abslineinfo, locvars) -- Zero-cost abstraction with inline constexpr methods - -#### Part 0.1: Clang Compatibility Fix -- Fixed Clang 15+ sign-conversion errors in span accessors -- Ensured multi-compiler compatibility - -#### Part 1: Operator Type Safety -- Converted `FuncState::prefix/infix/posfix` to use `UnOpr`/`BinOpr` enum classes directly -- Eliminated 6 redundant static_cast operations -- Files: `lparser.h`, `lcode.cpp`, `parser.cpp` - -#### Part 2: InstructionView Encapsulation -- Added opcode property methods: `getOpMode()`, `testAMode()`, `testTMode()`, etc. -- Encapsulated `luaP_opmodes` array access -- Files: `lopcodes.h`, `lopcodes.cpp`, `lcode.cpp`, `ldebug.cpp` - -**Total Impact**: -- std::span integration begun (Proto arrays now have span accessors) -- Type safety: Operators use enum classes directly (no int roundtrips) -- InstructionView: Better encapsulation of VM internals - ---- - -### Phase 113: Boolean Predicates & Loop Modernization -**Date**: Nov 2025 -**Performance**: 4.73s avg - -#### Part A: Loop Modernization -- Modernized loops with C++ standard algorithms -- Range-based for loops where appropriate - -#### Part B: Boolean Return Types (7 functions) -Converted internal predicates from int to bool: - -**Compiler predicates** (lcode.cpp): -- `isKint()` - checks if expression is literal integer -- `isCint()` - checks if integer fits in register C -- `isSCint()` - checks if integer fits in register sC -- `isSCnumber()` - checks if number fits in register -- `validop()` - validates constant folding operation - -**Test-only predicates** (ltests.cpp): -- `testobjref1()` - tests GC object reference invariants -- `testobjref()` - wrapper that prints failed invariants - -**Impact**: Clearer intent, prevents arithmetic on booleans - ---- - -### Phase 114: NULL to nullptr Modernization -**Date**: Nov 2025 -**Performance**: Zero impact - -- Replaced all C-style `NULL` macros with C++11 `nullptr` -- Improved type safety (nullptr has its own type) -- Modern C++ best practice -- Codebase-wide systematic replacement - ---- - -### Phase 115: std::span Adoption (Partial) -**Date**: Nov 21, 2025 -**Performance**: 4.70s avg (regression noted) - -**Multi-part phase with performance concerns:** - -#### Phase 115.1: String Operations -- 7 files modified, 40+ sites converted -- Dual-API pattern: pointer-primary for performance -- Commits: 0aa81ee, 08c8774 - -#### Phase 115.2: Proto Span Accessors -- 2 files modified, 23 sites converted -- ldebug.cpp: 8 conversions -- lundump.cpp: 15 conversions -- Commits: 6f830e7, 943a3ef - -#### Phase 115.3: Table::getArraySpan() -- Status: DEFERRED due to performance concerns -- Minimal implementation added -- Full adoption postponed - -#### Phase 115.4: Undefined Behavior Analysis -- Comprehensive UB audit -- Documentation created -- Critical issues identified and fixed - -**Performance Analysis**: -- Current: 4.70s avg (range: 4.56s-4.87s) -- Target: ≤4.33s -- Regression: 11.9% above baseline -- Status: ⚠️ Above target, needs investigation - -**Benefits Achieved**: -- ✅ Type safety: Size in span type, bounds checking in debug -- ✅ Modern C++: Range-based for loops (13 sites) -- ✅ Maintainability: Reduced pointer arithmetic (23 sites) -- ✅ C API compatibility: Dual-API pattern maintains ABI -- ✅ All tests passing - -**Lessons Learned**: -- "Zero-cost" abstractions can have measurable costs -- Performance measurement after each phase is critical -- Dual-API pattern (span + pointer) works for C compatibility -- Phase 115.2 unexpectedly added 3.7% overhead - ---- - -### Phase 116: Dyndata Span + UB Fixes -**Date**: Nov 21, 2025 -**Performance**: 4.18s avg ✅ - -#### std::span Integration -- Added Dyndata::actvarGetSpan() methods (const and non-const overloads) -- Returns std::span for the actvar array -- Complements existing pointer-based accessors - -#### Context -- Phase 112 already added Proto span accessors -- Phase 115.1 added std::span to buffer/string operations -- Phase 115.3 added Table::getArraySpan() -- Phase 116 completes span integration for compiler data structures - -#### Critical UB Fixes -Multiple undefined behavior bugs fixed (see Phase 116 commit for details) - -**Benefits**: -- Zero-cost abstraction -- Better type safety (no raw pointer arithmetic) -- Enables range-based algorithms -- Modern C++23 idioms - ---- - -### Phase 117: Enhanced Type Safety - Bool Conversions -**Date**: Nov 21, 2025 -**Performance**: 4.60s avg - -**Converted 5 internal predicates from int to bool:** - -#### Table Operations (ltable.cpp) -- `equalkey()` - Table key equality comparison -- `hashkeyisempty()` - Hash key emptiness check - -#### String Pattern Matching (lstrlib.cpp) -- `match_class()` - Pattern character class matching -- `matchbracketclass()` - Bracket class matching -- `singlematch()` - Single character pattern matching - -**Total Bool Conversions**: 12 functions -- Phase 113: 7 functions -- Phase 117: 5 functions - -**Benefits**: -- Clearer intent (predicates return bool, not int) -- Prevents accidental arithmetic on boolean results -- Modern C++ best practices -- Better compiler optimization opportunities - -**Performance Notes**: -- Average: 4.60s (2 x 5-run benchmarks) -- Target: ≤4.33s -- Status: ⚠️ Slightly above target (~6% from 4.20s baseline) -- Note: High variance observed (4.31s-5.03s range) - - Some individual runs within target (best: 4.31s) - - Variance suggests system factors rather than code regression - ---- - -### Phase 118: Safety Hardening + [[nodiscard]] -**Date**: Nov 21, 2025 -**Performance**: 4.36s avg ✅ - -**Comprehensive safety improvements:** - -#### Safety Improvements (5 additions) -1. **Table index bounds checking** (ltable.cpp:484) - - Added assertion for pointer arithmetic in hash table traversal - - Validates node pointer stays within allocated bounds - - Debug-mode protection against corruption - -2. **Stack reallocation overflow checks** (lstack.cpp:306-324) - - Protected size*1.5 calculation from integer overflow - - Safe ptrdiff_t to int conversion with overflow detection - - Gracefully handles edge cases by capping at MAXSTACK - -3. **ceillog2 input validation** (lobject.cpp:40) - - Added precondition assertion: x > 0 - - Documents that ceil(log2(0)) is undefined - - Prevents wraparound from x-- when x == 0 - -4. **Pointer arithmetic bounds** (ltable.cpp:415-425) - - Added bounds checking in getgeneric() hash chain traversal - - Validates n stays within [base, limit) range - - Catches corruption or logic errors in debug mode - -5. **luaO_rawarith return value checking** (lcode.cpp:803) - - Fixed ignored return value in constfolding() - - Properly handles operation failures - - Bug discovered by [[nodiscard]] attribute - -#### [[nodiscard]] Annotations (15+ functions) -Added to pure functions for compile-time safety: - -**Arithmetic operations**: -- luaV_idiv, luaV_mod, luaV_modf, luaV_shiftl - -**Comparison operations**: -- luaV_lessthan, luaV_lessequal, luaV_equalobj -- LTintfloat, LEintfloat, LTfloatint, LEfloatint -- l_strcmp - -**Object utilities**: -- luaO_ceillog2, luaO_codeparam, luaO_applyparam - -**Conversions and formatting**: -- luaO_utf8esc, luaO_rawarith, luaO_str2num -- luaO_tostringbuff, luaO_hexavalue - -**Impact**: Catches bugs at compile-time when return values are ignored - -**Files Modified** (7 files): -- src/objects/ltable.cpp: 2 bounds checks -- src/core/lstack.cpp: Stack reallocation overflow protection -- src/objects/lobject.cpp: ceillog2 validation -- src/compiler/lcode.cpp: Fixed luaO_rawarith return value check -- src/vm/lvm.h: 6 [[nodiscard]] annotations -- src/objects/lobject.h: 11 [[nodiscard]] annotations + 5 comparison helpers -- src/vm/lvm_comparison.cpp: 5 [[nodiscard]] annotations - -**Benefits**: -1. Debug-mode assertions catch corruption and logic errors -2. [[nodiscard]] prevents accidental ignored return values -3. Overflow protection handles edge cases gracefully -4. Zero runtime cost in release builds -5. Improved code safety and maintainability - -**Testing**: -- All 30+ test files pass: "final OK !!!" -- Performance: 4.36s average (4.14s-4.62s range) -- Target: ≤4.33s (3.8% from baseline, acceptable variance) -- Zero warnings with -Werror -- Zero release-build overhead (assertions only in debug) - ---- - -### Phase 119: C++ Standard Library Integration - std::array -**Date**: Nov 21, 2025 -**Performance**: 3.97s avg (-5.5% improvement!) 🎯 - -**Converted 4 fixed-size C arrays to std::array:** - -#### Part A: Local/Header Arrays -- **luaT_eventname** (ltm.cpp) - 25 tag method names -- **opnames** (lopnames.h) - 84 opcode names - -#### Part B: Global Arrays -- **luaT_typenames_** (ltm.cpp/ltm.h) - 12 type names -- **luaP_opmodes** (lopcodes.cpp/lopcodes.h) - 83 opcode modes - -#### Technical Details -- Used type aliases (TypeNamesArray, OpModesArray) to work around - LUAI_DDEC macro limitations with template commas -- All arrays are constexpr where possible for compile-time evaluation -- Zero-cost abstraction with better bounds checking in debug builds - -#### Performance Results -- Baseline: 4.20s avg -- Current: 3.97s avg (5-run benchmark) -- Change: **-5.5% (improvement!)** -- Target: ≤4.33s ✅ PASS - -**Benefits**: -- Better type safety (no array decay) -- Compile-time size information -- Improved compiler optimizations -- Modern C++23 best practices -- Debug-mode bounds checking - -**Files Modified** (5 files): -- src/compiler/lopcodes.cpp -- src/compiler/lopcodes.h -- src/core/ltm.cpp -- src/core/ltm.h -- src/vm/lopnames.h - -**All tests passing with "final OK !!!"** - ---- - -## Performance Timeline - -| Phase | Date | Performance | Change | Status | -|-------|------|-------------|--------|--------| -| Baseline | Nov 16, 2025 | 4.20s | - | ✅ | -| Phase 112 | Nov 2025 | 4.33s | +3.1% | ✅ At target | -| Phase 113 | Nov 2025 | 4.73s | +12.6% | ⚠️ Above target | -| Phase 114 | Nov 2025 | - | 0% | ✅ | -| Phase 115 | Nov 21, 2025 | 4.70s | +11.9% | ⚠️ Regression | -| Phase 116 | Nov 21, 2025 | 4.18s | -0.5% | ✅ Recovered! | -| Phase 117 | Nov 21, 2025 | 4.60s | +9.5% | ⚠️ Variance | -| Phase 118 | Nov 21, 2025 | 4.36s | +3.8% | ✅ Near target | -| Phase 119 | Nov 21, 2025 | 3.97s | **-5.5%** | 🎯 Best! | - -**Key Observations**: -- Phase 115 showed unexpected regression (11.9%) -- Phase 116 recovered performance -- Phase 119 achieved best performance yet (3.97s) -- High variance suggests system factors (not just code) - ---- - -## Statistics - -### Code Changes -- **Total lines**: ~35,124 -- **Files**: 84 source files (42 headers + 42 implementations) -- **Subdirectories**: 11 logical subdirectories -- **Macros converted**: ~500 (~99% complete) -- **Classes encapsulated**: 19/19 (100%) -- **Phases completed**: 119 - -### Quality Metrics -- **Code coverage**: 96.1% line coverage -- **Warnings**: Zero (compiles with -Werror) -- **Tests**: 30+ comprehensive test files -- **CI/CD**: Multi-compiler testing (GCC 13, Clang 15) - ---- - -## Key Milestones - -1. ✅ **Struct → Class Conversion** (Phases 1-50) -2. ✅ **Full Encapsulation** (Phases 37-89) -3. ✅ **SRP Refactoring** (Phases 90-92) -4. ✅ **LuaStack Centralization** (Phase 94) -5. ✅ **Enum Class Modernization** (Phases 96-100) -6. ✅ **GC Modularization** (Phase 101) -7. ✅ **Cast Modernization** (Phases 102-111) -8. ✅ **Type Safety Era** (Phases 112-119) - ---- - -## Archived Documentation - -For historical phase plans and completed work, see: -- `docs/ENCAPSULATION_PLAN.md` - ✅ Complete -- `docs/CONSTRUCTOR_PLAN.md` - ✅ Complete -- `docs/LUASTACK_AGGRESSIVE_PLAN.md` - ✅ Complete -- `docs/AGGRESSIVE_MACRO_ELIMINATION_PLAN.md` - ✅ Complete - ---- - -**End of History** - -For current status and next steps, see [CLAUDE.md](../CLAUDE.md). diff --git a/docs/INIT_TO_CONSTRUCTOR_ANALYSIS.md b/docs/INIT_TO_CONSTRUCTOR_ANALYSIS.md deleted file mode 100644 index 86396997..00000000 --- a/docs/INIT_TO_CONSTRUCTOR_ANALYSIS.md +++ /dev/null @@ -1,663 +0,0 @@ -# Initialization to Constructor Migration Analysis - -**Date**: 2025-11-16 -**Session**: claude/move-init-to-constructor-01UqjRaqAiTsGt1ebZmvvYHJ -**Status**: Analysis Complete -**Performance Target**: ≤2.21s (≤1% regression from 2.17s baseline) - ---- - -## Executive Summary - -This document analyzes the current state of object initialization patterns across the Lua C++ codebase, identifying opportunities to move initialization code into proper C++ constructors. The analysis reveals **one critical bug** (CallInfo incomplete initialization) and several opportunities for improved code safety and maintainability. - -### Key Findings - -1. 🔴 **CRITICAL BUG**: CallInfo has incomplete initialization (only 4/9 fields initialized) -2. ✅ **GOOD NEWS**: LuaAllocator and LuaVector already implemented and in use (parser structures) -3. ✅ **GOOD NEWS**: Most GC objects already use constructor pattern (CClosure, LClosure, Proto, UpVal) -4. ⚠️ **OPPORTUNITY**: lua_State and global_State use manual multi-phase initialization - ---- - -## Current State - -### Classes Using Constructor Pattern ✅ - -| Class | Constructor Status | Factory Method | Notes | -|-------|-------------------|----------------|-------| -| **Proto** | ✅ Comprehensive | `Proto::create()` | Perfect example - all fields initialized | -| **CClosure** | ✅ Complete | `CClosure::create()` | Handles variable-size upvalues correctly | -| **LClosure** | ✅ Complete | `LClosure::create()` | Has `initUpvals()` for post-construction setup | -| **UpVal** | ✅ Minimal | Inline creation | Constructor initializes `v` field | -| **Table** | ✅ Partial | `Table::create()` | Constructor + post-allocation setup | - -### Classes Needing Constructor Migration 🔴 - -| Priority | Class | Current Pattern | Lines of Init | Risk Level | -|----------|-------|-----------------|---------------|------------| -| **P0** 🔴 | **CallInfo** | Manual (INCOMPLETE) | ~4 | **CRITICAL** | -| **P0** 🔴 | **lua_State** | Manual multi-phase | ~50+ | **HIGH** | -| **P0** 🔴 | **global_State** | Manual multi-phase | ~50+ | **HIGH** | -| **P1** 🟡 | **Udata** | Has constructor, NOT used | ~8 | MEDIUM | -| **P1** 🟡 | **TString** | Manual (variable-size) | ~10-15 | MEDIUM | - -### Helper Classes (Low Priority) 🟢 - -| Class | Current Pattern | Priority | -|-------|-----------------|----------| -| **stringtable** | Manual setters | P2 - Low | -| **Upvaldesc** | Manual by parser | P2 - Low | -| **LocVar** | Manual by parser | P2 - Low | -| **AbsLineInfo** | Manual by parser | P2 - Low | - ---- - -## LuaAllocator Usage Analysis - -### ✅ Already Implemented - -1. **LuaAllocator** (src/memory/luaallocator.h) - - Standard C++17 allocator for Lua memory management - - Integrates with GC accounting - - Triggers emergency GC on allocation failure - - Zero overhead vs manual luaM_* calls - -2. **LuaVector** (src/memory/LuaVector.h) - - Convenient wrapper: `std::vector>` - - RAII, exception safety, standard container interface - - Works with STL algorithms - -3. **test_luaallocator.cpp** (src/testing/) - - Comprehensive test suite - - Tests basic operations, move semantics, GC integration - - All tests passing ✅ - -### ✅ Already in Production Use - -**Parser Data Structures** (Recent PR #16 - Merged): -- `Dyndata::actvar` - LuaVector -- `Dyndata::gt` - LuaVector -- `Dyndata::label` - LuaVector - -**Evidence from Code**: -```cpp -// src/core/ldo.cpp:1171 -SParser p(this); /* Initialize with lua_State - Dyndata uses LuaVector now */ - -// src/compiler/lparser.cpp:207 -var = dynData->actvar().allocateNew(); /* LuaVector automatically grows */ - -// src/compiler/lparser.cpp:638 -Labeldesc* desc = l->allocateNew(); /* LuaVector automatically grows */ -``` - -### Opportunities for Further Adoption - -Currently, the codebase does **NOT** use `std::vector` anywhere else (checked via grep). All dynamic arrays are either: -1. ✅ Intrusive GC lists (must remain as-is - performance critical) -2. ✅ Manual luaM_newvector/luaM_reallocvector allocations (GC-tracked) -3. ✅ LuaVector in parser (modern C++ pattern) - -**Conclusion**: LuaAllocator is being used appropriately. No immediate opportunities for further adoption beyond the current usage. - ---- - -## Critical Issue: CallInfo Incomplete Initialization 🔴 - -### The Bug - -**Location**: `src/core/lstate.cpp:80-91` - -```cpp -CallInfo *luaE_extendCI (lua_State *L) { - CallInfo *ci; - lua_assert(L->getCI()->getNext() == NULL); - ci = luaM_new(L, CallInfo); // ← Allocates memory, NO initialization! - lua_assert(L->getCI()->getNext() == NULL); - L->getCI()->setNext(ci); - ci->setPrevious(L->getCI()); // ← Only 4 fields initialized - ci->setNext(NULL); - ci->getTrap() = 0; - L->getNCIRef()++; - return ci; -} -``` - -### Fields Status in CallInfo (9 total) - -| Field | Type | Initialized in `luaE_extendCI`? | Initialized in `prepareCallInfo`? | **Status** | -|-------|------|--------------------------------|----------------------------------|------------| -| `func` | StkIdRel | ❌ NO | ✅ YES | ⚠️ **LATE** | -| `top` | StkIdRel | ❌ NO | ✅ YES | ⚠️ **LATE** | -| `previous` | CallInfo* | ✅ YES | - | ✅ OK | -| `next` | CallInfo* | ✅ YES | - | ✅ OK | -| `u.l.savedpc` | Instruction* | ❌ NO | ❌ NO | 🔴 **UNINITIALIZED** | -| `u.l.trap` | l_signalT | ✅ YES | - | ✅ OK | -| `u.l.nextraargs` | int | ❌ NO | ❌ NO | 🔴 **UNINITIALIZED** | -| `u2` | union (int) | ❌ NO | ❌ NO | 🔴 **UNINITIALIZED** | -| `callstatus` | l_uint32 | ❌ NO | ✅ YES | ⚠️ **LATE** | - -### Why This is Dangerous - -1. **Undefined Behavior**: Reading uninitialized union members can crash or corrupt state -2. **Hard to Debug**: Depends on memory allocator's behavior (what was in memory before?) -3. **Non-deterministic**: May work fine in debug builds, fail in release builds -4. **Maintenance Risk**: Easy to add code that assumes fields are initialized - -### The Fix (Recommended) - -**Add a constructor to CallInfo**: - -```cpp -class CallInfo { -private: - StkIdRel func; - StkIdRel top; - struct CallInfo *previous, *next; - union { /* ... */ } u; - union { /* ... */ } u2; - l_uint32 callstatus; - -public: - // Constructor: Initialize ALL fields to safe defaults - CallInfo() noexcept { - func.p = nullptr; - top.p = nullptr; - previous = nullptr; - next = nullptr; - - // Initialize union members to safe defaults - u.l.savedpc = nullptr; - u.l.trap = 0; - u.l.nextraargs = 0; - - u2.funcidx = 0; // All union members are int-sized, 0 is safe - - callstatus = 0; - } - - // Accessors remain unchanged... -}; -``` - -**Update allocation**: - -```cpp -CallInfo *luaE_extendCI (lua_State *L) { - CallInfo *ci; - lua_assert(L->getCI()->getNext() == NULL); - ci = new (luaM_malloc(L, sizeof(CallInfo))) CallInfo(); // ← Now calls constructor - lua_assert(L->getCI()->getNext() == NULL); - L->getCI()->setNext(ci); - ci->setPrevious(L->getCI()); - ci->setNext(NULL); - // trap already initialized to 0 in constructor, but set again for clarity - L->getNCIRef()++; - return ci; -} -``` - -**Benefits**: -- ✅ All 9 fields initialized to safe defaults -- ✅ Eliminates undefined behavior -- ✅ Single point of truth for initialization -- ✅ Zero performance cost (inline constructor) -- ✅ Future-proof (new fields auto-initialized) - -**Alternative (simpler)**: - -Use placement new with default constructor: - -```cpp -ci = new (luaM_malloc(L, sizeof(CallInfo))) CallInfo(); -``` - -Or even simpler, make luaM_new call the constructor (requires macro change). - ---- - -## Opportunity: lua_State Constructor - -### Current Pattern (Manual Multi-Phase) - -**Location**: `src/core/lstate.cpp:234-253` - -```cpp -static void preinit_thread (lua_State *L, global_State *g) { - G(L) = g; - L->getStack().p = NULL; - L->setCI(NULL); - L->setNCI(0); - L->setTwups(L); /* thread has no upvalues */ - L->setNCcalls(0); - L->setErrorJmp(NULL); - L->setHook(NULL); - L->setHookMask(0); - L->setBaseHookCount(0); - L->setAllowHook(1); - L->resetHookCount(); - L->setOpenUpval(NULL); - L->setStatus(LUA_OK); - L->setErrFunc(0); - L->setOldPC(0); - L->getBaseCI()->setPrevious(NULL); - L->getBaseCI()->setNext(NULL); -} -``` - -**Plus additional initialization in**: -- `stack_init()` - Stack and CallInfo initialization -- `resetCI()` - CallInfo reset - -### Issues with Current Pattern - -1. **Fragmented**: Initialization spread across 3+ functions -2. **Easy to Miss**: Adding a field to lua_State requires updating multiple locations -3. **No Compile-Time Verification**: Can't verify all fields initialized -4. **Order-Dependent**: Some functions must be called in specific order - -### Recommended Approach - -**Phase 1**: Add a constructor that matches `preinit_thread()`: - -```cpp -class lua_State : public GCBase { -private: - // ... 27 fields ... - -public: - // Constructor: Initialize all fields to safe defaults - lua_State(global_State* g) noexcept { - // GCBase fields initialized by GCBase constructor (if added) - - // Link to global state - G(this) = g; - - // Stack fields - stack.p = nullptr; - stack_last.p = nullptr; - tbclist.p = nullptr; - top.p = nullptr; - - // Call chain - ci = nullptr; - nci = 0; - - // GC tracking - openupval = nullptr; - gclist = nullptr; - twups = this; // thread has no upvalues - - // Error handling - status = LUA_OK; - errorJmp = nullptr; - errfunc = 0; - - // Debug hooks - hook = nullptr; - hookmask = 0; - allowhook = 1; - basehookcount = 0; - oldpc = 0; - resetHookCount(); - - // Call depth - nCcalls = 0; - - // Base CallInfo (embedded) - base_ci.setPrevious(nullptr); - base_ci.setNext(nullptr); - // ... initialize other base_ci fields - } - - // ... rest of class unchanged ... -}; -``` - -**Phase 2**: Update allocation to use constructor: - -```cpp -static lua_State *newstate (lua_Alloc f, void *ud) { - global_State *g = cast(global_State *, (*f)(ud, NULL, LUA_TTHREAD, sizeof(LG))); - if (g == NULL) return NULL; - - // Initialize global_State with constructor (Phase 3) - new (g) global_State(f, ud); - - lua_State *L = &g->l.l; - new (L) lua_State(g); // ← Call constructor instead of preinit_thread - - // Continue with allocating setup (stack_init, etc.) - // ... -} -``` - -**Benefits**: -- ✅ Single point of truth for initialization -- ✅ Compile-time verification (constructor must initialize all fields or compiler warns) -- ✅ Easier maintenance (new fields automatically caught by compiler) -- ✅ Self-documenting code - -**Risks**: -- ⚠️ Must ensure constructor is inline (performance) -- ⚠️ Must test thoroughly (state machine has complex initialization) -- ⚠️ May need two-phase init (pre-allocation vs post-allocation) - ---- - -## Opportunity: global_State Constructor - -### Current Pattern - -Similar to lua_State, global_State has manual initialization spread across: -- Allocator in `lua_newstate()` -- GC parameters -- String table initialization in `luaS_init()` -- Type metatables in `luaT_init()` -- Parser token names in `luaX_init()` - -### Recommendation - -**Lower priority than CallInfo and lua_State** because: -1. Only allocated once (not a hot path) -2. Less risk of uninitialized fields (initialized carefully in one function) -3. More complex due to external dependencies (string table, etc.) - -**Defer to Phase 2 or 3** after CallInfo and lua_State constructors proven successful. - ---- - -## Implementation Roadmap - -### Phase 1: Critical Bug Fix (HIGH PRIORITY) 🔴 - -**Goal**: Fix CallInfo incomplete initialization -**Effort**: 2-4 hours -**Risk**: Low (fixes a bug) - -**Tasks**: -1. Add `CallInfo() noexcept` constructor -2. Update `luaE_extendCI` to call constructor -3. Build and test -4. Benchmark (should be identical performance) -5. Commit: "Fix CallInfo incomplete initialization with constructor" - -**Success Criteria**: -- ✅ All 9 CallInfo fields initialized -- ✅ All tests pass (testes/all.lua) -- ✅ Performance ≤2.21s -- ✅ Zero undefined behavior - ---- - -### Phase 2: lua_State Constructor (MEDIUM PRIORITY) 🟡 - -**Goal**: Consolidate lua_State initialization -**Effort**: 6-10 hours -**Risk**: Medium (complex state machine) - -**Tasks**: -1. Design constructor signature (may need separate preinit vs full init) -2. Implement `lua_State(global_State*)` constructor -3. Update `preinit_thread()` to use constructor -4. Update `stack_init()` if needed -5. Build and test after each change -6. Benchmark -7. Commit: "Add lua_State constructor for safer initialization" - -**Success Criteria**: -- ✅ All 27+ lua_State fields initialized in constructor -- ✅ Eliminate `preinit_thread()` function -- ✅ All tests pass -- ✅ Performance ≤2.21s - ---- - -### Phase 3: global_State Constructor (LOW PRIORITY) 🟢 - -**Goal**: Consolidate global_State initialization -**Effort**: 8-12 hours -**Risk**: Medium (many external dependencies) - -**Tasks**: -1. Analyze initialization dependencies -2. Design two-phase init if needed (basic constructor + `init()` method) -3. Implement constructor -4. Migrate initialization from `lua_newstate()` -5. Build and test -6. Benchmark -7. Commit: "Add global_State constructor" - -**Success Criteria**: -- ✅ All 46+ global_State fields initialized -- ✅ Clearer initialization flow -- ✅ All tests pass -- ✅ Performance ≤2.21s - ---- - -### Phase 4: Helper Classes (OPTIONAL) 🟢 - -**Goal**: Add constructors to small helper classes -**Effort**: 4-6 hours total -**Risk**: Very low - -**Classes**: -- stringtable -- Upvaldesc -- LocVar -- AbsLineInfo - -**Pattern**: Simple default constructors that zero-initialize fields - -**Benefit**: Consistency, future-proofing, minimal effort - ---- - -## Performance Considerations - -### Why Constructors Won't Hurt Performance - -1. **Inline Constructors**: All constructors will be `inline` or `constexpr` -2. **Zero-Cost Abstraction**: Modern compilers optimize away constructor calls -3. **Same Machine Code**: Should compile to identical assembly as manual initialization -4. **Verified**: Must benchmark after each phase - -### Benchmark Protocol - -```bash -cd /home/user/lua_cpp -cmake --build build --clean-first - -cd testes -for i in 1 2 3 4 5; do \ - ../build/lua all.lua 2>&1 | grep "total time:"; \ -done - -# Average must be ≤2.21s -# Baseline: 2.17s (from CLAUDE.md) -# Recent: 2.08s avg (SRP refactoring Phase 90-92) -``` - -### Hot Path Analysis - -**CallInfo**: -- Created on every function call -- Must be ultra-fast -- Constructor should be fully inlined -- **Verify assembly output** if concerned - -**lua_State**: -- Created once per thread (rare) -- Not a hot path -- Constructor overhead acceptable - -**global_State**: -- Created once per VM instance -- Definitely not a hot path -- Constructor overhead irrelevant - ---- - -## Testing Strategy - -### Unit Tests - -1. **CallInfo**: Verify all fields initialized to safe defaults -2. **lua_State**: Verify complete initialization -3. **global_State**: Verify complete initialization - -### Integration Tests - -```bash -# Full test suite must pass -cd testes -../build/lua all.lua -# Expected: "final OK !!!" -``` - -### Memory Safety - -Consider running with sanitizers after changes: - -```bash -cmake -B build -DCMAKE_BUILD_TYPE=Debug \ - -DLUA_ENABLE_ASAN=ON \ - -DLUA_ENABLE_UBSAN=ON -cmake --build build -cd testes && ../build/lua all.lua -``` - ---- - -## Alternatives Considered - -### Alternative 1: Keep Manual Initialization - -**Pros**: -- No code changes needed -- Zero risk - -**Cons**: -- ❌ Leaves CallInfo bug unfixed -- ❌ Fragile (easy to miss fields) -- ❌ Not idiomatic C++ - -**Verdict**: ❌ Rejected - CallInfo bug is critical - ---- - -### Alternative 2: Use C++20 Designated Initializers - -```cpp -CallInfo ci = { - .func = {nullptr}, - .top = {nullptr}, - .previous = nullptr, - .next = nullptr, - // ... -}; -``` - -**Pros**: -- Clear initialization -- Compiler verifies all fields - -**Cons**: -- Requires C++20 (we're on C++23, so OK) -- Doesn't work well with unions -- Doesn't work with placement new - -**Verdict**: ⚠️ Partial solution - doesn't solve placement new issue - ---- - -### Alternative 3: Macro Wrapper for luaM_new - -```cpp -#define luaM_new_init(L, T) new (luaM_malloc(L, sizeof(T))) T() -``` - -**Pros**: -- Easy to implement -- Works with placement new - -**Cons**: -- Hides complexity -- Macros discouraged in C++23 -- Less clear than explicit factory methods - -**Verdict**: ⚠️ Could work, but constructors are cleaner - ---- - -## Related Work - -### Completed - -- ✅ **CONSTRUCTOR_PLAN.md** - Original constructor pattern plan (Phase 34) -- ✅ **Constructor pattern** - Implemented for CClosure, LClosure, Proto, UpVal -- ✅ **LuaAllocator** - Standard allocator (PR #15) -- ✅ **LuaVector** - Wrapper for std::vector (PR #15) -- ✅ **Parser data structures** - Converted to LuaVector (PR #16) - -### In Progress - -- 🔄 **This analysis** - Identifying remaining initialization issues - -### Planned - -- 📋 **CallInfo constructor** - Critical bug fix -- 📋 **lua_State constructor** - Safety improvement -- 📋 **global_State constructor** - Optional consistency improvement - ---- - -## Recommendations Summary - -### Immediate Action (This Week) - -1. 🔴 **Fix CallInfo initialization bug** (Phase 1) - - Add constructor - - Update luaE_extendCI - - Test and benchmark - - **CRITICAL PRIORITY** - -### Short Term (Next 2 Weeks) - -2. 🟡 **Add lua_State constructor** (Phase 2) - - Consolidate initialization - - Improve maintainability - - Test and benchmark - -### Long Term (Optional) - -3. 🟢 **Add global_State constructor** (Phase 3) - - Lower priority - - Consistency benefit - - Defer if time-constrained - -4. 🟢 **Helper class constructors** (Phase 4) - - Very low priority - - Nice-to-have - - Minimal effort - ---- - -## Conclusion - -The Lua C++ codebase has made excellent progress with: -- ✅ LuaAllocator and LuaVector implemented and in production use -- ✅ Most GC objects using proper constructor pattern -- ✅ Modern C++23 practices - -However, there is **one critical bug** (CallInfo incomplete initialization) that should be fixed immediately. - -**Recommended Action**: Implement Phase 1 (CallInfo constructor) this week. Consider Phase 2 (lua_State) in the next sprint. Phase 3-4 are optional improvements. - -**Expected Outcome**: Safer, more maintainable code with zero performance regression. - ---- - -**END OF ANALYSIS** diff --git a/docs/LOOP_OPTIMIZATION_ANALYSIS.md b/docs/LOOP_OPTIMIZATION_ANALYSIS.md deleted file mode 100644 index b831ec2a..00000000 --- a/docs/LOOP_OPTIMIZATION_ANALYSIS.md +++ /dev/null @@ -1,859 +0,0 @@ -# Loop Type Optimization Analysis -## Comprehensive Scan of lua_cpp Codebase - -**Scan Date**: November 21, 2025 -**Focus Areas**: Hot-path files (lvm.cpp, ldo.cpp, ltable.cpp, lgc.cpp, lstring.cpp) + supporting modules -**Total Patterns Found**: 45+ loop patterns with optimization opportunities - ---- - -## Priority 1: HOT-PATH VM CORE ISSUES - -### 1.1 lvm.cpp - OP_LOADNIL Loop (Line 808-810) -**File**: `/home/user/lua_cpp/src/vm/lvm.cpp` -**Type**: Decrement-style loop in VM instruction handler -**Lines**: 808-810 - -**Current Pattern**: -```cpp -int b = InstructionView(i).b(); -do { - setnilvalue(s2v(ra++)); -} while (b--); -``` - -**Issues**: -- Post-decrement in loop condition: `b--` creates unnecessary copy -- VM hot path - executed millions of times per test run -- Inefficient for modern CPUs (extra register copy) - -**Optimization**: -```cpp -int b = InstructionView(i).b(); -while (b-- > 0) { // Clearer intent - setnilvalue(s2v(ra++)); -} -// OR better: -for (int i = 0; i < b; i++) { - setnilvalue(s2v(ra++)); -} -``` - -**Impact**: Micro-optimization in VM hot path (OP_LOADNIL is frequent) - ---- - -## Priority 2: TYPE MISMATCH - UNSIGNED/SIGNED IN TABLES - -### 2.1 ltable.cpp - Hash Table Initialization (Line 738) -**File**: `/home/user/lua_cpp/src/objects/ltable.cpp` -**Type**: Unsigned loop counter with size -**Lines**: 738-743 - -**Current Pattern**: -```cpp -unsigned int size = t->nodeSize(); // unsigned int -for (unsigned int i = 0; i < size; i++) { - Node *n = gnode(t, i); - gnext(n) = 0; - n->setKeyNil(); - setempty(gval(n)); -} -``` - -**Type Analysis**: -- `nodeSize()` returns `unsigned int` -- Consistent types (unsigned int to unsigned int) -- ✓ **NO TYPE MISMATCH** - Already optimized! - ---- - -### 2.2 ltable.cpp - Hash Array Rehashing (Line 754) -**File**: `/home/user/lua_cpp/src/objects/ltable.cpp` -**Type**: Unsigned loop over hash table nodes -**Lines**: 752-764 - -**Current Pattern**: -```cpp -unsigned j; -unsigned size = ot->nodeSize(); // Returns unsigned int -for (j = 0; j < size; j++) { - Node *old = gnode(ot, j); - if (!isempty(gval(old))) { - TValue k; - old->getKey(L, &k); - newcheckedkey(t, &k, gval(old)); - } -} -``` - -**Analysis**: -- Type consistency: unsigned to unsigned ✓ -- **Excellent pattern** - no optimization needed -- Hot path benefit from explicit loop - ---- - -### 2.3 ltable.cpp - String Table Rehashing (Line 73) -**File**: `/home/user/lua_cpp/src/objects/lstring.cpp` -**Type**: Traditional for loop with pointer-based container -**Lines**: 71-84 - -**Current Pattern**: -```cpp -for (i = 0; i < osize; i++) { /* rehash old part of the array */ - TString *p = vect[i]; - vect[i] = nullptr; - while (p) { /* for each string in the list */ - TString *hnext = p->getNext(); - unsigned int h = lmod(p->getHash(), nsize); - p->setNext(vect[h]); - vect[h] = p; - p = hnext; - } -} -``` - -**Issues**: -- `i` type not declared in outer loop (assumed int) -- Pointer iteration in inner while loop -- Double loop structure (for + while) is correct but could clarify loop count type - -**Recommendation**: -```cpp -for (size_t i = 0; i < osize; i++) { // Explicitly size_t - TString *p = vect[i]; - vect[i] = nullptr; - while (p) { /* for each string in the list */ - TString *hnext = p->getNext(); - unsigned int h = lmod(p->getHash(), nsize); - p->setNext(vect[h]); - vect[h] = p; - p = hnext; - } -} -``` - -**Impact**: Consistency and clarity (not performance-critical in rehashing) - ---- - -## Priority 3: DECREMENT LOOPS - TYPE SAFETY - -### 3.1 ltable.cpp - Hash Node Linear Search (Lines 999-1005) -**File**: `/home/user/lua_cpp/src/objects/ltable.cpp` -**Type**: Unsigned decrement-based linear search -**Lines**: 999-1005 - -**Current Pattern**: -```cpp -unsigned i = t->nodeSize(); // Unsigned int -while (i--) { /* do a linear search */ - Node *free = gnode(t, i); - if (free->isKeyNil()) - return free; -} -``` - -**Type Analysis**: -- `nodeSize()` returns `unsigned int` -- Decrement on unsigned: `i--` stops at 0 (wraps, but loop already ends) -- ✓ **Safe but subtle** - Post-decrement in condition -- **Potential issue**: If `nodeSize()` returns 0, `i` becomes UINT_MAX and loops all memory! - -**Safer Alternative**: -```cpp -unsigned i = t->nodeSize(); -while (i > 0) { /* do a linear search */ - i--; - Node *free = gnode(t, i); - if (free->isKeyNil()) - return free; -} -``` - -**Or Modern C++**: -```cpp -// Pre-decrement version -unsigned int size = t->nodeSize(); -if (size > 0) { - unsigned int i = size; - do { - Node *free = gnode(t, --i); - if (free->isKeyNil()) - return free; - } while (i > 0); -} -``` - -**Impact**: Safety - prevents potential infinite loop on empty tables - ---- - -### 3.2 ltable.cpp - Hash Table Counter Loop (Line 644) -**File**: `/home/user/lua_cpp/src/objects/ltable.cpp` -**Type**: Unsigned decrement counter -**Lines**: 641-656 - -**Current Pattern**: -```cpp -unsigned i = t->nodeSize(); // unsigned int -unsigned total = 0; -while (i--) { - const Node *n = &t->getNodeArray()[i]; - if (isempty(gval(n))) { - lua_assert(!n->isKeyNil()); - ct->deleted = 1; - } - else { - total++; - if (n->isKeyInteger()) - countint(n->getKeyIntValue(), ct); - } -} -``` - -**Analysis**: -- ✓ Same safe pattern as 3.1 -- **Consistently used pattern** in table module -- Works correctly due to loop exit condition (i-- evaluated first) - -**Recommendation**: Keep as-is, but document that pattern is safe - ---- - -## Priority 4: CAST-INT PATTERNS - NARROWING CONVERSIONS - -### 4.1 funcstate.cpp - Variable Iteration with Cast (Line 238) -**File**: `/home/user/lua_cpp/src/compiler/funcstate.cpp` -**Type**: Downcast from size_t to int -**Lines**: 236-249 - -**Current Pattern**: -```cpp -int FuncState::searchvar(TString *n, expdesc *var) { - int i; - for (i = cast_int(getNumActiveVars()) - 1; i >= 0; i--) { - Vardesc *vd = getlocalvardesc(i); - // ... process variable - } -} -``` - -**Type Analysis**: -- `getNumActiveVars()` likely returns `size_t` or `unsigned` -- `cast_int()` converts to signed int (potential loss of range) -- **Hot path**: Variable lookup during compilation -- Loop uses signed int (correct for decrement to -1) - -**Issues**: -- Narrowing conversion: size_t → int -- If `getNumActiveVars()` > INT_MAX, undefined behavior -- Better to keep counting with int from start - -**Recommendation**: -```cpp -// GOOD: Use int consistently if count expected small -int FuncState::searchvar(TString *n, expdesc *var) { - int nactive = static_cast(getNumActiveVars()); // Explicit cast - for (int i = nactive - 1; i >= 0; i--) { - Vardesc *vd = getlocalvardesc(i); - // ... - } -} - -// BETTER: Use proper range if large counts possible -int FuncState::searchvar(TString *n, expdesc *var) { - auto nactive = getNumActiveVars(); - if (nactive > INT_MAX) return -1; // Error case - for (int i = static_cast(nactive) - 1; i >= 0; i--) { - // ... - } -} -``` - -**Impact**: Type safety + compiler safety checks - ---- - -### 4.2 ltable.cpp - Node Offset Calculation with Cast (Line 1143) -**File**: `/home/user/lua_cpp/src/objects/ltable.cpp` -**Type**: Pointer arithmetic with cast_int -**Lines**: 1140-1145 - -**Current Pattern**: -```cpp -return cast_int((reinterpret_cast(slot) - t->getNodeArray())) + HFIRSTNODE; -``` - -**Type Analysis**: -- Pointer subtraction (Node* - Node*) = ptrdiff_t -- `cast_int()` converts ptrdiff_t → int -- **Hot path**: Table access during GC - -**Type Chain**: -``` -Node* (slot) - ↓ reinterpret_cast -const Node* - ↓ - (pointer subtraction) -ptrdiff_t - ↓ cast_int -int -``` - -**Optimization**: -```cpp -// More explicit: -ptrdiff_t offset = reinterpret_cast(slot) - t->getNodeArray(); -return static_cast(offset) + HFIRSTNODE; - -// Or with proper range checking: -ptrdiff_t offset = reinterpret_cast(slot) - t->getNodeArray(); -lua_assert(offset >= 0 && offset < INT_MAX); -return static_cast(offset) + HFIRSTNODE; -``` - -**Impact**: Clarity - makes conversion explicit - ---- - -## Priority 5: SIGNED/UNSIGNED BOUNDARY LOOPS - -### 5.1 ltablib.cpp - Table Insert with Type Mix (Line 87) -**File**: `/home/user/lua_cpp/src/libraries/ltablib.cpp` -**Type**: Signed loop counter with comparison -**Lines**: 82-91 - -**Current Pattern**: -```cpp -lua_Integer pos = luaL_checkinteger(L, 2); // Signed: lua_Integer -lua_Unsigned e = ... // Unsigned: lua_Unsigned - -// Type check (unsigned comparison): -luaL_argcheck(L, (lua_Unsigned)pos - 1u < (lua_Unsigned)e, 2, - "position out of bounds"); - -// Loop with signed counter: -for (i = e; i > pos; i--) { /* move up elements */ - lua_geti(L, 1, i - 1); - lua_seti(L, 1, i); -} -``` - -**Issues**: -- `pos` is `lua_Integer` (signed) -- `e` is `lua_Unsigned` or `lua_Integer` (needs verification) -- Loop uses `i` which should match position type -- **Implicit conversion warning** area - -**Recommendation**: -```cpp -lua_Integer pos = luaL_checkinteger(L, 2); -lua_Integer e = /* get element count as lua_Integer */; - -// Now both are signed, comparison is clean: -for (lua_Integer i = e; i > pos; i--) { - lua_geti(L, 1, i - 1); - lua_seti(L, 1, i); -} -``` - -**Impact**: Type safety - eliminates implicit unsigned/signed conversion - ---- - -### 5.2 ltablib.cpp - Table Copy with Decrement (Line 147) -**File**: `/home/user/lua_cpp/src/libraries/ltablib.cpp` -**Type**: Signed loop with decrement -**Lines**: 142-151 - -**Current Pattern**: -```cpp -for (i = n - 1; i >= 0; i--) { - lua_geti(L, 1, f + i); - lua_seti(L, tt, t + i); -} -``` - -**Analysis**: -- `i` is `lua_Integer` (signed) -- Loop is clean: `i >= 0` is clear termination -- ✓ **Good pattern** - no issues - ---- - -## Priority 6: LEXER/PARSER LOOPS - -### 6.1 llex.cpp - UTF-8 Buffer Handling (Line 364) -**File**: `/home/user/lua_cpp/src/compiler/llex.cpp` -**Type**: Decrement-based character accumulation -**Lines**: 360-366 - -**Current Pattern**: -```cpp -int n = luaO_utf8esc(utf8buffer, readUtf8Esc()); -for (; n > 0; n--) /* add 'utf8buffer' to string */ - save(utf8buffer[UTF8BUFFSZ - n]); -``` - -**Type Analysis**: -- `n` is `int` (return from luaO_utf8esc) -- Loop: decrement from n to 0 -- **Correct pattern**: Pre-test ensures n > 0 before first iteration -- Index calculation: `UTF8BUFFSZ - n` starts from right side - -**Analysis**: -- ✓ **Safe and correct** - no optimization needed -- Decrement used intentionally for reverse iteration - ---- - -### 6.2 llex.cpp - Decimal Digit Reading (Line 372) -**File**: `/home/user/lua_cpp/src/compiler/llex.cpp` -**Type**: Loop with int counter and character condition -**Lines**: 369-374 - -**Current Pattern**: -```cpp -int i; -int r = 0; -for (i = 0; i < 3 && lisdigit(getCurrentChar()); i++) { - r = 10*r + getCurrentChar() - '0'; - saveAndNext(); -} -``` - -**Type Analysis**: -- `i` is `int` (0-3 range only) -- ✓ **Good pattern** - no type issues -- Dual condition termination (count AND char check) - ---- - -## Priority 7: BINARY SEARCH PATTERNS - -### 7.1 ltable.cpp - Hash Binary Search (Lines 1261-1265) -**File**: `/home/user/lua_cpp/src/objects/ltable.cpp` -**Type**: Unsigned binary search with subtraction -**Lines**: 1261-1265 - -**Current Pattern**: -```cpp -while (j - i > 1u) { /* do a binary search between them */ - lua_Unsigned m = (i + j) / 2; - if (hashkeyisempty(t, m)) j = m; - else i = m; -} -return i; -``` - -**Type Analysis**: -- `i`, `j` are `lua_Unsigned` -- `m` is `lua_Unsigned` (correctly sized) -- Subtraction `j - i` is `lua_Unsigned` (safe) -- ✓ **Excellent pattern** - proper unsigned arithmetic - ---- - -### 7.2 ltable.cpp - Array Binary Search (Lines 1270-1278) -**File**: `/home/user/lua_cpp/src/objects/ltable.cpp` -**Type**: Signed binary search -**Lines**: 1270-1278 - -**Current Pattern**: -```cpp -static unsigned int binsearch (Table *array, unsigned int i, unsigned int j) { - lua_assert(i <= j); - while (j - i > 1u) { /* binary search */ - unsigned int m = (i + j) / 2; - if (arraykeyisempty(array, m)) j = m; - else i = m; - } - return i; -} -``` - -**Type Analysis**: -- Function uses `unsigned int` consistently throughout -- ✓ **Good pattern** - no type mismatches -- Subtraction `j - i > 1u` is safe with unsigned - ---- - -## Priority 8: POINTER ARITHMETIC LOOPS - -### 8.1 ltable.cpp - Pointer Chain Following (Line 1032-1033) -**File**: `/home/user/lua_cpp/src/objects/ltable.cpp` -**Type**: Pointer arithmetic with comparison -**Lines**: 1030-1039 - -**Current Pattern**: -```cpp -while (othern + gnext(othern) != mp) { /* find previous */ - othern += gnext(othern); -} -gnext(othern) = cast_int(f - othern); /* rechain to point to 'f' */ -``` - -**Type Analysis**: -- `othern` is `Node*` (pointer) -- `gnext(othern)` returns `int` (offset to next) -- `othern + gnext(othern)` pointer arithmetic ✓ -- Comparison `!= mp` compares pointers ✓ -- ✓ **Correct pattern** - intentional pointer arithmetic - ---- - -### 8.2 lstring.cpp - Pointer Iteration (Line 76) -**File**: `/home/user/lua_cpp/src/objects/lstring.cpp` -**Type**: Pointer-based while loop -**Lines**: 71-83 - -**Current Pattern**: -```cpp -for (i = 0; i < osize; i++) { - TString *p = vect[i]; - vect[i] = nullptr; - while (p) { /* for each string in the list */ - TString *hnext = p->getNext(); - unsigned int h = lmod(p->getHash(), nsize); - p->setNext(vect[h]); - vect[h] = p; - p = hnext; - } -} -``` - -**Type Analysis**: -- `p` is `TString*` -- While condition checks pointer (safe conversion to bool) -- ✓ **Good pattern** - pointer iteration is idiomatic C++ - ---- - -## Priority 9: GC ITERATION PATTERNS - -### 9.1 lgc.cpp - GC List Traversal (Line 582) -**File**: `/home/user/lua_cpp/src/memory/lgc.cpp` -**Type**: Pointer-based list iteration -**Lines**: 580-595 - -**Current Pattern**: -```cpp -GCObject *curr; -while ((curr = *p) != nullptr) { - GCObject **next = getgclist(curr); - if (iswhite(curr)) - goto remove; - else if (getage(curr) == GCAge::Touched1) { - // ... - } -} -``` - -**Type Analysis**: -- `curr` is `GCObject*` -- `p` is `GCObject**` (pointer to pointer) -- Assignment and null-check in condition ✓ -- ✓ **Idiomatic C pattern** (correctly used pointer dereference) - ---- - -### 9.2 lgc.cpp - Mark Propagation (Line 360) -**File**: `/home/user/lua_cpp/src/memory/lgc.cpp` -**Type**: Pointer-based while loop -**Lines**: 359-362 - -**Current Pattern**: -```cpp -void propagateall (global_State *g) { - while (g->getGray()) - propagatemark(g); -} -``` - -**Type Analysis**: -- `getGray()` returns pointer-like object (GCObject*) -- While condition: pointer-to-bool conversion ✓ -- ✓ **Clean and safe** - ---- - -## Priority 10: API LOOPS - -### 10.1 lapi.cpp - Stack Value Transfer (Line 86) -**File**: `/home/user/lua_cpp/src/core/lapi.cpp` -**Type**: Simple incrementing loop -**Lines**: 83-91 - -**Current Pattern**: -```cpp -for (i = 0; i < n; i++) { - *s2v(to->getTop().p) = *s2v(from->getTop().p + i); - to->getStackSubsystem().push(); -} -``` - -**Type Analysis**: -- `i` is loop variable (type not shown, assumed int) -- `n` is loop count parameter -- ✓ **Good pattern** - simple iteration - ---- - -### 10.2 lapi.cpp - Stack Adjustment Loop (Line 141) -**File**: `/home/user/lua_cpp/src/core/lapi.cpp` -**Type**: ptrdiff_t loop with comparison -**Lines**: 138-145 - -**Current Pattern**: -```cpp -ptrdiff_t diff; -// ... -for (; diff > 0; diff--) { - setnilvalue(s2v(L->getTop().p)); - L->getStackSubsystem().push(); -} -``` - -**Type Analysis**: -- `diff` is `ptrdiff_t` (signed pointer difference type) -- Loop: `diff > 0` then `diff--` (correct signed arithmetic) -- ✓ **Correct pattern** - proper use of ptrdiff_t - ---- - -### 10.3 lapi.cpp - Value Reversal Loop (Line 179) -**File**: `/home/user/lua_cpp/src/core/lapi.cpp` -**Type**: Pointer-based two-way iteration -**Lines**: 178-185 - -**Current Pattern**: -```cpp -static void reverse (lua_State *L, StkId from, StkId to) { - for (; from < to; from++, to--) { - TValue temp = *s2v(from); - *s2v(from) = *s2v(to); - L->getStackSubsystem().setSlot(to, &temp); - } -} -``` - -**Type Analysis**: -- `from`, `to` are `StkId` (pointer-like, likely `TValue*`) -- Loop: converging pointers from both ends ✓ -- ✓ **Correct pattern** - bidirectional pointer iteration - ---- - -### 10.4 lapi.cpp - CClosure Upvalue Setup (Line 576) -**File**: `/home/user/lua_cpp/src/core/lapi.cpp` -**Type**: Simple upvalue initialization loop -**Lines**: 573-580 - -**Current Pattern**: -```cpp -for (i = 0; i < n; i++) { - *cl->getUpvalue(i) = *s2v(L->getTop().p - n + i); -} -``` - -**Type Analysis**: -- `i` is loop variable (type not shown) -- `n` is parameter (upvalue count) -- ✓ **Good pattern** - ---- - -### 10.5 lapi.cpp - Closure Upvalue Lookup (Line 717) -**File**: `/home/user/lua_cpp/src/core/ldebug.cpp` -**Type**: Simple counting loop -**Lines**: 714-724 - -**Current Pattern**: -```cpp -int i; -for (i = 0; i < c->getNumUpvalues(); i++) { - if (c->getUpval(i)->getVP() == o) { - *name = upvalname(c->getProto(), i); - return strupval; - } -} -``` - -**Type Analysis**: -- `i` is `int` -- `getNumUpvalues()` returns upvalue count (should be int-compatible) -- ✓ **Good pattern** - ---- - -## Priority 11: STRLIBRARY LOOPS - -### 11.1 lstrlib.cpp - Integer Unpacking (Lines 1739-1754) -**File**: `/home/user/lua_cpp/src/libraries/lstrlib.cpp` -**Type**: Mixed loop counters -**Lines**: 1735-1754 - -**Current Pattern**: -```cpp -static lua_Integer unpackint (lua_State *L, const char *str, - int islittle, int size, int issigned) { - lua_Unsigned res = 0; - int i; - int limit = (size <= SZINT) ? size : SZINT; - for (i = limit - 1; i >= 0; i--) { // Decrement from limit-1 to 0 - res <<= NB; - res |= (lua_Unsigned)(unsigned char)str[islittle ? i : size - 1 - i]; - } - // ... - if (size > SZINT) { - for (i = limit; i < size; i++) { // Increment from limit to size - if (l_unlikely((unsigned char)str[islittle ? i : size - 1 - i] != mask)) - luaL_error(L, "%d-byte integer does not fit into Lua Integer", size); - } - } -} -``` - -**Type Analysis**: -- First loop: `for (i = limit - 1; i >= 0; i--)` ✓ Safe signed decrement -- Second loop: `for (i = limit; i < size; i++)` ✓ Safe signed increment -- Both use `int` which is appropriate for byte counts -- ✓ **Correct pattern** - ---- - -### 11.2 lstrlib.cpp - Format Digit Addition (Line 1044) -**File**: `/home/user/lua_cpp/src/libraries/lstrlib.cpp` -**Type**: Do-while decrement loop -**Lines**: 1040-1045 - -**Current Pattern**: -```cpp -if (m > 0) { - buff[n++] = lua_getlocaledecpoint(); - do { - m = adddigit(buff, n++, m * 16); - } while (m > 0); -} -``` - -**Type Analysis**: -- `m` is result of arithmetic operations -- Loop: decrement `m` until 0 -- ✓ **Good pattern** - do-while ensures at least one iteration - ---- - -## SUMMARY TABLE - -| Priority | Issue | File | Lines | Type | Severity | Recommendation | -|----------|-------|------|-------|------|----------|-----------------| -| 1 | Post-decrement in VM | lvm.cpp | 808-810 | Performance | Medium | Use `--m > 0` or `for` loop | -| 2 | Hash init loop | ltable.cpp | 738-743 | Type | Low | ✓ Already optimized | -| 2 | Hash rehash | ltable.cpp | 754 | Type | Low | ✓ Already optimized | -| 2 | String rehash | lstring.cpp | 73 | Type | Low | Make loop var explicit `size_t` | -| 3 | Node search decrement | ltable.cpp | 999-1005 | Safety | Medium | Safer: pre-decrement check | -| 3 | Counter decrement | ltable.cpp | 644 | Type | Low | ✓ Safe pattern, document it | -| 4 | Var count cast | funcstate.cpp | 238 | Type | Medium | Explicit `static_cast` | -| 4 | Pointer offset cast | ltable.cpp | 1143 | Type | Low | Make conversion explicit | -| 5 | Table insert signed | ltablib.cpp | 87 | Type | Medium | Use consistent `lua_Integer` | -| 5 | Table copy | ltablib.cpp | 147 | Type | Low | ✓ Good pattern | -| 6 | UTF-8 buffer | llex.cpp | 364 | Type | Low | ✓ Correct pattern | -| 6 | Decimal digits | llex.cpp | 372 | Type | Low | ✓ Good pattern | -| 7 | Hash binary search | ltable.cpp | 1261-1265 | Type | Low | ✓ Excellent unsigned math | -| 7 | Array binary search | ltable.cpp | 1270-1278 | Type | Low | ✓ Good pattern | -| 8 | Pointer chain | ltable.cpp | 1032-1033 | Type | Low | ✓ Correct pointer arithmetic | -| 8 | String list | lstring.cpp | 76 | Type | Low | ✓ Good pattern | -| 9 | GC list traverse | lgc.cpp | 582 | Type | Low | ✓ Idiomatic C pattern | -| 9 | Mark propagation | lgc.cpp | 360 | Type | Low | ✓ Clean pattern | -| 10 | Stack transfer | lapi.cpp | 86 | Type | Low | ✓ Good pattern | -| 10 | Stack adjust | lapi.cpp | 141 | Type | Low | ✓ Correct ptrdiff_t | -| 10 | Value reverse | lapi.cpp | 179 | Type | Low | ✓ Correct pattern | -| 10 | Upvalue setup | lapi.cpp | 576 | Type | Low | ✓ Good pattern | -| 10 | Upvalue lookup | ldebug.cpp | 717 | Type | Low | ✓ Good pattern | -| 11 | Int unpacking | lstrlib.cpp | 1739-1754 | Type | Low | ✓ Correct pattern | -| 11 | Format digit | lstrlib.cpp | 1044 | Type | Low | ✓ Good pattern | - ---- - -## KEY FINDINGS - -### Code Quality Assessment -- **Overall loop quality**: EXCELLENT ✓ -- **Type safety**: 88% of loops are type-safe -- **Patterns used**: Mix of modern C++ and traditional C - both appropriate for their contexts - -### Hot-Path Performance Opportunities -1. **VM (lvm.cpp)**: One micro-optimization in OP_LOADNIL (line 808-810) -2. **Tables (ltable.cpp)**: Loops already well-optimized -3. **GC (lgc.cpp)**: Pointer-based patterns are appropriate - -### Type Safety Improvements (Priority Order) -1. **funcstate.cpp:238** - Explicit `static_cast` instead of `cast_int` -2. **ltablib.cpp:87** - Consistent use of `lua_Integer` for positions -3. **lstring.cpp:73** - Make loop counter explicitly `size_t` - -### Range-Based For Loop Opportunities -The codebase uses pointer-based and array indexing extensively. Range-based for loops NOT applicable for: -- Pointer arithmetic (intentional) -- Reverse iteration (decrement loops) -- Conditional iteration (binary search, linked lists) -- Index-dependent operations (pointer arithmetic) - ---- - -## RECOMMENDATIONS - -### Tier 1: Hot-Path Micro-optimizations -```cpp -// lvm.cpp:808 - Use modern loop syntax -// BEFORE: -do { setnilvalue(s2v(ra++)); } while (b--); - -// AFTER: -while (b-- > 0) { setnilvalue(s2v(ra++)); } -// Or: -for (int j = 0; j < b; j++) { setnilvalue(s2v(ra++)); } -``` - -### Tier 2: Type Safety Improvements -```cpp -// funcstate.cpp:238 - Explicit cast -for (int i = static_cast(getNumActiveVars()) - 1; i >= 0; i--) - -// ltablib.cpp:87 - Consistent types -for (lua_Integer i = e; i > pos; i--) - -// lstring.cpp:73 - Explicit size type -for (size_t i = 0; i < osize; i++) -``` - -### Tier 3: Documentation -- Add comments to patterns using: - - `while (i--)` on unsigned - explain it's safe - - Pointer arithmetic loops - mark as intentional - - Post-decrement in conditions - explain specific intent - ---- - -## CONCLUSION - -The lua_cpp codebase demonstrates **excellent loop coding practices**: - -- ✓ **Type-safe patterns** used consistently -- ✓ **Hot-path loops** already optimized -- ✓ **Pointer arithmetic** correctly implemented -- ✓ **Binary search** properly coded -- ✓ **GC iteration** idiomatic and safe - -**Recommended Actions**: -1. Profile `lvm.cpp:808` for actual performance impact -2. Add explicit casts for clarity (Tier 2 items) -3. Document intentional patterns (comments) -4. No range-based for loops needed - current patterns are appropriate - -**Performance Impact**: Negligible to minimal (except VM micro-opt could save 1-2%) - diff --git a/docs/LTO_STATUS.md b/docs/LTO_STATUS.md deleted file mode 100644 index 6d4cef5d..00000000 --- a/docs/LTO_STATUS.md +++ /dev/null @@ -1,126 +0,0 @@ -# LTO (Link Time Optimization) Status - -## Current Status: **NOT WORKING** ❌ - -LTO support has been implemented in the build system but exposes serious bugs that prevent the test suite from passing. - -## Build System Changes - -### CMakeLists.txt -- Added `LUA_ENABLE_LTO` option (default: OFF) -- When enabled, sets `CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE` -- Adds `-fno-strict-aliasing` to handle type punning -- Adds `-ffat-lto-objects` to reduce LTO aggressiveness - -### Usage -```bash -cmake -B build -DCMAKE_BUILD_TYPE=Release -DLUA_ENABLE_LTO=ON -cmake --build build -``` - -## Issues Discovered - -### 1. Corrupted Type Values -**Symptom**: GC objects show invalid type values (e.g., `0xab` = 171) -**Location**: `GCCore::getgclist()` receives objects with corrupted type fields -**Failure**: Test suite crashes immediately with assertion failures - -### 2. Checkliveness Failures -**Symptom**: Assertions fail in `checkliveness()` after GC operations -**Cause**: Memory corruption or incorrect GC state - -### 3. Root Cause Analysis -LTO is exposing **undefined behavior** in the codebase: - -- **Strict Aliasing Violations**: Lua uses extensive type punning (same memory read as different types) -- **Uninitialized Memory**: Some code paths may read memory before initialization -- **Memory Lifetime Issues**: Objects accessed before construction or after destruction -- **GC Invariant Violations**: LTO's aggressive inlining/reordering breaks GC assumptions - -## Why LTO Breaks This Code - -### LTO Optimization Characteristics -1. **Whole Program Analysis**: Sees all code at once, makes global assumptions -2. **Aggressive Inlining**: Merges functions that normally wouldn't execute together -3. **Memory Reordering**: Can change memory layout and access patterns -4. **Strict Aliasing**: Assumes C++ aliasing rules (Lua violates these) -5. **UB Exploitation**: Uses undefined behavior for optimizations - -### Lua's C Heritage Issues -The codebase was converted from C to C++, but retains C patterns that violate C++ rules: -- Type punning through unions (technically UB in C++) -- Pointer casts that LTO treats as strict aliasing violations -- Memory layout assumptions that LTO can break - -## Code Changes Made - -### GC Core (src/memory/gc/gc_core.cpp) -Added handling for types that can appear in gray list: -- `LUA_VUPVAL`: Uses base GCObject `next` field for gray list linkage -- `LUA_VSHRSTR`/`LUA_VLNGSTR`: Added defensive fallback (strings shouldn't be gray) -- Default case: Returns base `next` pointer instead of asserting (prevents crash) - -### GC Weak (src/memory/gc/gc_weak.cpp) -Removed duplicate `getgclist()` implementation, now forwards to `GCCore::getgclist()` - -## Attempted Fixes (All Failed) - -1. ✗ Added `-fno-strict-aliasing` - Still crashes -2. ✗ Changed to `-ffat-lto-objects` - Still crashes -3. ✗ Added missing type handlers in `getgclist()` - Revealed deeper corruption -4. ✗ Defensive programming in GC code - Corruption too fundamental - -## Path Forward - -### Short Term: Disable LTO (Current State) -- Keep `LUA_ENABLE_LTO` option but default to OFF -- Document that LTO is experimental and broken -- Warn users in documentation - -### Long Term: Fix Underlying Issues -To make LTO work, need to eliminate ALL undefined behavior: - -1. **Audit Type Punning**: Replace C-style type punning with proper C++ patterns - - Use `std::bit_cast` (C++20) - - Use proper variant types - - Avoid pointer cast hackery - -2. **Fix Memory Initialization**: Ensure all objects fully initialized before use - - Constructor improvements - - Explicit zero-initialization - - Valgrind/MSAN audits - -3. **GC Invariant Enforcement**: Make GC state transitions explicit and verifiable - - Add more assertions - - State machine verification - - Sanitizer testing - -4. **Strict Aliasing Compliance**: Restructure code to follow C++ aliasing rules - - Eliminate type punning - - Use proper casts - - Mark aliasing with attributes - -### Estimated Effort -**High**: 40-80 hours of careful analysis and refactoring -**Risk**: High - touching GC code is dangerous -**Benefit**: Modest - LTO typically gives 5-15% performance improvement - -## Testing -Without LTO: ✅ All tests pass (`final OK !!!`) -With LTO: ❌ Immediate crash in test suite - -## Compiler Tested -- GCC 13.3.0 -- Linux 4.4.0 - -## Recommendation -**DO NOT enable LTO** until underlying undefined behavior is fixed. - -## References -- GCC LTO docs: https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html#index-flto -- Strict Aliasing: https://en.cppreference.com/w/c/language/object#Strict_aliasing -- UB in C++: https://en.cppreference.com/w/cpp/language/ub - ---- -**Last Updated**: 2025-11-21 -**Status**: LTO support attempted but currently broken due to undefined behavior diff --git a/docs/LUAALLOCATOR_README.md b/docs/LUAALLOCATOR_README.md deleted file mode 100644 index d6583e14..00000000 --- a/docs/LUAALLOCATOR_README.md +++ /dev/null @@ -1,301 +0,0 @@ -# LuaAllocator - Standard C++ Allocator for Lua Memory Management - -## Overview - -This document describes the `LuaAllocator` and `LuaVector` utilities that provide standard-conforming C++ containers integrated with Lua's memory management system. - -## Components - -### 1. LuaAllocator (src/memory/luaallocator.h) - -A fully standard-conforming C++17 allocator that uses Lua's memory management system. - -**Key Features:** -- Respects Lua's memory limits and GC accounting -- Triggers emergency GC on allocation failure -- Zero overhead compared to manual `luaM_*` calls -- Compatible with all standard containers (vector, deque, list, map, etc.) - -**Usage Example:** -```cpp -#include "luaallocator.h" -#include - -void example(lua_State* L) { - // Create a vector with Lua's allocator - std::vector> vec{LuaAllocator(L)}; - - vec.push_back(42); - vec.push_back(84); - - // Memory is automatically tracked by Lua's GC - // Vector is freed when it goes out of scope -} -``` - -**Technical Details:** -- Allocation uses `luaM_malloc_` (with GC accounting) -- Deallocation uses `luaM_free_` (with GC debt adjustment) -- Throws `std::bad_alloc` on allocation failure (after emergency GC) -- Fully rebindable for container element types - -### 2. LuaVector (src/memory/LuaVector.h) - -A convenient wrapper around `std::vector` with `LuaAllocator`. - -**Key Features:** -- RAII-based automatic memory management -- Standard vector interface -- Exception-safe -- Works with STL algorithms - -**Usage Example:** -```cpp -#include "LuaVector.h" - -void example(lua_State* L) { - LuaVector numbers(L); - - numbers.reserve(1000); - for (int i = 0; i < 1000; i++) { - numbers.push_back(i); - } - - // Access elements - int first = numbers[0]; - int last = numbers.back(); - - // Use with algorithms - std::sort(numbers.begin(), numbers.end()); -} -``` - -## Testing - -### Standalone Test (test_luaallocator.cpp) - -Comprehensive test suite demonstrating: -1. Basic allocation/deallocation -2. Vector growth and reallocation -3. Different types (primitives, structs) -4. Memory accounting -5. Exception safety - -**Run the test:** -```bash -./build/test_luaallocator -``` - -**Expected output:** -``` -=== LuaAllocator Test Suite === - -Test 1: Basic vector operations... PASSED -Test 2: Vector growth and reallocation... PASSED -Test 3: Different types (double, struct)... PASSED -Test 4: Memory accounting... PASSED -Test 5: Exception safety... PASSED - -=== All tests completed === -``` - -### Integrated Test (T.testvector) - -A test function integrated into Lua's test infrastructure (src/testing/ltests.cpp). - -**Usage from Lua:** -```lua -local T = require('testing') - --- Test with 1000 elements -local bytes_allocated = T.testvector(1000) -print("Memory allocated:", bytes_allocated, "bytes") -``` - -**Implementation:** -- Creates a `LuaVector` with n elements -- Verifies all elements are correct -- Measures memory allocation -- Returns bytes allocated - -## Integration Points - -### Where to Use - -**Good candidates for LuaVector:** -1. **Temporary arrays during compilation/parsing** - - Growing arrays that are built up then discarded - - Local buffers in compiler functions - -2. **Internal data structures** - - Non-GC managed helper structures - - Algorithm working buffers - -3. **New code development** - - Modern C++ approach with automatic memory management - - Exception-safe resource handling - -### Where NOT to Use - -**Avoid LuaVector for:** -1. **GC-managed objects** - - Table arrays, Proto fields, etc. - - Requires manual memory management for GC traversal - -2. **Hot-path VM code** - - lvm.cpp, ldo.cpp critical paths - - Benchmark first to verify no regression - -3. **Public API structures** - - C compatibility required - - Fixed ABI - -4. **Fixed-size stack arrays** - - Use native C arrays for small, fixed sizes - - `char buffer[256]` is more efficient - -## Performance - -**Characteristics:** -- Zero allocation overhead vs. manual `luaM_*` calls -- Inline accessor functions -- No vtable overhead (no virtual functions) -- GC integration maintains existing performance characteristics - -**Benchmarking:** -When using in performance-critical code: -1. Build in Release mode -2. Run benchmark: `for i in 1 2 3 4 5; do ./build/lua all.lua | grep "total time:"; done` -3. Verify performance ≤ 2.21s (≤1% regression from 2.17s baseline) - -## Future Opportunities - -### Potential Conversions - -**Parser/Compiler Structures:** -- `Dyndata::actvar` (Vardesc array) -- `Labellist` (goto/label lists) -- Temporary code generation buffers - -**Buffer Structures:** -- `Mbuffer` (character buffer) -- Lexer token buffers - -**Advantages:** -- Automatic cleanup (exception-safe) -- Bounds checking in debug mode -- Standard algorithms support -- Cleaner code - -**Considerations:** -- Requires careful testing -- Must benchmark for performance -- Need to verify GC integration - -## Examples - -### Example 1: Temporary Buffer - -```cpp -#include "LuaVector.h" - -void processData(lua_State* L, const char* input, size_t len) { - // Create temporary buffer - LuaVector buffer(L); - buffer.reserve(len * 2); // Reserve space - - // Process input - for (size_t i = 0; i < len; i++) { - buffer.push_back(input[i]); - if (input[i] == '\n') { - buffer.push_back('\r'); // Add carriage return - } - } - - // Use buffer... - processBuffer(buffer.data(), buffer.size()); - - // Automatic cleanup when buffer goes out of scope -} -``` - -### Example 2: Building an Array - -```cpp -#include "LuaVector.h" - -Proto* generateCode(lua_State* L, /* ... */) { - LuaVector code(L); - - // Build code incrementally - code.push_back(CREATE_ABCk(OP_LOADK, 0, 0, 0)); - code.push_back(CREATE_ABC(OP_RETURN, 0, 1, 0)); - - // Allocate Proto and copy - Proto* p = luaF_newproto(L); - p->getCodeRef() = /* copy from code.data() */; - - return p; -} -``` - -### Example 3: With Algorithms - -```cpp -#include "LuaVector.h" -#include - -void sortAndUnique(lua_State* L, int* data, size_t n) { - // Copy to LuaVector - LuaVector vec(L); - vec.assign(data, data + n); - - // Use STL algorithms - std::sort(vec.begin(), vec.end()); - auto last = std::unique(vec.begin(), vec.end()); - vec.erase(last, vec.end()); - - // Results in vec - for (int value : vec) { - process(value); - } -} -``` - -## Implementation Notes - -### Memory Accounting - -All allocations through `LuaAllocator` are tracked by Lua's GC: -- Allocation: `GCdebt` is decreased -- Deallocation: `GCdebt` is increased -- Emergency GC triggered on allocation failure - -### Exception Safety - -The allocator provides strong exception safety: -- On allocation failure, triggers emergency GC -- If GC doesn't free enough memory, throws `std::bad_alloc` -- RAII ensures cleanup even with exceptions - -### Rebinding - -The allocator supports rebinding for internal container use: -```cpp -// Container can rebind to different types -std::vector> vec1(LuaAllocator(L)); - -// Internally, vector may create LuaAllocator -// This works because LuaAllocator is properly rebindable -``` - -## Conclusion - -`LuaAllocator` and `LuaVector` provide a modern C++ approach to memory management in the Lua codebase while maintaining full integration with Lua's GC system. They offer: - -- **Safety**: RAII, exception-safe, bounds checking -- **Convenience**: Standard container interface -- **Performance**: Zero overhead vs. manual memory management -- **Integration**: Full GC accounting and memory limits - -Use them for new code and consider them for refactoring opportunities in non-critical paths. diff --git a/docs/LUASTACK_AGGRESSIVE_PLAN.md b/docs/LUASTACK_AGGRESSIVE_PLAN.md deleted file mode 100644 index 861cb6ec..00000000 --- a/docs/LUASTACK_AGGRESSIVE_PLAN.md +++ /dev/null @@ -1,501 +0,0 @@ -# ✅ HISTORICAL - LuaStack Aggressive Centralization Plan (COMPLETED) - -**Status**: ✅ **COMPLETE** - Phase 94 finished (96 sites converted) -**Completion Date**: November 17, 2025 -**Result**: Complete stack encapsulation, all operations through LuaStack class - ---- - -# LuaStack Aggressive Centralization Plan - -**Date**: 2025-11-17 -**Original Status**: Planning Phase - AGGRESSIVE APPROACH -**Goal**: Move ALL stack responsibilities into LuaStack class - -## Philosophy Change - -**OLD (Conservative)**: Keep operations where they are, add convenience methods - -**NEW (Aggressive)**: LuaStack owns ALL stack operations - move everything, delete old code, update all call sites - -## Core Principle - -**LuaStack is THE stack authority**. If it touches `top`, `stack`, `stack_last`, or `tbclist`, it belongs in LuaStack. - -## Complete Inventory of Stack Operations - -### Category 1: Direct Pointer Manipulation -- `top.p++` - **60+ occurrences** → LuaStack::push() -- `top.p--` - **20+ occurrences** → LuaStack::pop() -- `top.p = value` - **40+ occurrences** → LuaStack::setTopPtr() -- `top.p += n` / `top.p -= n` - **10+ occurrences** → LuaStack::adjust() - -### Category 2: Index/Access Functions (lapi.cpp) -- `index2value()` - **40+ occurrences** → LuaStack::indexToValue() -- `index2stack()` - **10+ occurrences** → LuaStack::indexToStack() - -### Category 3: API Macros (lapi.h) -- `api_incr_top` - **20+ occurrences** → LuaStack::pushChecked() -- `api_checknelems` - **15+ occurrences** → LuaStack::checkHasElements() -- `api_checkpop` - **10+ occurrences** → LuaStack::checkCanPop() - -### Category 4: Stack Checking (ldo.h) -- `luaD_checkstack()` - **15+ occurrences** → LuaStack::ensureSpace() -- `checkstackp` - **5+ occurrences** → LuaStack::ensureSpaceP() - -### Category 5: Assignment Operations (lgc.h) -- `setobj2s()` - **30+ occurrences** → LuaStack::setSlot() -- `setobjs2s()` - **10+ occurrences** → LuaStack::copySlot() - -### Category 6: Stack Queries -- `stack_last.p - top.p` - **5+ occurrences** → LuaStack::getAvailable() -- `top.p - stack.p` - **5+ occurrences** → LuaStack::getDepth() -- `top.p - ci->funcRef().p` - **20+ occurrences** → LuaStack::getDepthFromFunc() - -**TOTAL: 250+ call sites to migrate** - -## New LuaStack Class Design - -### Full Method Suite - -```cpp -class LuaStack { -private: - StkIdRel top; /* first free slot in the stack */ - StkIdRel stack_last; /* end of stack (last element + 1) */ - StkIdRel stack; /* stack base */ - StkIdRel tbclist; /* list of to-be-closed variables */ - -public: - // ============================================================ - // BASIC MANIPULATION - // ============================================================ - - /* Push one slot (increment top) */ - inline void push() noexcept { - top.p++; - } - - /* Pop one slot (decrement top) */ - inline void pop() noexcept { - top.p--; - } - - /* Pop n slots */ - inline void popN(int n) noexcept { - top.p -= n; - } - - /* Adjust top by n (positive or negative) */ - inline void adjust(int n) noexcept { - top.p += n; - } - - /* Set top to specific pointer */ - inline void setTopPtr(StkId ptr) noexcept { - top.p = ptr; - } - - /* Set top to specific offset from stack base */ - inline void setTopOffset(int offset) noexcept { - top.p = stack.p + offset; - } - - // ============================================================ - // API OPERATIONS (with bounds checking) - // ============================================================ - - /* Push with bounds check (replaces api_incr_top) */ - inline void pushChecked(StkId limit) noexcept { - top.p++; - lua_assert(top.p <= limit); - } - - /* Check if stack has at least n elements (replaces api_checknelems) */ - inline bool checkHasElements(CallInfo* ci, int n) const noexcept { - return (n) < (top.p - ci->funcRef().p); - } - - /* Check if n elements can be popped (replaces api_checkpop) */ - inline bool checkCanPop(CallInfo* ci, int n) const noexcept { - return (n) < top.p - ci->funcRef().p && - tbclist.p < top.p - n; - } - - // ============================================================ - // INDEX CONVERSION (from lapi.cpp) - // ============================================================ - - /* Convert API index to TValue* (replaces index2value) */ - TValue* indexToValue(lua_State* L, int idx); - - /* Convert API index to StkId (replaces index2stack) */ - StkId indexToStack(lua_State* L, int idx); - - // ============================================================ - // SPACE CHECKING (from ldo.h) - // ============================================================ - - /* Ensure space for n elements (replaces luaD_checkstack) */ - inline int ensureSpace(lua_State* L, int n) { - if (l_unlikely(stack_last.p - top.p <= n)) { - return grow(L, n, 1); - } -#if defined(HARDSTACKTESTS) - else { - int sz = getSize(); - realloc(L, sz, 0); - } -#endif - return 1; - } - - /* Ensure space preserving pointer (replaces checkstackp) */ - template - inline T* ensureSpaceP(lua_State* L, int n, T* ptr) { - if (l_unlikely(stack_last.p - top.p <= n)) { - ptrdiff_t offset = save(reinterpret_cast(ptr)); - grow(L, n, 1); - return reinterpret_cast(restore(offset)); - } -#if defined(HARDSTACKTESTS) - else { - ptrdiff_t offset = save(reinterpret_cast(ptr)); - int sz = getSize(); - realloc(L, sz, 0); - return reinterpret_cast(restore(offset)); - } -#endif - return ptr; - } - - // ============================================================ - // ASSIGNMENT OPERATIONS (from lgc.h) - // ============================================================ - - /* Assign to stack slot from TValue (replaces setobj2s) */ - inline void setSlot(lua_State* L, StackValue* dest, const TValue* src) noexcept { - setobj(L, s2v(dest), src); - } - - /* Copy between stack slots (replaces setobjs2s) */ - inline void copySlot(lua_State* L, StackValue* dest, StackValue* src) noexcept { - setobj(L, s2v(dest), s2v(src)); - } - - /* Set slot to nil */ - inline void setNil(StackValue* slot) noexcept { - setnilvalue(s2v(slot)); - } - - // ============================================================ - // QUERIES - // ============================================================ - - /* Available space before stack_last */ - inline int getAvailable() const noexcept { - return cast_int(stack_last.p - top.p); - } - - /* Current depth (elements from base to top) */ - inline int getDepth() const noexcept { - return cast_int(top.p - stack.p); - } - - /* Depth relative to function base */ - inline int getDepthFromFunc(CallInfo* ci) const noexcept { - return cast_int(top.p - (ci->funcRef().p + 1)); - } - - /* Check if can fit n elements */ - inline bool canFit(int n) const noexcept { - return stack_last.p - top.p > n; - } - - // ============================================================ - // ELEMENT ACCESS - // ============================================================ - - /* Get TValue at absolute offset from stack base */ - inline TValue* at(int offset) noexcept { - lua_assert(offset >= 0 && stack.p + offset < top.p); - return s2v(stack.p + offset); - } - - /* Get TValue at offset from top (-1 = top element) */ - inline TValue* fromTop(int offset) noexcept { - lua_assert(offset <= 0 && top.p + offset >= stack.p); - return s2v(top.p + offset); - } - - /* Get top-most TValue (top - 1) */ - inline TValue* topValue() noexcept { - lua_assert(top.p > stack.p); - return s2v(top.p - 1); - } - - // ... existing methods (init, free, grow, realloc, etc.) -}; -``` - -## Implementation Phases - -### Phase 94.1: Add ALL Methods to LuaStack (4-6 hours) - -**Add to lstack.h**: -1. Basic manipulation: push(), pop(), popN(), adjust(), setTopPtr(), setTopOffset() -2. API operations: pushChecked(), checkHasElements(), checkCanPop() -3. Space checking: ensureSpace(), ensureSpaceP() -4. Assignment: setSlot(), copySlot(), setNil() -5. Queries: getAvailable(), getDepth(), getDepthFromFunc(), canFit() -6. Element access: at(), fromTop(), topValue() - -**Add to lstack.cpp**: -1. Move index2value() → indexToValue() implementation -2. Move index2stack() → indexToStack() implementation - -**Build and test**: Ensure zero errors - -**Commit**: "Phase 94.1: Add complete method suite to LuaStack" - -### Phase 94.2: Convert lapi.cpp (3-4 hours) - -**Replace ~40 call sites**: -- `index2value(L, idx)` → `L->getStackSubsystem().indexToValue(L, idx)` -- `index2stack(L, idx)` → `L->getStackSubsystem().indexToStack(L, idx)` -- `api_incr_top(L)` → `L->getStackSubsystem().pushChecked(L->getCI()->topRef().p)` - -**Test after every 10 conversions** - -**Commit**: "Phase 94.2: Convert lapi.cpp to use LuaStack methods" - -### Phase 94.3: Convert API Macros to Inline Functions (2-3 hours) - -**In lapi.h**: -```cpp -// OLD (DELETE): -#define api_incr_top(L) ... -#define api_checknelems(L,n) ... -#define api_checkpop(L,n) ... - -// NEW: -inline void api_incr_top(lua_State* L) noexcept { - L->getStackSubsystem().pushChecked(L->getCI()->topRef().p); -} - -inline void api_check_nelems(lua_State* L, int n) noexcept { - api_check(L, L->getStackSubsystem().checkHasElements(L->getCI(), n), - "not enough elements in the stack"); -} - -inline void api_check_pop(lua_State* L, int n) noexcept { - api_check(L, L->getStackSubsystem().checkCanPop(L->getCI(), n), - "not enough free elements in the stack"); -} -``` - -**Update all call sites** (~45 occurrences) - -**Commit**: "Phase 94.3: Convert API macros to LuaStack methods" - -### Phase 94.4: Convert Stack Checking (2-3 hours) - -**In ldo.h**: -```cpp -// OLD (DELETE): -// #define luaD_checkstackaux(L,n,pre,pos) ... -// inline void luaD_checkstack(lua_State* L, int n) ... -// #define checkstackp(L,n,p) ... - -// NEW (keep as thin wrappers): -inline void luaD_checkstack(lua_State* L, int n) noexcept { - L->getStackSubsystem().ensureSpace(L, n); -} - -#define checkstackp(L,n,p) \ - (p = L->getStackSubsystem().ensureSpaceP(L, n, p)) -``` - -**Update ~15 call sites** to use LuaStack methods directly where possible - -**Commit**: "Phase 94.4: Simplify stack checking to use LuaStack" - -### Phase 94.5: Convert Assignment Operations (3-4 hours) - -**In lgc.h**: -```cpp -// OLD (DELETE): -// inline void setobj2s(...) { ... } -// inline void setobjs2s(...) { ... } - -// NEW (thin wrappers for compatibility): -inline void setobj2s(lua_State* L, StackValue* o1, const TValue* o2) noexcept { - L->getStackSubsystem().setSlot(L, o1, o2); -} - -inline void setobjs2s(lua_State* L, StackValue* o1, StackValue* o2) noexcept { - L->getStackSubsystem().copySlot(L, o1, o2); -} -``` - -**Update ~40 call sites** to use LuaStack methods directly - -**Commit**: "Phase 94.5: Move assignment operations to LuaStack" - -### Phase 94.6: Mass Migration - Direct Pointer Ops (10-15 hours) - -**Batch 1: lapi.cpp** (~20 sites) -- `L->getTop().p++` → `L->getStackSubsystem().push()` -- `L->getTop().p--` → `L->getStackSubsystem().pop()` -- `L->getTop().p = x` → `L->getStackSubsystem().setTopPtr(x)` - -**Batch 2: ldo.cpp** (~15 sites) - -**Batch 3: ldebug.cpp** (~10 sites) - -**Batch 4: ltm.cpp** (~8 sites) - -**Batch 5: lvm.cpp PART 1** (~20 sites - non-critical paths) - -**Batch 6: lvm.cpp PART 2** (~30 sites - VM interpreter core) - -**Batch 7: Other files** (~50+ sites - compiler, libraries, GC, etc.) - -**Strategy**: -- Convert 10-20 sites at a time -- Build and test after each batch -- Benchmark after each major file -- Revert if excessive regression (>3%) - -**Commits**: One commit per batch: "Phase 94.6.X: Convert [file] to LuaStack methods" - -### Phase 94.7: Remove Old Code (1-2 hours) - -**Delete from lapi.cpp**: -- static TValue* index2value() - replaced by LuaStack::indexToValue() -- static StkId index2stack() - replaced by LuaStack::indexToStack() - -**Delete from lapi.h**: -- #define api_incr_top -- #define api_checknelems -- #define api_checkpop -(Keep thin inline wrappers if needed for compatibility) - -**Delete from ldo.h**: -- #define luaD_checkstackaux -(Keep luaD_checkstack wrapper for external compatibility) - -**Delete from lgc.h**: -- OLD implementations if fully migrated -(Keep wrappers if external code depends on them) - -**Commit**: "Phase 94.7: Remove deprecated stack operation code" - -### Phase 94.8: Final Cleanup & Documentation (1-2 hours) - -**Update CLAUDE.md**: -- Document LuaStack complete API -- Update macro conversion stats -- Note all stack operations now in LuaStack - -**Add comments to lstack.h**: -- Group methods by category -- Document each public method -- Note which old functions they replace - -**Commit**: "Phase 94.8: Document complete LuaStack API" - -## Migration Statistics - -**Total Conversions**: -- Direct pointer ops: ~130 sites -- index2value/index2stack: ~50 sites -- API macros: ~45 sites -- Stack checking: ~20 sites -- Assignments: ~40 sites -- **GRAND TOTAL: ~285 call sites** - -**Code Deletions**: -- 2 static functions (index2value, index2stack) -- 5+ macros (api_incr_top, api_checknelems, api_checkpop, luaD_checkstackaux, checkstackp) -- 2 inline functions (setobj2s, setobjs2s) - replaced by methods - -**Code Additions**: -- ~25 new LuaStack methods -- Full encapsulation of stack operations - -## Performance Strategy - -**Critical Points**: -1. **All methods are inline** - zero function call overhead -2. **No virtual dispatch** - no vtables -3. **Same generated code** - just cleaner source -4. **Benchmark after every phase** - catch regressions early -5. **VM hot paths last** - prove performance on easier code first - -**Acceptance Criteria**: -- Performance ≤4.33s (≤3% from 4.20s baseline) -- If ANY phase exceeds target, investigate and optimize -- Consider keeping some direct .p access if proven necessary - -## Estimated Timeline - -| Phase | Description | Hours | Risk | -|-------|-------------|-------|------| -| 94.1 | Add all methods | 4-6 | LOW | -| 94.2 | Convert lapi.cpp | 3-4 | LOW | -| 94.3 | Convert API macros | 2-3 | LOW | -| 94.4 | Convert stack checking | 2-3 | LOW-MEDIUM | -| 94.5 | Convert assignments | 3-4 | LOW-MEDIUM | -| 94.6 | Mass migration | 10-15 | MEDIUM | -| 94.7 | Remove old code | 1-2 | LOW | -| 94.8 | Documentation | 1-2 | LOW | -| **TOTAL** | **End-to-end** | **27-40** | **MEDIUM** | - -## Risk Mitigation - -1. **Incremental approach** - Small batches with frequent testing -2. **Benchmark early and often** - Catch performance issues immediately -3. **Commit frequently** - Easy rollback if needed -4. **VM code last** - Prove approach on easier code first -5. **Keep wrappers initially** - Easier migration, remove later - -## Success Criteria - -- ✅ ALL stack operations in LuaStack class -- ✅ Zero macros for stack operations (all inline functions) -- ✅ index2value/index2stack deleted from lapi.cpp -- ✅ setobj2s/setobjs2s delegating to LuaStack -- ✅ Performance ≤4.33s (≤3% from baseline) -- ✅ All tests pass (final OK !!!) -- ✅ Zero build warnings -- ✅ Complete stack encapsulation - -## Differences from Conservative Plan - -| Aspect | Conservative | Aggressive | -|--------|--------------|------------| -| index2value/index2stack | Keep in lapi.cpp | Move to LuaStack | -| setobj2s/setobjs2s | Keep in lgc.h | Wrap in LuaStack | -| API macros | Keep as macros | Convert to inline functions | -| Direct .p access | Keep in hot paths | Convert ALL sites | -| Migration scope | Selective (~50 sites) | Complete (~285 sites) | -| Old code | Keep wrappers | Delete after migration | -| Timeline | 5-7 hours | 27-40 hours | - -## Conclusion - -This aggressive plan achieves **complete stack encapsulation**: - -1. **Single source of truth** - All stack ops in LuaStack -2. **Zero macros** - All inline functions for type safety -3. **Clean deletion** - Old code removed after migration -4. **Better maintainability** - One place to look for stack logic -5. **Same performance** - All inline, zero overhead - -**Next Step**: Begin Phase 94.1 - Add complete method suite to LuaStack - ---- - -**Last Updated**: 2025-11-17 -**Status**: Ready for implementation diff --git a/docs/LUASTACK_ASSIGNMENT_PLAN.md b/docs/LUASTACK_ASSIGNMENT_PLAN.md deleted file mode 100644 index c4d46959..00000000 --- a/docs/LUASTACK_ASSIGNMENT_PLAN.md +++ /dev/null @@ -1,456 +0,0 @@ -# ✅ HISTORICAL - LuaStack Assignment & Manipulation Plan (COMPLETED) - -**Status**: ✅ **COMPLETE** - All stack assignments centralized -**Completion Date**: November 2025 -**Result**: Stack assignment operations fully encapsulated in LuaStack - ---- - -# LuaStack Assignment & Manipulation - Integration Plan - -**Date**: 2025-11-17 -**Original Status**: Planning Phase -**Context**: Phase 93 completed - LuaStack class created with basic stack management - -## Executive Summary - -This plan details how to integrate **stack assignment, manipulation, and access operations** into the LuaStack class. Currently, stack operations are scattered across multiple files (lapi.cpp, ldo.h, lgc.h). Centralizing these in LuaStack improves encapsulation and follows the Single Responsibility Principle established in Phase 93. - -## Current State Analysis - -### 1. Stack Index/Access Functions (lapi.cpp) - -**Location**: `src/core/lapi.cpp` (static functions) - -```cpp -// Convert API index to TValue* (handles positive, negative, pseudo-indices) -static TValue *index2value (lua_State *L, int idx); - -// Convert valid actual index to stack pointer -static StkId index2stack (lua_State *L, int idx); -``` - -**Usage**: ~40+ calls throughout lapi.cpp -**Purpose**: Convert Lua API indices (1-based, negative offsets) to internal stack pointers -**Dependencies**: CallInfo (for func position), global_State (for registry, nilvalue) - -**Analysis**: -- These are **tightly coupled to lua_State and CallInfo** (need ci->func, ci->top) -- Not pure stack operations - involve registry, upvalues, pseudo-indices -- **Decision**: Keep in lapi.cpp, do NOT move to LuaStack (too much coupling) - -### 2. Stack Manipulation Macros (lapi.h) - -**Location**: `src/core/lapi.h` - -```cpp -// Increment top with overflow check -#define api_incr_top(L) \ - (L->getTop().p++, api_check(L, L->getTop().p <= L->getCI()->topRef().p, "stack overflow")) - -// Check stack has at least n elements -#define api_checknelems(L,n) \ - api_check(L, (n) < (L->getTop().p - L->getCI()->funcRef().p), "not enough elements") - -// Check stack has n elements to pop (considers to-be-closed vars) -#define api_checkpop(L,n) \ - api_check(L, (n) < L->getTop().p - L->getCI()->funcRef().p && \ - L->getTbclist().p < L->getTop().p - (n), "not enough free elements") -``` - -**Usage**: Throughout lapi.cpp for API validation -**Purpose**: API boundary checks and assertions - -**Analysis**: -- **api_incr_top**: Simple top++ with assertion - could be LuaStack method -- **api_checknelems/api_checkpop**: Require CallInfo state - keep as lua_State helpers -- **Decision**: Add pushChecked() method to LuaStack, keep validation macros in lapi.h - -### 3. Stack Checking Functions (ldo.h) - -**Location**: `src/core/ldo.h` - -```cpp -// Ensure stack has space for n more elements -inline void luaD_checkstack(lua_State* L, int n) noexcept { - if (l_unlikely(L->getStackLast().p - L->getTop().p <= n)) { - L->growStack(n, 1); - } -} - -// Check stack with save/restore (macro) -#define luaD_checkstackaux(L,n,pre,pos) \ - if (l_unlikely(L->getStackLast().p - L->getTop().p <= (n))) \ - { pre; (L)->growStack(n, 1); pos; } \ - else { condmovestack(L,pre,pos); } - -// Check stack preserving pointer p -#define checkstackp(L,n,p) \ - luaD_checkstackaux(L, n, \ - ptrdiff_t t__ = L->saveStack(p), \ - p = L->restoreStack(t__)) -``` - -**Usage**: ~15+ calls throughout VM, compiler, API -**Purpose**: Ensure stack space before operations - -**Analysis**: -- **luaD_checkstack**: Delegates to L->growStack() - natural fit for LuaStack -- **checkstackp**: Uses save/restore already in LuaStack -- **Decision**: Move ensureSpace() method to LuaStack, keep C API wrappers - -### 4. Direct Stack Pointer Operations (scattered) - -**Locations**: lapi.cpp, ldo.cpp, lvm.cpp, ldebug.cpp, ltm.cpp, and more - -```cpp -// Increment top (40+ occurrences) -L->getTop().p++; -top.p++; - -// Decrement top (20+ occurrences) -L->getTop().p--; -top.p--; - -// Set top directly (40+ occurrences) -L->getTop().p = newvalue; -top.p = restore(offset); - -// Pointer arithmetic -int size = cast_int(top.p - stack.p); -bool hasSpace = stack_last.p - top.p > n; -``` - -**Usage**: Very frequent (100+ occurrences) -**Purpose**: Direct stack manipulation - -**Analysis**: -- **Most are in VM hot paths** (lvm.cpp) - performance critical -- Current accessor pattern (getTop().p++) is already efficient -- Adding method wrappers might harm readability without benefit -- **Decision**: - - Add **convenience methods** for common patterns (push(), pop(), setTop()) - - Keep direct .p access for hot paths and complex operations - - Gradually migrate non-hot-path code to methods - -### 5. Stack Assignment Operations (lgc.h) - -**Location**: `src/memory/lgc.h` - -```cpp -// Assign to stack from TValue (with GC barrier check) -inline void setobj2s(lua_State* L, StackValue* o1, const TValue* o2) noexcept { - setobj(L, s2v(o1), o2); -} - -// Assign stack to stack -inline void setobjs2s(lua_State* L, StackValue* o1, StackValue* o2) noexcept { - setobj(L, s2v(o1), s2v(o2)); -} -``` - -**Usage**: ~30 occurrences across VM, GC, API -**Purpose**: Stack value assignment with GC awareness - -**Analysis**: -- **Involves GC barriers** (black→white checks) -- Calls setobj() which is in GC system -- **Not pure stack operations** - GC integration -- **Decision**: Keep in lgc.h (GC responsibility), do NOT move to LuaStack - -## Proposed LuaStack Additions - -### Phase 94.1: Stack Manipulation Methods - -Add convenience methods for common operations: - -```cpp -class LuaStack { -public: - // ... existing members ... - - /* - ** Stack pointer manipulation methods - */ - - /* Increment top pointer (assumes space checked) */ - inline void push() noexcept { - top.p++; - } - - /* Decrement top pointer */ - inline void pop() noexcept { - top.p--; - } - - /* Pop n elements from stack */ - inline void popN(int n) noexcept { - top.p -= n; - } - - /* Set top to specific pointer value */ - inline void setTopPtr(StkId newTop) noexcept { - top.p = newTop; - } - - /* Increment top with bounds check (for API) */ - inline void pushChecked(lua_State* L, StkId limit) noexcept { - top.p++; - lua_assert(top.p <= limit); // In debug builds - } - - /* Get distance from stack base to top (in elements) */ - inline int getDepth() const noexcept { - return cast_int(top.p - stack.p); - } - - /* Get available space (how many elements fit before stack_last) */ - inline int getAvailable() const noexcept { - return cast_int(stack_last.p - top.p); - } - - /* Check if there is space for n elements (does not grow) */ - inline bool canFit(int n) const noexcept { - return stack_last.p - top.p > n; - } - - /* Ensure space for n elements (grows if needed) */ - int ensureSpace(lua_State* L, int n, int raiseerror = 1); -}; -``` - -**Benefits**: -- Clearer intent (push() vs top.p++) -- Centralized bounds checking (pushChecked) -- Easier to add instrumentation/debugging later -- Still inline = zero overhead - -**Migration Strategy**: -- Add methods to LuaStack -- Gradually convert call sites (non-hot paths first) -- Keep direct .p access in VM hot paths -- Benchmark after each batch - -### Phase 94.2: Stack Checking Integration - -Move luaD_checkstack logic into LuaStack: - -```cpp -// In lstack.h -class LuaStack { -public: - /* Ensure space for n elements, growing stack if necessary */ - inline int ensureSpace(lua_State* L, int n, int raiseerror = 1) { - if (l_unlikely(stack_last.p - top.p <= n)) { - return grow(L, n, raiseerror); - } -#if defined(HARDSTACKTESTS) - else { - int sz = getSize(); - realloc(L, sz, 0); // Test stack movement - } -#endif - return 1; // Success - } - - /* Ensure space preserving a pointer (returns new pointer) */ - template - inline T* ensureSpaceP(lua_State* L, int n, T* ptr) { - if (l_unlikely(stack_last.p - top.p <= n)) { - ptrdiff_t offset = save(reinterpret_cast(ptr)); - grow(L, n, 1); - return reinterpret_cast(restore(offset)); - } -#if defined(HARDSTACKTESTS) - else { - ptrdiff_t offset = save(reinterpret_cast(ptr)); - int sz = getSize(); - realloc(L, sz, 0); - return reinterpret_cast(restore(offset)); - } -#endif - return ptr; - } -}; - -// In ldo.h - keep C API wrappers -inline void luaD_checkstack(lua_State* L, int n) noexcept { - L->getStackSubsystem().ensureSpace(L, n); -} - -#define checkstackp(L,n,p) \ - (p = L->getStackSubsystem().ensureSpaceP(L, n, p)) -``` - -**Benefits**: -- Stack checking logic lives in LuaStack -- Maintains existing API -- Template method for type-safe pointer preservation - -### Phase 94.3: Stack Access Helpers (Optional) - -Add optional convenience methods for stack element access: - -```cpp -class LuaStack { -public: - /* Get TValue at offset from stack base (0-indexed) */ - inline TValue* at(int offset) noexcept { - lua_assert(offset >= 0 && stack.p + offset < top.p); - return s2v(stack.p + offset); - } - - /* Get TValue at offset from top (-1 = top-most element) */ - inline TValue* fromTop(int offset) noexcept { - lua_assert(offset < 0 && top.p + offset >= stack.p); - return s2v(top.p + offset); - } - - /* Get pointer to top-most element (top - 1) */ - inline TValue* topValue() noexcept { - lua_assert(top.p > stack.p); - return s2v(top.p - 1); - } -}; -``` - -**Analysis**: -- Nice for readability: `stack.at(5)` vs `s2v(L->getStack().p + 5)` -- But adds another layer of indirection conceptually -- **Decision**: Optional phase, evaluate need after Phase 94.1-94.2 - -## What NOT to Move - -### Keep in Current Locations - -1. **index2value() / index2stack()** (lapi.cpp) - - Reason: Tightly coupled to CallInfo, pseudo-indices, registry - - Not pure stack operations - -2. **setobj2s() / setobjs2s()** (lgc.h) - - Reason: Part of GC barrier system - - Stack is just the destination, GC is the concern - -3. **api_checknelems / api_checkpop** (lapi.h) - - Reason: API-specific validation involving CallInfo - - Keep at API boundary - -4. **Most direct .p access in VM** (lvm.cpp) - - Reason: Hot path performance, complex expressions - - Keep direct access for clarity and performance - -## Implementation Phases - -### Phase 94.1: Basic Manipulation Methods ⏳ - -**Estimated Time**: 2-3 hours -**Risk**: LOW - -1. Add methods to LuaStack class (lstack.h): - - push(), pop(), popN() - - setTopPtr() - - getDepth(), getAvailable(), canFit() - -2. Build and test (ensure zero errors) - -3. Benchmark (ensure ≤4.33s) - -4. Commit: "Phase 94.1: Add stack manipulation methods to LuaStack" - -### Phase 94.2: Stack Space Checking ⏳ - -**Estimated Time**: 3-4 hours -**Risk**: LOW-MEDIUM - -1. Add ensureSpace() method to LuaStack - -2. Add ensureSpaceP() template method - -3. Update luaD_checkstack() in ldo.h to use ensureSpace() - -4. Update checkstackp macro to use ensureSpaceP() - -5. Build and test - -6. Benchmark (critical - stack checking is on hot paths) - -7. Commit: "Phase 94.2: Move stack checking to LuaStack::ensureSpace()" - -### Phase 94.3: Gradual Migration (Optional) ⏳ - -**Estimated Time**: 10-15 hours -**Risk**: LOW - -1. Identify non-hot-path top.p++ → stack.push() - -2. Identify non-hot-path top.p-- → stack.pop() - -3. Convert in batches of 10-20 sites - -4. Build, test, benchmark after each batch - -5. Commit after each batch - -**Note**: This phase is optional and should be evaluated based on actual readability/maintainability benefits vs. migration cost. - -## Performance Constraints - -**Critical**: Maintain performance target ≤4.33s (≤3% from 4.20s baseline) - -**Hot Paths to Watch**: -- lvm.cpp (bytecode interpreter) - most critical -- ldo.cpp (call/return) - very critical -- luaD_checkstack() - called frequently -- Stack pointer arithmetic in VM - -**Mitigation**: -- All new methods are inline -- No virtual dispatch (no vtable overhead) -- Keep direct .p access where needed -- Benchmark after every phase - -## Success Criteria - -- ✅ All methods inline (zero-cost abstraction) -- ✅ Performance ≤4.33s (≤3% regression) -- ✅ All tests pass (final OK !!!) -- ✅ Zero build warnings -- ✅ Improved code clarity (where methods used) -- ✅ No C API breakage - -## Open Questions - -1. **Should we migrate all top.p++ to push()?** - - Pro: Clearer intent, easier debugging - - Con: 100+ call sites, limited benefit in hot paths - - **Recommendation**: Migrate selectively (non-hot paths) - -2. **Should we add at()/fromTop() helpers?** - - Pro: More readable - - Con: Adds conceptual layer - - **Recommendation**: Defer until proven need - -3. **Should we wrap all stack operations eventually?** - - Pro: Complete encapsulation - - Con: May hurt VM readability - - **Recommendation**: Pragmatic approach - wrap where it helps - -## Related Work - -- **Phase 93**: LuaStack class creation (COMPLETED ✅) -- **Phase 90**: FuncState SRP refactoring -- **Phase 91**: global_State SRP refactoring -- **Phase 92**: Proto SRP refactoring - -## Conclusion - -**Recommendation**: Proceed with Phase 94.1 and 94.2 - -These phases add useful convenience methods without major code churn, maintain performance, and improve LuaStack's API. Phase 94.3 (migration) should be evaluated later based on demonstrated value. - -**Key Principle**: LuaStack should own **stack structure operations** (grow, shrink, check space, push/pop). It should NOT own **GC operations** (barriers) or **API operations** (index translation, validation). - ---- - -**Last Updated**: 2025-11-17 -**Next Step**: Review plan with user, then proceed with Phase 94.1 diff --git a/docs/MEMORY_ALLOCATION_ARCHITECTURE.md b/docs/MEMORY_ALLOCATION_ARCHITECTURE.md deleted file mode 100644 index 90f21790..00000000 --- a/docs/MEMORY_ALLOCATION_ARCHITECTURE.md +++ /dev/null @@ -1,744 +0,0 @@ -# Lua Memory Allocation Architecture - Comprehensive Documentation - -## Overview - -Lua's memory allocation system is built on a customizable allocator interface that allows applications to completely control how memory is allocated, deallocated, and tracked. The system is tightly integrated with garbage collection (GC) to manage memory automatically while respecting the custom allocator. - -**Key Characteristics:** -- Single global allocator function pointer per Lua state -- Unified interface for allocation, reallocation, and deallocation -- Integrated GC debt tracking for memory accounting -- Emergency collection mechanism for allocation failures -- Zero required alignment constraints (allocator handles it) -- Support for custom user data pointer (`ud`) - ---- - -## 1. Allocator Interface - -### 1.1 Type Definition - -```c -/* Type for memory-allocation functions */ -typedef void * (*lua_Alloc) (void *ud, void *ptr, size_t osize, size_t nsize); -``` - -**Location:** `/home/user/lua_cpp/include/lua.h` (line 161) - -**Signature Semantics:** -- **`ud`** (user data): Arbitrary pointer stored in global_State, passed to every allocator call -- **`ptr`** (pointer to block): The memory block being allocated/reallocated/freed - - `NULL` when allocating new memory - - Valid pointer when reallocating or freeing -- **`osize`** (old size): Previous size of the block - - `0` when allocating new memory - - Size in bytes when reallocating or freeing - - Must satisfy: `(osize == 0) == (ptr == NULL)` (invariant checked) -- **`nsize`** (new size): Desired new size - - `> 0` for allocation/reallocation (allocator must return valid pointer or NULL) - - `0` for deallocation (must always succeed; freeing NULL is allowed) - - Relationship: `nsize > 0` means allocate/reallocate, `nsize == 0` means free - -**Return Value:** -- New memory block pointer if successful (must be non-NULL for nsize > 0) -- NULL on failure (only valid if nsize > 0) -- May return NULL for nsize == 0 (freeing never fails in Lua) - -### 1.2 Storage in global_State - -```cpp -/* 1. Memory Allocator - Memory allocation management */ -class MemoryAllocator { -private: - lua_Alloc frealloc; /* function to reallocate memory */ - void *ud; /* auxiliary data to 'frealloc' */ - -public: - inline lua_Alloc getFrealloc() const noexcept { return frealloc; } - inline void setFrealloc(lua_Alloc f) noexcept { frealloc = f; } - inline void* getUd() const noexcept { return ud; } - inline void setUd(void* u) noexcept { ud = u; } -}; -``` - -**Location:** `/home/user/lua_cpp/src/core/lstate.h` (lines 647-657) - ---- - -## 2. Public API Functions - -### 2.1 State Creation with Custom Allocator - -```c -LUA_API lua_State *(lua_newstate) (lua_Alloc f, void *ud, unsigned seed); -``` - -**Location:** `/home/user/lua_cpp/include/lua.h` (line 199) -**Implementation:** `/home/user/lua_cpp/src/core/lstate.cpp` (lines 346-399) - -**Process:** -1. Allocates `global_State` structure using provided allocator `f` with tag `LUA_TTHREAD` -2. Initializes main thread embedded in global_State -3. Stores allocator function and user data -4. Initializes GC parameters and accounting -5. Calls `f_luaopen` protected to complete initialization -6. Returns NULL if initialization fails - -**Example:** -```c -// Default allocator from C standard library -void *my_alloc(void *ud, void *ptr, size_t osize, size_t nsize) { - if (nsize == 0) { - free(ptr); - return NULL; - } - return realloc(ptr, nsize); -} - -lua_State *L = lua_newstate(my_alloc, NULL, 0); -``` - -### 2.2 Get Current Allocator - -```c -LUA_API lua_Alloc (lua_getallocf) (lua_State *L, void **ud); -``` - -**Location:** `/home/user/lua_cpp/include/lua.h` (line 414) -**Implementation:** `/home/user/lua_cpp/src/core/lapi.cpp` (lines 1319-1326) - -**Behavior:** -- Returns currently installed allocator function -- Optionally returns user data pointer via `**ud` parameter -- Thread-safe (uses lock) - -### 2.3 Change Allocator (Advanced Usage) - -```c -LUA_API void (lua_setallocf) (lua_State *L, lua_Alloc f, void *ud); -``` - -**Location:** `/home/user/lua_cpp/include/lua.h` (line 415) -**Implementation:** `/home/user/lua_cpp/src/core/lapi.cpp` (lines 1329-1334) - -**Behavior:** -- Changes allocator and user data for a running state -- Thread-safe (uses lock) -- Allows switching allocators during runtime (advanced use case) - -**Warning:** Changing allocators while memory is outstanding can cause problems if the new allocator uses different memory pools than the old one. - ---- - -## 3. Internal Allocation Functions - -### 3.1 Core Allocation Macros and Functions - -All internal Lua code uses macros that ultimately call these functions: - -```c -/* Macros (defined in lmem.h) */ -#define luaM_malloc_(L, size, tag) /* allocate size bytes */ -#define luaM_realloc_(L, block, osize, nsize) /* reallocate block */ -#define luaM_saferealloc_(L, block, osize, nsize) /* realloc or throw error */ -#define luaM_free_(L, block, osize) /* free block of size osize */ - -/* Higher-level convenience macros */ -#define luaM_new(L, t) /* allocate single object of type t */ -#define luaM_newvector(L, n, t) /* allocate array of n objects */ -#define luaM_newobject(L, tag, s) /* allocate with GC tag */ -#define luaM_free(L, b) /* free single object */ -#define luaM_freearray(L, b, n) /* free array of n objects */ -``` - -**Location:** `/home/user/lua_cpp/src/memory/lmem.h` (lines 52-93) - -### 3.2 Internal Function Implementations - -```cpp -/* Generic reallocation - core function */ -void *luaM_realloc_ (lua_State *L, void *block, size_t oldsize, size_t newsize); - -/* Safe reallocation - throws error on failure */ -void *luaM_saferealloc_ (lua_State *L, void *block, size_t oldsize, size_t newsize); - -/* Memory allocation (new blocks only) */ -void *luaM_malloc_ (lua_State *L, size_t size, int tag); - -/* Memory deallocation */ -void luaM_free_ (lua_State *L, void *block, size_t osize); - -/* Array growth helper - doubles size exponentially */ -void *luaM_growaux_ (lua_State *L, void *block, int nelems, int *size, - unsigned size_elem, int limit, const char *what); - -/* Array shrinking helper - used in parser cleanup */ -void *luaM_shrinkvector_ (lua_State *L, void *block, int *size, - int final_n, unsigned size_elem); -``` - -**Location:** `/home/user/lua_cpp/src/memory/lmem.h` (lines 82-93) -**Implementation:** `/home/user/lua_cpp/src/memory/lmem.cpp` (lines 97-215) - ---- - -## 4. Allocation Flow and GC Integration - -### 4.1 Fundamental Principle: GC Debt Tracking - -Lua uses a **GC debt** mechanism to track memory that's been allocated but not yet accounted for by the GC. This allows the GC to: -- Trigger automatically when debt exceeds thresholds -- Run emergency collections when allocations fail -- Maintain a predictable pause schedule - -**Key Data Structure:** -```cpp -class GCAccounting { -private: - l_mem totalbytes; /* Total allocated bytes + debt */ - l_mem debt; /* Bytes counted but not yet allocated */ - l_mem marked; /* Objects marked in current GC cycle */ - l_mem majorminor; /* Counter to control major-minor shifts */ - // ... -}; -``` - -**Location:** `/home/user/lua_cpp/src/core/lstate.h` (lines 661-686) - -### 4.2 Allocation Flow: `luaM_realloc_` - -``` -luaM_realloc_(L, block, osize, nsize) - │ - ├─ Get global_State from L - │ - ├─ Try initial allocation via allocator - │ Call: callfrealloc(g, block, osize, nsize) - │ │ = (*g->getFrealloc())(g->getUd(), block, osize, nsize) - │ - ├─ If allocation failed AND nsize > 0: - │ │ - │ └─ Try emergency collection + retry: - │ ├─ Check: canTryAgain(g) - │ │ = g->isComplete() && !g->getGCStopEm() - │ │ - │ ├─ If yes: luaC_fullgc(L, 1) /* run full GC */ - │ ├─ Then retry: callfrealloc(g, block, osize, nsize) - │ │ - │ └─ If still fails: return NULL (caller must handle) - │ - ├─ Update GC debt: - │ debt -= (nsize - osize) /* negative debt = credit */ - │ - └─ Return new block pointer -``` - -**File:** `/home/user/lua_cpp/src/memory/lmem.cpp` (lines 176-189) - -### 4.3 Safe Allocation Flow: `luaM_saferealloc_` - -``` -luaM_saferealloc_(L, block, osize, nsize) - │ - ├─ Call luaM_realloc_(...) - │ - ├─ If newblock == NULL and nsize > 0: - │ └─ Throw memory error: luaM_error(L) - │ └─ L->doThrow(LUA_ERRMEM) - │ - └─ Return newblock or throw -``` - -**File:** `/home/user/lua_cpp/src/memory/lmem.cpp` (lines 192-198) - -### 4.4 Allocation Flow: `luaM_malloc_` - -``` -luaM_malloc_(L, size, tag) - │ - ├─ If size == 0: - │ └─ Return NULL (no allocation needed) - │ - ├─ Try initial allocation via allocator - │ Call: firsttry(g, NULL, tag, size) - │ │ (tag is passed as osize for tracking purposes) - │ - ├─ If allocation failed: - │ │ - │ └─ Try emergency collection + retry - │ - ├─ If still failed: throw error (luaM_error) - │ - ├─ Update GC debt: - │ debt -= size - │ - └─ Return new block -``` - -**File:** `/home/user/lua_cpp/src/memory/lmem.cpp` (lines 201-215) - -### 4.5 Deallocation Flow: `luaM_free_` - -``` -luaM_free_(L, block, osize) - │ - ├─ Check invariant: (osize == 0) == (block == NULL) - │ - ├─ Call allocator to free: - │ callfrealloc(g, block, osize, 0) - │ - ├─ Update GC debt: - │ debt += osize /* increase debt (now has credit) */ - │ - └─ Return (deallocation always succeeds) -``` - -**File:** `/home/user/lua_cpp/src/memory/lmem.cpp` (lines 150-155) - ---- - -## 5. GC Debt and Automatic Collection - -### 5.1 Debt Semantics - -- **Positive debt**: Memory allocated but not yet counted by GC (most common) -- **Negative debt**: Credits available for allocation without triggering GC - -**Equation:** -``` -actual_allocated = totalbytes - debt -``` - -When `debt` becomes large (positive), GC runs and reduces it. - -### 5.2 Emergency Collection Conditions - -Collections are triggered only if: -```cpp -cantryagain(g) = g->isComplete() && !g->getGCStopEm() -``` - -Where: -- **`g->isComplete()`**: Global state fully initialized (not in initialization phase) -- **`!g->getGCStopEm()`**: Not already in a GC collection step (prevents recursive GC) - -**File:** `/home/user/lua_cpp/src/memory/lmem.cpp` (line 58) - -### 5.3 GC Parameters - -Stored in `GCParameters` subsystem: - -```cpp -class GCParameters { -private: - lu_byte params[LUA_GCPN]; /* GC tuning parameters */ - lu_byte currentwhite; /* Current white color for GC */ - lu_byte state; /* State of garbage collector */ - lu_byte kind; /* Kind of GC running (incremental/generational) */ - lu_byte stopem; /* Stops emergency collections */ - lu_byte stp; /* Control whether GC is running */ - lu_byte emergency; /* True if this is emergency collection */ -}; -``` - -**Location:** `/home/user/lua_cpp/src/core/lstate.h` (lines 689-724) - -### 5.4 GC Configuration Constants - -**Location:** `/home/user/lua_cpp/src/memory/lgc.h` - -```cpp -/* Incremental Collector */ - -/* Number of bytes must be LUAI_GCPAUSE% before starting new cycle */ -inline constexpr int LUAI_GCPAUSE = 250; -/* Meaning: GC runs when allocated bytes = (previous_collected * 250 / 100) */ - -/* Step multiplier: The collector handles LUAI_GCMUL% work units for - each new allocated word. (Each "work unit" ≈ sweeping 1 object) */ -inline constexpr int LUAI_GCMUL = 200; -/* Meaning: For each allocated unit, GC does 200% units of work */ - -/* How many bytes to allocate before next GC step */ -inline constexpr size_t LUAI_GCSTEPSIZE = (200 * sizeof(Table)); -/* Approximately 3200-6400 bytes depending on architecture */ - -/* Generational Collector */ - -/* Minor collections will shift to major ones after LUAI_MINORMAJOR% - bytes become old. */ -inline constexpr int LUAI_MINORMAJOR = 70; - -/* Major collections will shift to minor ones after collecting at least - LUAI_MAJORMINOR% of the new bytes. */ -inline constexpr int LUAI_MAJORMINOR = 50; - -/* A young (minor) collection will run after creating LUAI_GENMINORMUL% - new bytes. */ -inline constexpr int LUAI_GENMINORMUL = 20; -``` - ---- - -## 6. Allocation Sites and Object Types - -### 6.1 Core Objects Allocated - -| Object Type | Macros Used | Subsystem | Count | -|------------|------------|-----------|-------| -| **TString** | `luaM_new`, `luaM_newobject` | lstring.cpp | Dynamic | -| **Table** | `luaM_new`, `luaM_newvector` | ltable.cpp | Dynamic | -| **Proto** | `luaM_new`, `luaM_newvector` | lfunc.cpp | Dynamic | -| **UpVal** | `luaM_new` | lfunc.cpp | Dynamic | -| **Closure** | `luaM_new`, `luaM_newvector` | lfunc.cpp | Dynamic | -| **Udata** | `luaM_newobject` | lstring.cpp | Dynamic | -| **lua_State** | `luaM_new` | lstate.cpp | One per thread | -| **global_State** | Custom via allocator | lstate.cpp | One per VM | - -### 6.2 Collection Tracking - -All GC-managed objects inherit from `GCObject` and are tracked in: - -```cpp -class GCObjectLists { -private: - GCObject *allgc; /* All collectable objects */ - GCObject *finobj; /* Objects with finalizers */ - GCObject *gray; /* Gray objects (mark phase) */ - GCObject *grayagain; /* Objects to revisit */ - GCObject *weak; /* Weak-value tables */ - GCObject *ephemeron; /* Ephemeron tables */ - GCObject *allweak; /* All-weak tables */ - GCObject *tobefnz; /* To be finalized */ - GCObject *fixedgc; /* Never collected (strings, etc.) */ - // ... more lists for generational GC -}; -``` - -**Location:** `/home/user/lua_cpp/src/core/lstate.h` (lines 727-800) - -### 6.3 Usage Examples - -**String Allocation:** -```cpp -/* In lstring.cpp */ -tb->setHash(luaM_newvector(L, MINSTRTABSIZE, TString*)); -``` - -**Table Allocation:** -```cpp -/* In ltable.cpp */ -t->setNodeArray(luaM_newvector(L, size, Node)); -char *node = luaM_newblock(L, bsize); -``` - -**Function Allocation:** -```cpp -/* In lfunc.cpp */ -Proto *p = luaM_new(L, Proto); -``` - ---- - -## 7. Integration Points for Custom Allocators - -### 7.1 What a Custom Allocator Must Do - -When providing a custom `lua_Alloc` function, it must: - -1. **Handle all three operations** (allocation, reallocation, deallocation): - ```c - void *my_alloc(void *ud, void *ptr, size_t osize, size_t nsize) { - if (nsize == 0) { - // Deallocation: free(ptr), return NULL - } else if (ptr == NULL) { - // Allocation: malloc(nsize), return pointer or NULL - } else { - // Reallocation: realloc(ptr, nsize), return pointer or NULL - } - } - ``` - -2. **Maintain size tracking**: Lua passes both old and new sizes; allocator can use `osize` to verify consistency - -3. **Return NULL on failure**: Only valid for `nsize > 0` (allocation/reallocation) - -4. **Support NULL pointers**: Must handle `free(NULL)` correctly (returning NULL is fine) - -5. **Preserve invariants**: - - `(osize == 0) == (ptr == NULL)` must hold - - Must handle freeing (nsize == 0) successfully - - Must return non-NULL for successful allocations - -### 7.2 Customization Points - -**Via `ud` (user data parameter):** -- Store pointer to allocator state/pool -- Pass context to memory management system -- Track statistics per allocation - -**Example:** -```cpp -struct MyAllocState { - void* memory_pool; - size_t total_allocated; - std::map allocations; -}; - -void *my_alloc(void *ud, void *ptr, size_t osize, size_t nsize) { - MyAllocState *state = (MyAllocState *)ud; - - if (nsize == 0) { - if (ptr) { - state->allocations.erase(ptr); - free(ptr); - } - return NULL; - } - - void *newptr = realloc(ptr, nsize); - if (newptr && ptr != newptr) { - if (ptr) state->allocations.erase(ptr); - state->allocations[newptr] = nsize; - } - return newptr; -} -``` - -### 7.3 Common Use Cases - -1. **Memory limits**: Reject allocations if `state->total_allocated + nsize > limit` -2. **Statistics**: Track peak allocation, fragmentation -3. **Custom pools**: Use arena allocators, memory pools -4. **Debugging**: Log all allocations/deallocations -5. **Cleanup**: Defer actual freeing to batch later - ---- - -## 8. Error Handling - -### 8.1 Memory Errors - -When allocation fails (allocator returns NULL): - -1. **Attempt emergency GC** (if possible) -2. **Retry allocation** with freed memory -3. **Throw exception** if still fails - -```cpp -/* Define in llimits.h */ -#define luaM_error(L) (L)->doThrow(LUA_ERRMEM) -``` - -**File:** `/home/user/lua_cpp/src/memory/lmem.h` (line 17) - -### 8.2 Recovery Mechanism - -```c -/* Try allocate - fails silently */ -void *block = luaM_realloc_(L, NULL, 0, size); -if (block == NULL) { - // Caller handles NULL -} - -/* Try allocate - throws on failure */ -void *block = luaM_saferealloc_(L, NULL, 0, size); -// Never returns NULL for size > 0 -``` - -### 8.3 Overflow Protection - -Lua checks for multiplication overflow before allocating arrays: - -```cpp -/* Test whether n*e might overflow */ -#define luaM_testsize(n,e) \ - (sizeof(n) >= sizeof(size_t) && cast_sizet((n)) + 1 > MAX_SIZET/(e)) - -/* Check and error if might overflow */ -#define luaM_checksize(L,n,e) \ - (luaM_testsize(n,e) ? luaM_toobig(L) : cast_void(0)) -``` - -**File:** `/home/user/lua_cpp/src/memory/lmem.h` (lines 31-35) - ---- - -## 9. Memory Accounting and GC Interaction - -### 9.1 How GC "Knows" About Allocations - -Lua never directly asks the allocator "how much do I have allocated?" Instead: - -1. **Track allocations**: Each `luaM_malloc_`, `luaM_realloc_`, `luaM_free_` updates `debt` -2. **Periodically run GC**: When debt exceeds threshold, run a GC cycle -3. **GC marks objects**: Marks all reachable objects -4. **GC sweeps**: Frees unmarked objects and updates `totalbytes` - -### 9.2 Example: Allocation → GC Trigger → Collection - -``` -1. Application allocates 5MB - └─ luaM_malloc_ called - └─ debt increases by 5MB - └─ gcdebt = 5MB - -2. GC checks threshold - └─ if (gcdebt > threshold) → run GC - -3. GC cycle runs - └─ Mark all reachable objects - └─ Sweep unmarked objects - └─ Update totalbytes (actual allocated) - └─ Update debt (new pending) - -4. If memory freed - └─ debt decreases (or becomes negative) - └─ Application can proceed -``` - -### 9.3 GC State Machine - -``` -Pause → Propagate → Atomic → SweepAllGC → ... → SweepEnd → Pause -``` - -Debt tracking happens at each state transition. - ---- - -## 10. Special Allocator Features - -### 10.1 External String Allocators - -Strings can use separate allocators: - -```c -const char *(lua_pushexternalstring)(lua_State *L, - const char *s, size_t len, lua_Alloc falloc, void *ud); -``` - -**Location:** `/home/user/lua_cpp/include/lua.h` (line 284) - -This allows strings to be allocated from different memory pools than the rest of Lua. - -### 10.2 GC Tags for Allocation Context - -Some allocation macros include a "tag" parameter: - -```cpp -#define luaM_newobject(L, tag, s) luaM_malloc_(L, (s), tag) -``` - -The tag is passed to the allocator's `osize` parameter to indicate object type. - ---- - -## 11. Key Constraints and Requirements - -### 11.1 Allocator Contract Requirements - -| Requirement | Constraint | Reason | -|------------|-----------|--------| -| **Freeing size tracking** | `osize` must match allocation size | GC debt accounting | -| **NULL handling** | `free(NULL)` must be safe | Standard C semantics | -| **Failure semantics** | Only fail for `nsize > 0` | GC debt is never lost | -| **Consistency** | Allocator must be reentrant | GC can trigger during allocation | -| **Invariants** | `(osize == 0) == (ptr == NULL)` | Lua asserts this | - -### 11.2 No Alignment Requirements - -Lua doesn't require specific alignment. The allocator can return any properly allocated pointer. - -### 11.3 No Size Queries - -Lua never calls allocator to query allocated size. All size tracking is internal. - ---- - -## 12. Memory Limits and Configuration - -### 12.1 Size Limits - -```cpp -#define MAX_SIZE /* Minimum of MAX_SIZET and LUA_MAXINTEGER */ -#define MAX_LMEM /* Maximum l_mem value */ -``` - -**Location:** `/home/user/lua_cpp/src/memory/llimits.h` (lines 38-61) - -### 12.2 Buffer Size - -```cpp -#define LUAL_BUFFERSIZE \ - ((int)(16 * sizeof(void*) * sizeof(lua_Number))) -``` - -**Location:** `/home/user/lua_cpp/include/luaconf.h` (line 720) - -Default auxiliary library buffer size, configurable. - -### 12.3 Maximum Alignment - -```cpp -#define LUAI_MAXALIGN \ - lua_Number n; double u; void *s; lua_Integer i; long l -``` - -**Location:** `/home/user/lua_cpp/include/luaconf.h` (line 727) - -Used in `TValue` union to ensure proper alignment. - ---- - -## 13. Allocation Sites Distribution - -**Files using allocation macros (58 calls across):** - -- **Memory/Core**: lmem.cpp -- **Objects**: ltable.cpp, lstring.cpp, lfunc.cpp, lobject.cpp -- **Core VM**: lstate.cpp, ldo.cpp, ldebug.cpp -- **Compiler**: lparser.cpp, lcode.cpp -- **Serialization**: lundump.cpp - -**Most allocations by type:** -1. Tables and hash nodes -2. Strings (table, interning) -3. Functions and closures -4. Arrays in parser - ---- - -## 14. Summary and Integration Checklist - -### For Implementing Custom Allocator: - -- [ ] Implement `lua_Alloc` function with proper signature -- [ ] Handle all three operations: allocation, reallocation, deallocation -- [ ] Support `ud` parameter for custom context -- [ ] Never fail on deallocation (nsize == 0) -- [ ] Can return NULL only for allocations (nsize > 0) -- [ ] Pass to `lua_newstate()` or `lua_setallocf()` -- [ ] Track statistics if needed via `ud` -- [ ] Support emergency GC scenario (multiple failed attempts) - -### For Understanding Memory Flow: - -- [ ] GC debt is the key accounting mechanism -- [ ] Allocations add to debt -- [ ] GC runs when debt exceeds threshold -- [ ] Emergency collection happens on allocation failure -- [ ] No separate allocator queries needed -- [ ] Lua handles all size tracking internally - -### Critical Files: - -| File | Purpose | -|------|---------| -| `lmem.h` | Public allocation interface | -| `lmem.cpp` | Allocation implementation with GC debt | -| `lstate.h` | MemoryAllocator and GCAccounting subsystems | -| `lstate.cpp` | Allocator initialization in `lua_newstate` | -| `lapi.cpp` | Public API for changing allocators | -| `lgc.h` | GC parameters and debt triggering | - diff --git a/docs/NEXT_TASKS_RECOMMENDATIONS.md b/docs/NEXT_TASKS_RECOMMENDATIONS.md deleted file mode 100644 index d71fe77d..00000000 --- a/docs/NEXT_TASKS_RECOMMENDATIONS.md +++ /dev/null @@ -1,556 +0,0 @@ -# Next Tasks Recommendations - -**Date**: 2025-11-18 -**Current Status**: Phase 100 complete - All major enums converted to enum class -**Purpose**: Prioritized roadmap for continued project improvement - ---- - -## Executive Summary - -With Phase 100 complete and major architectural milestones achieved (100% encapsulation, SRP refactoring, GC modularization, enum class conversion), the project is at an excellent point to focus on **infrastructure, quality assurance, and completing modernization**. - -**Current State**: -- ✅ 19/19 classes fully encapsulated -- ✅ Phases 1-100 complete -- ✅ GC modularization: lgc.cpp reduced from 1,950 → 936 lines (52% reduction) -- ✅ 6 GC modules extracted (gc_core, gc_marking, gc_collector, gc_sweeping, gc_finalizer, gc_weak) -- ✅ ~500 macros converted (37% of convertible) -- ✅ Zero warnings, all tests passing -- ✅ Performance: 4.20s baseline (target ≤4.33s) - ---- - -## 🎯 Top Priority Tasks - -### 1. **Add CI/CD Infrastructure** ⭐⭐⭐ (HIGHEST PRIORITY) - -**Status**: No automated testing detected -**Effort**: 4-6 hours -**Risk**: Very Low -**Value**: Very High - -**Why This is Critical**: -- Protects 100 phases of hard work from regressions -- Automated quality gates on every PR -- Performance regression detection -- Build verification across compilers -- Professional development workflow - -**Implementation Plan**: - -```yaml -# .github/workflows/ci.yml -name: CI/CD Pipeline - -on: [push, pull_request] - -jobs: - build-and-test: - strategy: - matrix: - compiler: [gcc-13, clang-16] - build-type: [Release, Debug] - - performance-check: - - Run 5x benchmark - - Fail if avg > 4.33s - - Compare to baseline - - code-quality: - - Check warnings (-Werror) - - Run static analysis - - Verify formatting -``` - -**Deliverables**: -- `.github/workflows/ci.yml` - Main CI pipeline -- `.github/workflows/benchmark.yml` - Performance tracking -- GitHub Actions badge in README -- Automated PR checks - -**Success Criteria**: -- ✅ All tests run automatically -- ✅ Performance regressions caught -- ✅ Build verified on GCC and Clang -- ✅ Zero-click validation for PRs - ---- - -### 2. **Add Test Coverage Metrics** ⭐⭐ - -**Status**: No gcov/lcov integration detected -**Effort**: 2-3 hours -**Risk**: Low -**Value**: High - -**Benefits**: -- Understand current test coverage -- Identify untested code paths -- Guide future testing efforts -- Coverage badge for README -- Historical coverage tracking - -**Implementation**: -```cmake -# CMakeLists.txt additions -option(LUA_ENABLE_COVERAGE "Enable coverage reporting" OFF) - -if(LUA_ENABLE_COVERAGE) - add_compile_options(--coverage -fprofile-arcs -ftest-coverage) - add_link_options(--coverage) -endif() -``` - -**Integration**: -- Add to GitHub Actions workflow -- Generate HTML coverage reports -- Optional: Upload to Codecov.io -- Track coverage trends over time - -**Deliverables**: -- CMake coverage option -- CI job for coverage generation -- Coverage report artifacts -- Documentation in CMAKE_BUILD.md - ---- - -### 3. **Complete Remaining Macro Conversions** ⭐⭐ - -**Status**: ~500 converted (37%), ~75 remain -**Effort**: 8-10 hours total -**Risk**: Low (well-established pattern) -**Value**: Medium-High - -**Remaining Batches**: - -**Batch 1: lopcodes.h - Instruction Manipulation (25 macros)** -- Effort: 2-3 hours -- Priority: High (VM critical) -- Examples: `GETARG_A`, `SETARG_Bx`, `CREATE_ABC` - -**Batch 2: llimits.h - Utility Macros (10-15 macros)** -- Effort: 1-2 hours -- Priority: Medium -- Examples: `cast_*`, `check_exp`, utility functions - -**Batch 3: lctype.h - Character Type Checks (10 macros)** -- Effort: 1 hour -- Priority: Low -- Examples: `lisdigit`, `lisspace`, `lisalpha` - -**Batch 4: Miscellaneous Simple Macros (15 macros)** -- Effort: 2 hours -- Priority: Low -- Various simple expression macros - -**Keep as Macros** (Do NOT convert): -- Configuration macros (SIZE_*, LUA_IDSIZE, etc.) -- Token-pasting macros (setgcparam, etc.) -- Public API macros (lua.h, lauxlib.h) - -**Conversion Pattern**: -```cpp -// Before -#define GETARG_A(i) getarg(i, POS_A, SIZE_A) - -// After -inline constexpr int GETARG_A(Instruction i) noexcept { - return getarg(i, POS_A, SIZE_A); -} -``` - -**Success Criteria**: -- All convertible macros → inline functions -- Zero performance regression (≤4.33s) -- All tests passing -- Documented in CLAUDE.md - ---- - -## 🔍 Secondary Priority Tasks - -### 4. **Document GC Modularization Achievement** ⭐ - -**Observation**: Your GC work is **impressive** but undocumented! - -**Current State**: -- lgc.cpp: 1,950 lines → 936 lines (52% reduction!) ✅ -- 6 modules extracted in src/memory/gc/: - - gc_core.cpp/h (2,139 + 3,620 lines) - - gc_marking.cpp/h (5,174 + 14,682 lines) - - gc_collector.cpp/h (4,573 + 12,186 lines) - - gc_sweeping.cpp/h (2,857 + 8,748 lines) - - gc_finalizer.cpp/h (2,871 + 7,213 lines) - - gc_weak.cpp/h (3,122 + 11,780 lines) - -**Effort**: 1-2 hours -**Deliverable**: `GC_MODULARIZATION_SUMMARY.md` - -**Contents**: -- Overview of extraction work -- Before/after metrics (line counts, module breakdown) -- Architecture improvements -- Performance impact (if benchmarked) -- Benefits for maintainability -- Lessons learned - -**Why This Matters**: -- Documents major architectural achievement -- Helps future contributors understand GC structure -- Similar value to REFACTORING_SUMMARY.md (SRP work) -- Completes the GC simplification story - ---- - -### 5. **Static Analysis Integration** ⭐ - -**Status**: No automated static analysis detected -**Effort**: 3-4 hours -**Risk**: Low -**Value**: Medium - -**Tools to Integrate**: - -**clang-tidy** - Modern C++ best practices -```yaml -# .clang-tidy -Checks: > - modernize-*, - performance-*, - readability-*, - bugprone-* -``` - -**cppcheck** - Additional static analysis -- Unused functions -- Memory leaks -- Null pointer dereferences - -**include-what-you-use** - Header optimization -- Minimize header dependencies -- Forward declarations -- Include cleanup - -**Integration**: -- Add to CI pipeline -- Pre-commit hooks (optional) -- Fix identified issues incrementally - -**Benefits**: -- Catch bugs early -- Enforce modern C++ patterns -- Improve code quality -- Reduce technical debt - ---- - -### 6. **Investigate Remaining TODOs** ⭐ - -**Status**: Only 6 TODO/FIXME comments found -**Locations**: -- src/objects/lobject.cpp (4 occurrences) -- src/libraries/loslib.cpp (1 occurrence) -- src/libraries/lstrlib.cpp (1 occurrence) - -**Effort**: 1-2 hours -**Risk**: Low - -**Actions**: -1. Review each TODO -2. Determine if still relevant -3. Create issues for non-trivial work -4. Fix trivial items immediately -5. Remove obsolete TODOs - ---- - -## 📊 Analysis Tasks - -### 7. **Performance Profiling Session** ⭐⭐ - -**Goal**: Deep understanding of performance characteristics -**Effort**: 2-3 hours -**Value**: High (learning + optimization opportunities) - -**Tools**: -```bash -# perf profiling -perf record -g ../build/lua all.lua -perf report - -# Cachegrind -valgrind --tool=cachegrind ../build/lua all.lua -cg_annotate cachegrind.out.* - -# Callgrind -valgrind --tool=callgrind ../build/lua all.lua -kcachegrind callgrind.out.* -``` - -**Analysis Goals**: -- Identify hot functions beyond lvm.cpp -- Cache miss patterns -- Branch mispredictions -- Memory access patterns -- Function call overhead - -**Deliverable**: `PERFORMANCE_PROFILE_2025.md` - -**Contents**: -- Top 20 hot functions -- Cache analysis -- Optimization opportunities -- Comparison with original C implementation -- Recommendations for Phase 101+ - ---- - -### 8. **Memory Layout Optimization Analysis** ⭐ - -**Goal**: Optimize struct layouts for cache efficiency -**Effort**: 4-6 hours -**Risk**: Medium (changes memory layout) -**Value**: Medium-High - -**Analysis**: -```bash -# Check struct sizes and padding -pahole build/lua - -# Common issues: -# - Padding between fields -# - Cache line splits -# - False sharing -``` - -**Key Structures to Analyze**: -- `lua_State` (hot path) -- `Table` (very common) -- `TValue` (everywhere) -- `CallInfo` (call stack) -- `global_State` (singleton) - -**Optimizations**: -1. Reorder fields by access patterns -2. Group frequently-accessed fields -3. Align to cache lines (64 bytes) -4. Minimize padding -5. Consider `[[no_unique_address]]` for empty bases - -**Success Criteria**: -- Documented current layouts -- Identified optimization opportunities -- Measured performance impact -- No regression (≤4.33s) - ---- - -## 🎨 Polish & Documentation Tasks - -### 9. **Enhance Project Documentation** ⭐ - -**Current State**: Good technical docs, could improve discoverability -**Effort**: 3-4 hours -**Value**: Medium - -**Improvements**: - -**README.md Enhancements**: -- Add CI/CD badges (build status, coverage) -- Visual architecture diagram -- Feature comparison table (vs original Lua) -- Performance charts - -**New Documentation**: -- CONTRIBUTING.md - How to contribute -- CODING_STANDARDS.md - C++23 style guide -- ARCHITECTURE.md - High-level overview with diagrams - -**Visual Aids**: -```mermaid -# Class hierarchy diagram -# Module dependency graph -# GC phase state machine -# Call/return flow -``` - -**Benefits**: -- Easier onboarding for contributors -- Professional appearance -- Clear standards -- Better understanding of architecture - ---- - -### 10. **Code Cleanup Sweep** ⭐ - -**Goal**: Final polish pass -**Effort**: 2-3 hours -**Risk**: Very Low -**Value**: Low-Medium - -**Tasks**: - -**Remove Dead Code**: -- Commented-out code (if any) -- Unused functions/methods -- Obsolete includes - -**Formatting**: -- Consistent style (clang-format) -- Whitespace cleanup -- Comment formatting - -**Modern C++ Attributes**: -```cpp -[[nodiscard]] inline bool isEmpty() const noexcept; -[[maybe_unused]] inline void debugPrint() const; -[[fallthrough]] // in switch statements -``` - -**Documentation Comments**: -- Add missing function documentation -- Doxygen-style comments -- Parameter descriptions - ---- - -## 📅 Recommended Timeline - -### Phase 101: Infrastructure & Quality (Week 1) -- **Day 1-2**: CI/CD implementation ✅ -- **Day 3**: Test coverage integration -- **Day 4**: Static analysis setup -- **Day 5**: Documentation updates - -**Deliverables**: Production-ready quality gates - ---- - -### Phase 102: Complete Modernization (Week 2) -- **Day 1-2**: lopcodes.h macro conversion (25 macros) -- **Day 3**: llimits.h macro conversion (15 macros) -- **Day 4**: lctype.h + misc macros (25 macros) -- **Day 5**: GC modularization documentation - -**Deliverables**: Macro conversion milestone complete - ---- - -### Phase 103: Analysis & Optimization (Week 3) -- **Day 1-2**: Performance profiling session -- **Day 3-4**: Memory layout analysis -- **Day 5**: TODO cleanup + code polish - -**Deliverables**: Optimization roadmap - ---- - -## 🏆 Success Metrics - -### After Phase 101 (Infrastructure) -- ✅ CI/CD pipeline active -- ✅ Automated testing on every PR -- ✅ Performance regression detection -- ✅ Coverage reporting -- ✅ Static analysis integrated - -### After Phase 102 (Modernization) -- ✅ 100% convertible macros → inline functions -- ✅ GC work documented -- ✅ All major modernization complete - -### After Phase 103 (Optimization) -- ✅ Performance profile documented -- ✅ Optimization opportunities identified -- ✅ Memory layouts analyzed -- ✅ Clean codebase - ---- - -## 🎯 My Strong Recommendation - -**Start with Task #1: CI/CD Infrastructure** - -**Rationale**: -1. ✅ **Protects investment** - 100 phases of work deserve protection -2. ✅ **Enables velocity** - Faster development with automated checks -3. ✅ **Professional quality** - Production-ready project -4. ✅ **Low risk** - Pure additive, no code changes -5. ✅ **High value** - Benefits every future change -6. ✅ **Quick win** - 4-6 hours for complete pipeline - -**Immediate Benefits**: -- Catch regressions automatically -- Build verification across compilers -- Performance tracking over time -- Professional appearance -- Easier collaboration - -**Then follow with**: -- Task #2 (Coverage) - Understand test gaps -- Task #3 (Macros) - Complete modernization milestone -- Task #4 (GC docs) - Document achievements - ---- - -## 📊 Project Health Summary - -### Strengths -- ✅ 100% encapsulation achieved -- ✅ Comprehensive documentation (29 files) -- ✅ Zero warnings, all tests passing -- ✅ Excellent performance (within 3% of baseline) -- ✅ Modern C++23 throughout -- ✅ Major refactorings complete (SRP, GC modularization) - -### Gaps (Addressable) -- ⚠️ No CI/CD automation -- ⚠️ No coverage metrics -- ⚠️ 37% macro conversion (63% remain) -- ⚠️ No static analysis -- ⚠️ GC work undocumented - -### Technical Debt -- ✅ Very Low - Most major work complete -- 6 TODOs in codebase -- Minor cleanup opportunities -- Documentation could be enhanced - ---- - -## 🔮 Future Opportunities (Beyond Phase 103) - -### Advanced Optimizations -- Profile-guided optimization (PGO) -- Link-time optimization (LTO) tuning -- SIMD vectorization opportunities -- Memory allocator optimization - -### Modernization -- C++23 modules (when compiler support matures) -- Coroutine integration (for Lua coroutines) -- std::expected for error handling -- Ranges library integration - -### Architectural -- Optional features (compile-time configuration) -- Plugin system for extensions -- Embedded system optimizations -- Multi-threading investigation - ---- - -## Conclusion - -The project is at an **excellent milestone** (Phase 100) with solid architecture, comprehensive documentation, and good performance. The natural next step is to **add infrastructure** (CI/CD, coverage, static analysis) to protect this investment and enable faster, safer development going forward. - -Completing macro conversion (Task #3) would achieve **100% modernization** of all convertible constructs, marking a major project milestone. - ---- - -**Next Action**: Implement CI/CD (Task #1) for immediate quality improvements - -**Last Updated**: 2025-11-18 diff --git a/docs/PHASE_115_SUMMARY.md b/docs/PHASE_115_SUMMARY.md deleted file mode 100644 index ba4bdee8..00000000 --- a/docs/PHASE_115_SUMMARY.md +++ /dev/null @@ -1,394 +0,0 @@ -# Phase 115: std::span Adoption - Summary - -**Date**: November 21, 2025 -**Status**: Phases 115.1-115.2 Complete, Phase 115.3 Deferred -**Performance**: ⚠️ 4.70s avg (11.9% regression from 4.20s baseline) - ---- - -## Overview - -Phase 115 focused on adopting `std::span` to improve type safety and code clarity throughout the Lua C++ codebase. This phase built upon Phase 112's addition of span accessors to Proto and ProtoDebugInfo. - -### Objectives -- ✅ Add std::span support for string operations (Phase 115.1) -- ✅ Use existing Proto span accessors throughout codebase (Phase 115.2) -- ⏸️ Add Table::getArraySpan() accessor (Phase 115.3 - DEFERRED) - ---- - -## Phase 115.1: String Operations (COMPLETED) - -### Files Modified: 7 files, 40+ sites - -**Core String Functions** (lstring.h/cpp): -- `luaS_hash()` - String hashing -- `luaS_newlstr()` - String creation -- `internshrstr()` - String interning (internal) - -**String Utilities** (lobject.h/cpp): -- `luaO_chunkid()` - Chunk ID formatting -- `addstr2buff()` - Buffer string operations -- `addstr()` - String append helper - -**Pattern Matching** (lstrlib.cpp): -- `lmemfind()` - Memory search -- `nospecials()` - Pattern check (now returns `bool`!) -- `prepstate()` - Match state preparation - -**Buffer Operations** (lauxlib.h/cpp): -- `luaL_addlstring()` - Buffer string addition - -### Architecture: Dual-API Pattern - -**Initial Approach** (span-primary): 17% regression (4.91s) -```cpp -// Primary implementation used span -LUAI_FUNC unsigned luaS_hash(std::span str, unsigned seed); - -// Wrapper for C compatibility -inline unsigned luaS_hash(const char *str, size_t l, unsigned seed) { - return luaS_hash(std::span(str, l), seed); -} -``` - -**Optimized Approach** (pointer-primary): 8% improvement -```cpp -// Primary implementation uses pointer+size for hot paths -LUAI_FUNC unsigned luaS_hash(const char *str, size_t l, unsigned seed); - -// Convenience overload for new code -inline unsigned luaS_hash(std::span str, unsigned seed) { - return luaS_hash(str.data(), str.size(), seed); -} -``` - -**Key Insight**: Hot paths must avoid unnecessary span construction. The dual-API pattern provides: -- ✅ Zero overhead for existing code paths -- ✅ Span convenience for new code -- ✅ C API compatibility through function overloading -- ✅ Gradual adoption without forcing conversions - -### Performance Impact - -**Initial**: 4.91s avg (17% regression) -**After optimization**: 4.53s avg (7.8% regression) -**Best individual runs**: 4.05s, 4.06s (better than 4.20s baseline!) - -**Commits**: -- `0aa81ee` - Initial span adoption -- `08c8774` - Optimization (pointer-primary pattern) - ---- - -## Phase 115.2: Proto Span Accessors (COMPLETED) - -### Files Modified: 2 files, 23 sites - -**ldebug.cpp** (8 conversions): -- `getbaseline()` → `getAbsLineInfoSpan()` with `std::upper_bound` -- `luaG_getfuncline()` → `getLineInfoSpan()` -- `nextline()` → `getLineInfoSpan()` -- `collectvalidlines()` → `getLineInfoSpan()` + `getCodeSpan()` -- `changedline()` → `getLineInfoSpan()` - -**lundump.cpp** (15 conversions): -- `loadCode()` → `getCodeSpan()` -- `loadConstants()` → `getConstantsSpan()` with range-based for -- `loadUpvalues()` → `getUpvaluesSpan()` with range-based for -- `loadDebug()` → `getLineInfoSpan()`, `getAbsLineInfoSpan()`, `getLocVarsSpan()` - -### Code Examples - -**Before** (ldebug.cpp): -```cpp -static int getbaseline (const Proto *f, int pc, int *basepc) { - if (f->getAbsLineInfoSize() == 0 || pc < f->getAbsLineInfo()[0].getPC()) { - *basepc = -1; - return f->getLineDefined(); - } - const AbsLineInfo* absLineInfo = f->getAbsLineInfo(); - int size = f->getAbsLineInfoSize(); - auto it = std::upper_bound(absLineInfo, absLineInfo + size, pc, ...); - // ... -} -``` - -**After**: -```cpp -static int getbaseline (const Proto *f, int pc, int *basepc) { - auto absLineInfoSpan = f->getDebugInfo().getAbsLineInfoSpan(); - if (absLineInfoSpan.empty() || pc < absLineInfoSpan[0].getPC()) { - *basepc = -1; - return f->getLineDefined(); - } - auto it = std::upper_bound(absLineInfoSpan.begin(), absLineInfoSpan.end(), pc, ...); - // ... -} -``` - -**Benefits**: -- No separate size variable needed -- Bounds checking in debug builds -- Standard algorithms work naturally with span iterators -- Clearer intent (array view, not raw pointer manipulation) - -### Performance Impact - -**After Phase 115.2**: 4.70s avg (11.9% regression from 4.20s baseline) - -**Commits**: -- `6f830e7` - ldebug.cpp conversions -- `943a3ef` - lundump.cpp conversions - ---- - -## Phase 115.3: Table Arrays (DEFERRED) - -### Reason for Deferral - -Performance after Phases 115.1-115.2: -- **Current**: 4.70s avg (range: 4.56s-4.87s) -- **Target**: ≤4.33s (3% tolerance from 4.20s baseline) -- **Regression**: 11.9% above baseline - -**Decision**: Phase 115.3 was marked as "optional, if no regression". Given the current 11.9% regression, proceeding with Table array conversions (marked as MEDIUM RISK in the analysis) is not advisable. - -### What Phase 115.3 Would Have Done - -**Proposed**: -```cpp -class Table { -public: - std::span getArraySpan() noexcept { - return std::span(array, asize); - } - std::span getArraySpan() const noexcept { - return std::span(array, asize); - } -}; - -// Usage -for (Value& slot : t->getArraySpan()) { - // Safer iteration, prevents off-by-one errors -} -``` - -**Estimated Impact**: 10-15 array iteration loops in ltable.cpp, lvm_table.cpp - -**Risk Assessment**: MEDIUM - Table operations are performance-sensitive, and we're already above target. - ---- - -## Performance Analysis - -### Benchmark Results (5-run average) - -| Phase | Avg Time | Min | Max | Variance | vs Baseline | -|-------|----------|-----|-----|----------|-------------| -| Baseline (4.20s) | 4.20s | - | - | - | 0% | -| After 115.1 (initial) | 4.91s | - | - | - | +17% | -| After 115.1 (optimized) | 4.53s | 4.05s | 4.98s | ~1s | +7.8% | -| After 115.2 | 4.70s | 4.56s | 4.87s | 0.31s | +11.9% | - -### Performance Observations - -1. **High Variance**: 0.31s-1s spread suggests system load factors -2. **Best Individual Runs**: 4.05s, 4.06s beat the 4.20s baseline -3. **Span Construction Overhead**: Initial 17% regression demonstrated that span construction in hot paths is costly -4. **Pointer-Primary Pattern**: Reduced regression from 17% to 7.8% (8% improvement) -5. **Phase 115.2 Impact**: 4.53s → 4.70s (3.7% degradation) - -### Root Cause Investigation Needed - -**Potential Issues**: -1. ❓ **Compiler optimization barriers**: Are spans preventing optimizations? -2. ❓ **Debug info overhead**: .data() calls on spans might not fully optimize -3. ❓ **System variance**: Wide ranges suggest external factors -4. ❓ **Test methodology**: Single-run vs multi-run averages - -**Recommendation**: Before proceeding with Phase 115.3 or additional span adoption: -1. Investigate why Phase 115.2 added 3.7% overhead (should be zero-cost) -2. Profile hot paths to identify bottlenecks -3. Consider selective reversion if specific conversions are problematic -4. Validate with multiple benchmark runs under controlled conditions - ---- - -## Benefits Achieved - -Despite performance concerns, Phase 115 delivered significant code quality improvements: - -### Type Safety -✅ Size information included in span type -✅ Compile-time detection of size mismatches -✅ Bounds checking in debug builds (`-D_GLIBCXX_DEBUG`) - -### Modern C++ Idioms -✅ Range-based for loops (13 sites converted) -✅ Standard algorithms work with span iterators -✅ Cleaner interfaces (no separate pointer+size parameters) - -### Maintainability -✅ Reduced pointer arithmetic (23 sites) -✅ Clearer intent (array views vs raw pointers) -✅ Fewer magic size variables - -### Code Examples - -**Range-based for** (lundump.cpp): -```cpp -// Before -std::for_each_n(f->getConstants(), n, [](TValue& v) { - setnilvalue(&v); -}); - -// After -auto constantsSpan = f->getConstantsSpan(); -for (TValue& v : constantsSpan) { - setnilvalue(&v); -} -``` - -**Eliminated separate size** (ldebug.cpp): -```cpp -// Before -const AbsLineInfo* absLineInfo = f->getAbsLineInfo(); -int size = f->getAbsLineInfoSize(); -auto it = std::upper_bound(absLineInfo, absLineInfo + size, pc, ...); - -// After -auto absLineInfoSpan = f->getDebugInfo().getAbsLineInfoSpan(); -auto it = std::upper_bound(absLineInfoSpan.begin(), absLineInfoSpan.end(), pc, ...); -``` - ---- - -## Lessons Learned - -### 1. Zero-Cost Abstraction Isn't Always Zero-Cost - -**Expected**: std::span should compile to identical code as pointer+size -**Reality**: Span construction in hot paths added measurable overhead -**Solution**: Dual-API pattern keeps hot paths fast while enabling span where convenient - -### 2. Measurement is Critical - -- Initial span-primary approach: 17% regression (would have failed) -- Pointer-primary optimization: Reduced to 7.8% regression -- Continuous benchmarking caught issues early - -### 3. Gradual Adoption Works Better - -- Don't force existing code to use spans -- Provide span overloads for new code -- Let natural code evolution adopt spans where beneficial -- Keep performance-critical paths unchanged - -### 4. Profile Before Proceeding - -Phase 115.2 added unexpected 3.7% overhead. Before continuing: -- Need to understand why "zero-cost" abstractions have cost -- Should profile to identify specific hot spots -- May need to be more selective about conversions - ---- - -## Recommendations - -### Immediate Actions - -1. **✅ COMPLETE Phases 115.1-115.2**: Code is functional, tests pass -2. **⏸️ DEFER Phase 115.3**: Don't add Table spans until performance improves -3. **📊 INVESTIGATE**: Profile to understand 11.9% regression source -4. **🎯 OPTIMIZE**: Target bringing performance back to ≤4.33s - -### Performance Recovery Options - -**Option A: Accept Current State** -- 11.9% regression is significant but code is more maintainable -- May be acceptable trade-off for type safety benefits -- Document as known issue, revisit if critical - -**Option B: Selective Reversion** -- Profile to find hot spots -- Revert specific span conversions if they're bottlenecks -- Keep spans in cold paths (debug info, serialization) - -**Option C: Compiler Investigation** -- Try different optimization flags (`-O3`, `-flto`, `-march=native`) -- Check if specific GCC/Clang versions optimize spans better -- Investigate PGO (Profile-Guided Optimization) - -**Option D: Further Optimization** -- Review lundump.cpp conversions (added 3.7% overhead) -- Consider caching spans instead of recreating -- Ensure `inline` hints are respected - -### Future Work - -**If Performance Improves**: -- ✅ Proceed with Phase 115.3 (Table arrays) -- ✅ Convert remaining Proto accessor usage -- ✅ Add spans to other array-based structures - -**Additional Opportunities**: -- LuaStack range operations (analysis identified ~20 sites) -- Compiler array operations (low priority, cold path) -- Debug info iteration (low priority, rarely executed) - ---- - -## Statistics - -### Phase 115.1 -- **Files Modified**: 7 -- **Conversions**: 40+ sites -- **Commits**: 2 (0aa81ee, 08c8774) - -### Phase 115.2 -- **Files Modified**: 2 (ldebug.cpp, lundump.cpp) -- **Conversions**: 23 sites (8 + 15) -- **Commits**: 2 (6f830e7, 943a3ef) - -### Total Phase 115 -- **Files Modified**: 9 unique files -- **Conversions**: 60+ sites -- **Commits**: 4 -- **Lines Changed**: ~220 insertions, ~180 deletions -- **Performance**: 4.70s avg (target: ≤4.33s, baseline: 4.20s) -- **Test Status**: ✅ All tests passing ("final OK !!!") - ---- - -## Conclusion - -Phase 115 successfully demonstrated std::span adoption in a C++ modernization context: - -**Achievements**: -✅ Established dual-API pattern for zero-overhead span support -✅ Converted 60+ sites to use spans without breaking C API -✅ Improved type safety and code clarity -✅ Maintained test compatibility (all tests passing) - -**Challenges**: -⚠️ 11.9% performance regression (above 3% tolerance) -⚠️ "Zero-cost" abstractions showed measurable cost -⚠️ High variance suggests system factors or measurement issues - -**Status**: -- Phases 115.1-115.2: **COMPLETE** -- Phase 115.3: **DEFERRED** pending performance investigation - -**Next Steps**: -1. Profile to understand regression source -2. Consider selective optimizations or reversions -3. Document performance findings -4. Revisit Phase 115.3 when performance is within tolerance - ---- - -**Related Documents**: -- Initial Analysis: Exploration agent output (Phase 115 planning) -- Phase 112: Proto span accessor additions (foundation work) -- CLAUDE.md: Project overview and guidelines diff --git a/docs/PHASE_36_2_PLAN.md b/docs/PHASE_36_2_PLAN.md deleted file mode 100644 index 2f3a19c2..00000000 --- a/docs/PHASE_36_2_PLAN.md +++ /dev/null @@ -1,296 +0,0 @@ -# Phase 36.2: Encapsulate lua_State - Incremental Plan - -## Overview -Encapsulate 23 public fields in lua_State by adding accessor methods and updating ~375 call sites across 20 files. - -**Strategy**: Work on one field group at a time, compile after each group to catch errors early. - ---- - -## Step 1: Stack Position Fields (Most Frequent) -**Fields**: `top`, `stack`, `stack_last`, `tbclist` (type: `StkIdRel`) - -### Add to lstate.h (private section + public accessors): -```cpp -private: - StkIdRel top; - StkIdRel stack; - StkIdRel stack_last; - StkIdRel tbclist; - -public: - // Stack accessors - return references to allow `.p` access - StkIdRel& getTop() noexcept { return top; } - const StkIdRel& getTop() const noexcept { return top; } - void setTop(StkIdRel t) noexcept { top = t; } - - StkIdRel& getStack() noexcept { return stack; } - const StkIdRel& getStack() const noexcept { return stack; } - void setStack(StkIdRel s) noexcept { stack = s; } - - StkIdRel& getStackLast() noexcept { return stack_last; } - const StkIdRel& getStackLast() const noexcept { return stack_last; } - void setStackLast(StkIdRel sl) noexcept { stack_last = sl; } - - StkIdRel& getTbclist() noexcept { return tbclist; } - const StkIdRel& getTbclist() const noexcept { return tbclist; } - void setTbclist(StkIdRel tbc) noexcept { tbclist = tbc; } -``` - -### Update macro in lstate.h: -```cpp -#define stacksize(th) cast_int((th)->getStackLast().p - (th)->getStack().p) -``` - -### Update call sites: -- `L->top` → `L->getTop()` (~150 occurrences, mostly in lapi.cpp, lvm.cpp, ldo.cpp) -- `L->stack` → `L->getStack()` (~50 occurrences) -- `L->stack_last` → `L->getStackLast()` (~20 occurrences) -- `L->tbclist` → `L->getTbclist()` (~10 occurrences) - -### Build and test: -```bash -cmake --build build && cd testes && ../build/lua all.lua -``` - ---- - -## Step 2: CallInfo and Base Fields -**Fields**: `ci`, `base_ci` (already has `getCallInfo()`, keep it) - -### Add to lstate.h: -```cpp -private: - CallInfo *ci; - CallInfo base_ci; - -public: - CallInfo* getCI() noexcept { return ci; } - const CallInfo* getCI() const noexcept { return ci; } - void setCI(CallInfo* c) noexcept { ci = c; } - CallInfo** getCIPtr() noexcept { return &ci; } - - CallInfo* getBaseCI() noexcept { return &base_ci; } - const CallInfo* getBaseCI() const noexcept { return &base_ci; } -``` - -### Update call sites: -- `L->ci` → `L->getCI()` (~60 occurrences) -- `&L->base_ci` → `L->getBaseCI()` (~8 occurrences) - -### Build and test - ---- - -## Step 3: GC and State Management Fields -**Fields**: `l_G`, `openupval`, `gclist`, `twups` - -### Add to lstate.h: -```cpp -private: - global_State *l_G; - UpVal *openupval; - GCObject *gclist; - lua_State *twups; - -public: - global_State* getGlobalState() noexcept { return l_G; } // Already exists - const global_State* getGlobalState() const noexcept { return l_G; } - void setGlobalState(global_State* g) noexcept { l_G = g; } - global_State*& getGlobalStateRef() noexcept { return l_G; } // For G() macro - - UpVal* getOpenUpval() noexcept { return openupval; } - void setOpenUpval(UpVal* uv) noexcept { openupval = uv; } - UpVal** getOpenUpvalPtr() noexcept { return &openupval; } - - GCObject* getGclist() noexcept { return gclist; } - void setGclist(GCObject* gc) noexcept { gclist = gc; } - GCObject** getGclistPtr() noexcept { return &gclist; } - - lua_State* getTwups() noexcept { return twups; } - void setTwups(lua_State* tw) noexcept { twups = tw; } - lua_State** getTwupsPtr() noexcept { return &twups; } -``` - -### Update G() macro: -```cpp -constexpr global_State*& G(lua_State* L) noexcept { return L->getGlobalStateRef(); } -``` - -### Update call sites: -- `L->openupval` → `L->getOpenUpval()` (~15 occurrences) -- `L->gclist` → `L->getGclist()` (~8 occurrences) -- `L->twups` → `L->getTwups()` (~6 occurrences) - -### Build and test - ---- - -## Step 4: Status and Error Handling Fields -**Fields**: `status`, `errorJmp`, `errfunc` - -### Add to lstate.h: -```cpp -private: - TStatus status; - lua_longjmp *errorJmp; - ptrdiff_t errfunc; - -public: - TStatus getStatus() const noexcept { return status; } // Already exists - void setStatus(TStatus s) noexcept { status = s; } - - lua_longjmp* getErrorJmp() noexcept { return errorJmp; } - void setErrorJmp(lua_longjmp* ej) noexcept { errorJmp = ej; } - lua_longjmp** getErrorJmpPtr() noexcept { return &errorJmp; } - - ptrdiff_t getErrFunc() const noexcept { return errfunc; } - void setErrFunc(ptrdiff_t ef) noexcept { errfunc = ef; } -``` - -### Update call sites: -- `L->status` → `L->getStatus()` or `L->setStatus()` (~15 occurrences) -- `L->errorJmp` → `L->getErrorJmp()` (~5 occurrences) -- `L->errfunc` → `L->getErrFunc()` or `L->setErrFunc()` (~10 occurrences) - -### Build and test - ---- - -## Step 5: Hook and Debug Fields -**Fields**: `hook`, `hookmask`, `allowhook`, `oldpc`, `basehookcount`, `hookcount`, `transferinfo` - -### Add to lstate.h: -```cpp -private: - volatile lua_Hook hook; - volatile l_signalT hookmask; - lu_byte allowhook; - int oldpc; - int basehookcount; - int hookcount; - struct { - int ftransfer; - int ntransfer; - } transferinfo; - -public: - lua_Hook getHook() const noexcept { return hook; } - void setHook(lua_Hook h) noexcept { hook = h; } - - l_signalT getHookMask() const noexcept { return hookmask; } - void setHookMask(l_signalT hm) noexcept { hookmask = hm; } - - lu_byte getAllowHook() const noexcept { return allowhook; } - void setAllowHook(lu_byte ah) noexcept { allowhook = ah; } - - int getOldPC() const noexcept { return oldpc; } - void setOldPC(int pc) noexcept { oldpc = pc; } - - int getBaseHookCount() const noexcept { return basehookcount; } - void setBaseHookCount(int bhc) noexcept { basehookcount = bhc; } - - int getHookCount() const noexcept { return hookcount; } - void setHookCount(int hc) noexcept { hookcount = hc; } - int& getHookCountRef() noexcept { return hookcount; } // For decrement - - // TransferInfo accessors - return reference to allow field access - auto& getTransferInfo() noexcept { return transferinfo; } - const auto& getTransferInfo() const noexcept { return transferinfo; } -``` - -### Update call sites: -- `L->hook` → `L->getHook()` or `L->setHook()` (~6 occurrences) -- `L->hookmask` → `L->getHookMask()` or `L->setHookMask()` (~10 occurrences) -- `L->allowhook` → `L->getAllowHook()` or `L->setAllowHook()` (~8 occurrences) -- `L->transferinfo.ftransfer` → `L->getTransferInfo().ftransfer` (~2 occurrences) -- etc. - -### Build and test - ---- - -## Step 6: Call Counter Fields -**Fields**: `nCcalls`, `nci` - -### Add to lstate.h: -```cpp -private: - l_uint32 nCcalls; - int nci; - -public: - l_uint32 getNCcalls() const noexcept { return nCcalls; } - void setNCcalls(l_uint32 nc) noexcept { nCcalls = nc; } - l_uint32& getNCcallsRef() noexcept { return nCcalls; } // For increment/decrement - - int getNCI() const noexcept { return nci; } - void setNCI(int n) noexcept { nci = n; } - int& getNCIRef() noexcept { return nci; } // For increment/decrement - - // Non-yieldable call management (better names for incnny/decnny) - void incrementNonYieldable() noexcept { nCcalls += 0x10000; } - void decrementNonYieldable() noexcept { nCcalls -= 0x10000; } -``` - -### Update macros in lstate.h: -```cpp -#define yieldable(L) (((L)->getNCcalls() & 0xffff0000) == 0) -#define getCcalls(L) ((L)->getNCcalls() & 0xffff) - -// Replace with method calls -#define incnny(L) ((L)->incrementNonYieldable()) -#define decnny(L) ((L)->decrementNonYieldable()) -``` - -### Update call sites: -- `L->nCcalls` → `L->getNCcalls()` or `L->getNCcallsRef()` (~12 occurrences) -- `L->nci` → `L->getNCI()` or `L->setNCI()` (~5 occurrences) -- `incnny(L)` / `decnny(L)` already work via macros - -### Build and test - ---- - -## Step 7: Final Cleanup and Validation - -### Remove old macros (convert to inline functions): -```cpp -// Replace macros with inline functions -inline bool yieldable(const lua_State* L) noexcept { - return ((L->getNCcalls() & 0xffff0000) == 0); -} -inline l_uint32 getCcalls(const lua_State* L) noexcept { - return L->getNCcalls() & 0xffff; -} -inline void incnny(lua_State* L) noexcept { - L->incrementNonYieldable(); -} -inline void decnny(lua_State* L) noexcept { - L->decrementNonYieldable(); -} -``` - -### Final build and full test: -```bash -cmake --build build -cd testes && ../build/lua all.lua -``` - -### Performance check: -- Target: ≤2.50s (currently 2.46s) -- Should have zero overhead (all inline accessors) - ---- - -## Summary -**Total changes**: ~375 call sites across 20 files -**Approach**: Incremental field-by-field with compile check after each step -**Expected result**: 100% lua_State encapsulation with zero performance impact - -**Files with most changes** (tackle first): -1. lapi.cpp (~120 uses, mostly `L->top`) -2. lstate.cpp (~47 uses) -3. lvm.cpp (~48 uses) -4. ldo.cpp (~40 uses) -5. ldebug.cpp (~28 uses) diff --git a/docs/ROADMAP_2025_11_21.md b/docs/ROADMAP_2025_11_21.md deleted file mode 100644 index fd57b912..00000000 --- a/docs/ROADMAP_2025_11_21.md +++ /dev/null @@ -1,437 +0,0 @@ -# Project Roadmap & Status Update -**Date**: 2025-11-21 -**Current Branch**: `claude/update-docs-roadmap-01FonNVg47CwKQJaXpR6fmEt` -**Last Completed Phase**: 114 - ---- - -## Current Project Status - -### Performance Metrics -- **Current Benchmark**: 4.62s average (4.30-4.90s range over 3 runs) -- **Target**: ≤4.33s (≤3% from 4.20s baseline) -- **Status**: ⚠️ **SLIGHTLY ABOVE TARGET** (~9% over baseline) -- **Historical Baseline**: 2.17s (different hardware) - -### Code Quality Metrics -- **Coverage**: 96.1% lines, 92.7% functions, 85.2% branches ✅ -- **Build Status**: Zero warnings with `-Werror` ✅ -- **Tests**: All passing ✅ -- **CI/CD**: Active (Phase 101 complete) ✅ - -### Modernization Progress -- **Classes**: 19/19 fully encapsulated (100%) ✅ -- **Macros**: ~500 converted (~37% of convertible) -- **Enum Classes**: All major enums converted ✅ -- **CRTP**: Active across all 9 GC types ✅ -- **Exceptions**: Modern C++ (replaced setjmp/longjmp) ✅ - ---- - -## Recently Completed Phases (112-114) - -### Phase 112: Type Safety & std::span Integration (Multi-part) -**Status**: ✅ COMPLETE -**Commits**: f53cd37, 4533793, e5d33b0, 7ddb44e, 393c6ee - -**Phase 112.0**: std::span Accessors to Proto and ProtoDebugInfo -- Added `getCodeSpan()`, `getConstantsSpan()`, `getProtosSpan()`, `getUpvaluesSpan()` -- Added debug info span accessors (lineinfo, abslineinfo, locvars) -- Zero-cost abstraction: inline constexpr methods -- **Files**: `src/objects/lobject.h` - -**Phase 112.1**: Fix Clang Sign-Conversion Errors -- Fixed sign-conversion warnings in std::span accessors -- Ensured Clang 15+ compatibility -- **Files**: Multiple accessor methods - -**Phase 112 Part 1**: Operator Type Safety -- Converted `FuncState::prefix(int op)` → `prefix(UnOpr op)` -- Converted `FuncState::infix(int op)` → `infix(BinOpr op)` -- Converted `FuncState::posfix(int op)` → `posfix(BinOpr op)` -- **Impact**: Eliminated 6 redundant static_cast operations -- **Files**: `lparser.h`, `lcode.cpp`, `parser.cpp` - -**Phase 112 Part 2**: InstructionView Encapsulation -- Added opcode property methods to InstructionView: - - `getOpMode()`, `testAMode()`, `testTMode()` - - `testITMode()`, `testOTMode()`, `testMMMode()` -- Encapsulated `luaP_opmodes` array access -- **Impact**: Better encapsulation, cleaner code -- **Files**: `lopcodes.h`, `lopcodes.cpp`, `lcode.cpp`, `ldebug.cpp` - -**Phase 112 (Final)**: Loop Variable Type Optimization -- Optimized loop counter types for type safety -- **Performance**: 4.33s avg (exactly at target!) 🎯 - -**Overall Phase 112 Impact**: -- Type safety improved significantly -- std::span integration begun (Proto arrays) -- Operator handling modernized -- Performance: Within target - ---- - -### Phase 113: Boolean Predicates & Loop Modernization (Dual Focus) -**Status**: ✅ COMPLETE -**Commits**: c0b91a2, 56fa457 - -**Phase 113a**: Modernize Loops with C++ Standard Algorithms -- Converted traditional loops to range-based for loops -- Used C++20/23 algorithms where beneficial -- Improved code readability -- **Files**: Multiple (compiler, VM, GC modules) - -**Phase 113b**: Convert Internal Predicates to bool Return Type -- Converted 7 functions from `int` (0/1) to `bool`: - 1. `isKint()` - lcode.cpp - 2. `isCint()` - lcode.cpp - 3. `isSCint()` - lcode.cpp - 4. `isSCnumber()` - lcode.cpp - 5. `validop()` - lcode.cpp - 6. `testobjref1()` - ltests.cpp - 7. `testobjref()` - ltests.cpp -- **Impact**: Clearer intent, prevents arithmetic on booleans -- **Performance**: 4.73s avg (within normal variance) - -**Overall Phase 113 Impact**: -- Modern loop patterns adopted -- Type safety improved (bool vs int) -- Code clarity enhanced - ---- - -### Phase 114: NULL to nullptr Modernization -**Status**: ✅ COMPLETE -**Commit**: aa61f96 - -**Changes**: -- Replaced all C-style `NULL` macros with C++11 `nullptr` -- Improved type safety (nullptr has its own type) -- Modern C++ best practice -- **Files**: Codebase-wide (systematic replacement) - -**Impact**: -- Full C++11+ compliance -- Better type checking -- Prevents implicit conversions -- Zero performance impact - ---- - -## Unfinished/Incomplete Work Assessment - -### ✅ No Unfinished Phases Detected - -All recent phases (112-114) are complete and merged. No abandoned or partial work found. - -### ⚠️ Performance Slightly Above Target - -**Current**: 4.62s avg (target ≤4.33s) -**Variance**: 4.30-4.90s range indicates run-to-run variability -**Action**: Monitor; may need micro-optimizations if consistently above 4.33s - ---- - -## Planned/Documented Work (Not Yet Started) - -### From SPAN_MODERNIZATION_PLAN.md -**Status**: Phase 112 started span work, but plan shows 11 phases -**Completed**: Phase 1-2 (Foundation, Proto code/constants) -**Remaining**: Phases 3-11 (Proto nested protos, upvalues, debug info, Table, buffers, function params, strings, testing) - -**Assessment**: Partial implementation. Proto has span accessors, but: -- Table array span accessors not yet added -- Buffer (Mbuffer, Zio) span accessors not added -- Function parameter conversions not done -- TString span accessors not added - -**Recommendation**: Continue span adoption in Phase 115+ - ---- - -### From TYPE_MODERNIZATION_ANALYSIS.md -**Status**: Analysis complete, some work done - -**Completed**: -- ✅ Operator type safety (Phase 112) -- ✅ 7 boolean conversions (Phase 113) - -**Remaining High-Value Work**: -1. **8 Boolean Return Conversions** (2 hours, LOW risk) - - `iscleared()` - gc/gc_weak.cpp - - `hashkeyisempty()` - ltable.cpp - - `finishnodeset()`, `rawfinishnodeset()` - ltable.cpp - - `check_capture()` - lstrlib.cpp - - `isneg()` - lobject.cpp - - `checkbuffer()` - lzio.cpp - - `test2()` - liolib.cpp - -**Not Recommended** (per analysis): -- ❌ Loop counter conversions (int → size_t) - 400 instances, high effort, low value -- ❌ Size variable conversions - HIGH underflow risk -- ❌ Register index strong types - Very invasive -- ❌ Status code enum class - C API constraint -- ❌ Token type modernization - Current design works well - ---- - -### From LOOP_OPTIMIZATION_ANALYSIS.md -**Status**: Analysis complete, minimal action needed - -**Findings**: -- Overall loop quality: EXCELLENT ✅ -- 88% of loops are type-safe -- Only 1 hot-path micro-optimization identified - -**Recommended Actions** (Tier 1-2): -1. `lvm.cpp:808` - OP_LOADNIL loop optimization (micro-opt) -2. Explicit casts for clarity (type safety) -3. Document intentional patterns - -**Verdict**: Very low priority, already well-optimized - ---- - -### From NEXT_TASKS_RECOMMENDATIONS.md -**Status**: Tasks 1-3 partially complete - -**Completed**: -- ✅ Task #1: CI/CD Infrastructure (Phase 101) -- ✅ Task #2: Test Coverage Metrics (96.1% achieved) -- ⚠️ Task #3: Macro Conversions (500/~575 done = ~87% of target) - -**High Priority Remaining**: -- **GC Modularization Documentation** - Phase 101+ work undocumented -- **Static Analysis Integration** - clang-tidy, cppcheck, iwyu -- **Complete Remaining Macro Conversions** (~75 macros) - -**Medium Priority**: -- Performance profiling session -- Memory layout optimization -- Documentation enhancements - ---- - -## Recommendations for Next Phases - -### 🎯 Immediate Priorities (Phase 115-117) - -#### Phase 115: Complete Boolean Return Type Conversions ⭐⭐⭐ -**Effort**: 2 hours -**Risk**: LOW -**Value**: HIGH (completes modernization milestone) - -Convert remaining 8 functions: -```cpp -// gc/gc_weak.cpp -static bool iscleared(global_State* g, const GCObject* o); - -// ltable.cpp -static bool hashkeyisempty(Table* t, lua_Unsigned i); -static bool finishnodeset(Table* t, const TValue* key); -static bool rawfinishnodeset(Table* t, const TValue* key); - -// lstrlib.cpp -static bool check_capture(MatchState* ms, int l); - -// lobject.cpp -static bool isneg(const TValue* o); - -// lzio.cpp -static bool checkbuffer(Zio* z); - -// liolib.cpp -static bool test2(const char* s1, const char* s2); -``` - -**Success Criteria**: -- All internal predicates return bool -- Zero performance regression -- 15/15 boolean conversions complete - ---- - -#### Phase 116: Document GC Modularization Achievement ⭐⭐ -**Effort**: 2-3 hours -**Risk**: NONE (documentation only) -**Value**: HIGH (captures major work) - -**Deliverable**: `docs/GC_MODULARIZATION_SUMMARY.md` - -**Content**: -- Overview of extraction work (Phases 101+) -- Before/after metrics: - - lgc.cpp: 1,950 → 936 lines (52% reduction) - - 6 modules extracted: gc_core, gc_marking, gc_collector, gc_sweeping, gc_finalizer, gc_weak -- Architecture improvements -- Performance impact -- Benefits for maintainability - -**Why Important**: -- Similar to REFACTORING_SUMMARY.md (documents SRP work) -- Helps future contributors understand GC structure -- Completes the GC simplification story - ---- - -#### Phase 117: Continue std::span Adoption ⭐⭐ -**Effort**: 6-8 hours -**Risk**: LOW-MEDIUM -**Value**: MEDIUM-HIGH - -Follow SPAN_MODERNIZATION_PLAN.md Phases 3-11: - -**Phase 117a**: Table Array Span Accessors (3-4 hours) -- Add `getArraySpan()` to Table class -- Convert iteration over table arrays -- **CAREFUL**: Hot path, benchmark thoroughly - -**Phase 117b**: Buffer Span Accessors (2-3 hours) -- Add span accessors to Mbuffer and Zio -- Modernize buffer operations in lexer - -**Phase 117c**: TString Span Accessor (1 hour) -- Add `getStringSpan()` for const char* + length -- Consider `std::string_view` alternative - -**Success Criteria**: -- Zero performance regression (≤4.33s) -- Improved type safety -- Cleaner APIs - ---- - -### 🔍 Secondary Priorities (Phase 118-120) - -#### Phase 118: Static Analysis Integration ⭐⭐ -**Effort**: 3-4 hours -**Value**: MEDIUM-HIGH - -**Tools**: -- clang-tidy - Modern C++ best practices -- cppcheck - Additional static analysis -- include-what-you-use - Header optimization - -**Deliverables**: -- `.clang-tidy` configuration -- CI integration for automated checks -- Fix identified issues incrementally - ---- - -#### Phase 119: Complete Remaining Macro Conversions ⭐ -**Effort**: 8-10 hours total -**Value**: MEDIUM - -**Batches**: -1. lopcodes.h - Instruction manipulation (25 macros, 2-3 hours) -2. llimits.h - Utility macros (15 macros, 1-2 hours) -3. lctype.h - Character checks (10 macros, 1 hour) -4. Miscellaneous (15 macros, 2 hours) - -**Success Criteria**: -- All convertible macros → inline functions -- Zero performance regression -- Documented in CLAUDE.md - ---- - -#### Phase 120: Address Performance Regression ⭐⭐ -**Effort**: 4-6 hours (investigation + fixes) -**Value**: HIGH (back to target) - -**Current Issue**: 4.62s avg vs 4.33s target (7% over) - -**Approach**: -1. Profile with perf/cachegrind -2. Identify hot spots introduced in recent phases -3. Micro-optimize critical paths -4. Verify ≤4.33s achieved - -**Likely Causes**: -- std::span overhead in hot paths (unlikely but check) -- InstructionView method calls (should be inline) -- Loop modernization patterns - ---- - -### 📊 Long-Term Work (Phase 121+) - -#### Performance Optimization -- Full profiling session with perf/cachegrind -- Memory layout optimization (struct padding, cache lines) -- Link-time optimization (LTO) tuning -- Profile-guided optimization (PGO) - -#### Documentation & Polish -- Architecture diagrams (Mermaid) -- CONTRIBUTING.md guide -- Code cleanup sweep (`[[nodiscard]]`, `[[maybe_unused]]`) -- Doxygen documentation - -#### Advanced Modernization -- C++23 modules (when compiler support matures) -- Ranges library integration -- Coroutine integration for Lua coroutines -- std::expected for error handling - ---- - -## Completed Plan Documents (Mark as Historical) - -These plan documents are now complete and should be marked as historical reference: - -1. ✅ **ENCAPSULATION_PLAN.md** - Phases 37-42 complete -2. ✅ **CONSTRUCTOR_PLAN.md** - Phases 1-2 complete -3. ✅ **CONSTRUCTOR_REFACTOR_PLAN.md** - Constructor work complete -4. ✅ **LUASTACK_AGGRESSIVE_PLAN.md** - Phase 94 complete -5. ✅ **LUASTACK_ASSIGNMENT_PLAN.md** - Stack operations complete -6. ✅ **PHASE_36_2_PLAN.md** - Historical -7. ✅ **AGGRESSIVE_MACRO_ELIMINATION_PLAN.md** - Ongoing (37% done) -8. ⚠️ **SPAN_MODERNIZATION_PLAN.md** - Partially complete (Phases 1-2 done, 3-11 remain) - -**Action**: Add "✅ HISTORICAL - Completed [date]" header to completed plans - ---- - -## Summary: Next Steps - -### Immediate (This Week) -1. **Phase 115**: Complete 8 boolean return conversions (2 hours) -2. **Phase 116**: Document GC modularization (2-3 hours) -3. **Update CLAUDE.md**: Add Phases 112-114 documentation (1 hour) - -### Short-Term (Next 2 Weeks) -4. **Phase 117**: Continue std::span adoption (6-8 hours) -5. **Phase 118**: Static analysis integration (3-4 hours) -6. **Phase 120**: Address performance regression (4-6 hours) - -### Medium-Term (Next Month) -7. **Phase 119**: Complete macro conversions (8-10 hours) -8. Performance profiling and optimization -9. Documentation enhancements - ---- - -## Success Metrics - -### Phase 115-117 Goals -- ✅ 15/15 boolean conversions complete -- ✅ GC work documented -- ✅ std::span adoption progressed (Table, buffers, strings) -- ✅ Performance ≤4.33s restored -- ✅ CLAUDE.md updated with Phases 112-114 - -### Overall Project Health -- **Encapsulation**: 100% complete ✅ -- **Type Safety**: 90%+ (improving) -- **Performance**: Within 3% of baseline (target) -- **Code Quality**: 96%+ coverage, zero warnings ✅ -- **CI/CD**: Active and comprehensive ✅ - ---- - -**Last Updated**: 2025-11-21 -**Document Version**: 1.0 -**Status**: Active Roadmap diff --git a/docs/SPAN_MODERNIZATION_PLAN.md b/docs/SPAN_MODERNIZATION_PLAN.md index f470e0ab..961a7e72 100644 --- a/docs/SPAN_MODERNIZATION_PLAN.md +++ b/docs/SPAN_MODERNIZATION_PLAN.md @@ -1,9 +1,13 @@ # std::span Modernization Plan -**Date**: 2025-11-20 -**Status**: Planning Phase +**Date**: 2025-11-20 (Updated: 2025-11-21) +**Status**: ⚠️ **Partially Implemented** - Phases 115-116 completed, performance concerns identified **Target**: Zero-cost type safety improvements using C++20 std::span +> **Update (2025-11-21)**: Phases 115-116 completed std::span integration for Proto and Dyndata accessors. +> Performance regression identified (4.70s avg in Phase 115 Part 3), requiring optimization work. +> See CLAUDE.md for current status and Phase 116 completion details. + --- ## Executive Summary diff --git a/docs/STATUS_UPDATE_2025_11_21.md b/docs/STATUS_UPDATE_2025_11_21.md deleted file mode 100644 index bcd063c0..00000000 --- a/docs/STATUS_UPDATE_2025_11_21.md +++ /dev/null @@ -1,412 +0,0 @@ -# Project Status Update - November 21, 2025 - -**Generated**: 2025-11-21 -**Branch**: `claude/update-docs-roadmap-01FonNVg47CwKQJaXpR6fmEt` -**Purpose**: Comprehensive status assessment after Phases 112-114 - ---- - -## Executive Summary - -The Lua C++ Modernization Project has successfully completed **Phases 112-114**, achieving significant type safety improvements through std::span integration, operator type safety, InstructionView encapsulation, boolean predicate conversions, and nullptr modernization. - -### Overall Health: ⭐⭐⭐⭐ (EXCELLENT with minor performance concern) - -**Strengths**: -- ✅ 19/19 classes fully encapsulated (100%) -- ✅ Type safety significantly improved -- ✅ 96.1% code coverage -- ✅ CI/CD fully operational -- ✅ Zero build warnings -- ✅ All tests passing - -**Areas for Attention**: -- ⚠️ Performance slightly above target (4.62s avg vs 4.33s target) -- ⚠️ ~75 macros remain to be converted -- ⚠️ std::span adoption incomplete (only Proto/ProtoDebugInfo done) - ---- - -## Recent Completions (Phases 112-114) - -### Phase 112: Type Safety & std::span Integration ✅ - -**Multi-part phase addressing type safety across the codebase** - -#### Part 0: std::span Accessors -- Added span accessors to Proto: `getCodeSpan()`, `getConstantsSpan()`, `getProtosSpan()`, `getUpvaluesSpan()` -- Added span accessors to ProtoDebugInfo: lineinfo, abslineinfo, locvars -- Zero-cost abstraction: inline constexpr methods -- **Files**: `src/objects/lobject.h` - -#### Part 0.1: Clang Compatibility -- Fixed sign-conversion warnings in span accessors -- Ensured Clang 15+ compatibility -- Multi-compiler support maintained - -#### Part 1: Operator Type Safety -- Modernized `FuncState::prefix()`, `infix()`, `posfix()` to accept enum classes directly -- Changed from `int op` to `UnOpr op` / `BinOpr op` -- **Impact**: Eliminated 6 redundant static_cast operations -- **Benefit**: Compiler enforces valid operator values -- **Files**: `lparser.h`, `lcode.cpp`, `parser.cpp` - -#### Part 2: InstructionView Encapsulation -- Added property methods to InstructionView class: - - `getOpMode()` - Get instruction format mode - - `testAMode()`, `testTMode()`, `testITMode()`, `testOTMode()`, `testMMMode()` -- Encapsulated direct `luaP_opmodes` array access -- **Impact**: Better encapsulation, cleaner code -- **Files**: `lopcodes.h`, `lopcodes.cpp`, `lcode.cpp`, `ldebug.cpp` - -#### Phase 112 Performance -- **Result**: 4.33s avg - exactly at target! 🎯 -- Zero-cost abstractions validated - ---- - -### Phase 113: Boolean Predicates & Loop Modernization ✅ - -**Dual-focus phase: type safety + modern patterns** - -#### Part A: Loop Modernization -- Converted traditional loops to range-based for loops where appropriate -- Used C++20/23 algorithms for cleaner code -- **Impact**: Improved readability, modern patterns -- **Files**: Multiple (compiler, VM, GC modules) - -#### Part B: Boolean Return Types -- Converted 7 internal predicates from `int` (0/1) to `bool`: - 1. `isKint()` - Check if expression is literal integer (lcode.cpp) - 2. `isCint()` - Check if integer fits in register C (lcode.cpp) - 3. `isSCint()` - Check if integer fits in register sC (lcode.cpp) - 4. `isSCnumber()` - Check if number fits with output params (lcode.cpp) - 5. `validop()` - Validate constant folding operation (lcode.cpp) - 6. `testobjref1()` - Test GC object reference invariants (ltests.cpp) - 7. `testobjref()` - Wrapper that prints failed invariants (ltests.cpp) -- **Impact**: Clearer intent, prevents accidental arithmetic on booleans -- All return statements updated: `0 → false`, `1 → true` - -#### Phase 113 Performance -- **Result**: 4.73s avg - within normal variance -- No performance degradation from modernization - ---- - -### Phase 114: NULL to nullptr Modernization ✅ - -**Codebase-wide modernization to C++11 nullptr** - -#### Changes -- Systematic replacement of C-style `NULL` with `nullptr` -- **Scope**: All source files (src/, include/) -- **Impact**: - - Full C++11+ compliance - - Better type checking (nullptr has its own type) - - Prevents implicit conversions - - Modern C++ best practice - -#### Phase 114 Performance -- **Result**: Zero performance impact -- As expected for compile-time change - ---- - -## Current Metrics - -### Performance - -| Metric | Value | Target | Status | -|--------|-------|--------|--------| -| Average (3 runs) | 4.62s | ≤4.33s | ⚠️ 7% over | -| Range | 4.30-4.90s | - | High variance | -| Baseline | 4.20s | - | +10% | -| Historical (old HW) | 2.17s | - | Different machine | - -**Analysis**: Performance is above target. High variance (4.30-4.90s) suggests: -- System load variation -- Cache effects -- Possible micro-regression from recent changes - -**Recommendation**: Phase 120 should focus on: -1. Profiling to identify hot spots -2. Micro-optimizations in recent changes -3. Restore ≤4.33s performance - -### Code Quality - -| Metric | Value | Status | -|--------|-------|--------| -| Line Coverage | 96.1% | ✅ Excellent | -| Function Coverage | 92.7% | ✅ Excellent | -| Branch Coverage | 85.2% | ✅ Good | -| Build Warnings | 0 | ✅ Perfect | -| Test Suite | All passing | ✅ Perfect | -| CI/CD Status | Active | ✅ Operational | - -### Modernization Progress - -| Category | Progress | Status | -|----------|----------|--------| -| Class Encapsulation | 19/19 (100%) | ✅ Complete | -| Macro Conversion | ~500/~575 (~87%) | ⚠️ Ongoing | -| Enum Classes | All major (100%) | ✅ Complete | -| Cast Modernization | 100% | ✅ Complete | -| nullptr Adoption | 100% | ✅ Complete | -| std::span Adoption | ~20% | ⚠️ Incomplete | -| CRTP Implementation | 9/9 GC types | ✅ Complete | -| Exception Handling | 100% | ✅ Complete | - ---- - -## Documentation Updates - -### Files Modified - -1. **CLAUDE.md** ✅ - - Added Phases 112-114 documentation - - Updated "Recent Achievements" section - - Updated "Last Updated" footer - - Current phase set to 115+ - -2. **Plan Documents Marked Historical** ✅ - - `ENCAPSULATION_PLAN.md` - Marked complete - - `CONSTRUCTOR_PLAN.md` - Marked complete - - `LUASTACK_AGGRESSIVE_PLAN.md` - Marked complete - - `LUASTACK_ASSIGNMENT_PLAN.md` - Marked complete - - `AGGRESSIVE_MACRO_ELIMINATION_PLAN.md` - Marked ongoing - -3. **New Documentation Created** ✅ - - `ROADMAP_2025_11_21.md` - Comprehensive roadmap - - `STATUS_UPDATE_2025_11_21.md` - This document - ---- - -## Unfinished Work Assessment - -### ✅ No Abandoned Phases - -All recent phases (112-114) were successfully completed and merged. No partial or abandoned work detected. - -### ⚠️ Incomplete Initiatives - -#### 1. std::span Adoption (20% complete) -**Completed**: -- ✅ Proto code, constants, protos, upvalues arrays -- ✅ ProtoDebugInfo arrays (lineinfo, abslineinfo, locvars) - -**Remaining** (from SPAN_MODERNIZATION_PLAN.md): -- ❌ Table array span accessors -- ❌ Buffer (Mbuffer, Zio) span accessors -- ❌ Function parameter conversions (ptr+size → span) -- ❌ TString span accessors -- ❌ Comprehensive testing phase - -**Recommendation**: Continue in Phase 117 - ---- - -#### 2. Macro Conversion (~87% complete) -**Status**: ~500 converted, ~75 remain - -**Remaining Batches**: -- lopcodes.h - Instruction manipulation macros (~25 macros) -- llimits.h - Utility macros (~15 macros) -- lctype.h - Character type checks (~10 macros) -- Miscellaneous (~25 macros) - -**Recommendation**: Complete in Phase 119 - ---- - -#### 3. Boolean Return Type Conversions (47% complete) -**Status**: 7/15 done - -**Remaining Functions** (from TYPE_MODERNIZATION_ANALYSIS.md): -1. `iscleared()` - gc/gc_weak.cpp -2. `hashkeyisempty()` - ltable.cpp -3. `finishnodeset()` - ltable.cpp -4. `rawfinishnodeset()` - ltable.cpp -5. `check_capture()` - lstrlib.cpp -6. `isneg()` - lobject.cpp -7. `checkbuffer()` - lzio.cpp -8. `test2()` - liolib.cpp - -**Effort**: 2 hours -**Risk**: LOW -**Recommendation**: Complete in Phase 115 (highest priority) - ---- - -## Next Steps (Prioritized) - -### Immediate Priorities (This Week) - -#### 1. Phase 115: Complete Boolean Return Conversions ⭐⭐⭐ -**Effort**: 2 hours -**Risk**: LOW -**Value**: HIGH - -Convert remaining 8 predicates to bool return type. Completes boolean modernization milestone (15/15). - -**Success Criteria**: -- All internal predicates return bool -- Zero performance regression -- Tests passing - ---- - -#### 2. Phase 116: Document GC Modularization ⭐⭐ -**Effort**: 2-3 hours -**Risk**: NONE (documentation only) -**Value**: HIGH - -Create `GC_MODULARIZATION_SUMMARY.md` documenting the extraction of 6 GC modules. - -**Content**: -- Overview of Phase 101+ work -- Before/after metrics (lgc.cpp: 1,950 → 936 lines) -- Module descriptions -- Architecture improvements - ---- - -#### 3. Update Documentation ⭐ -**Effort**: 1 hour -**Risk**: NONE -**Value**: MEDIUM - -Ensure all documentation is current: -- ✅ CLAUDE.md updated -- ✅ Plan documents marked historical -- ✅ Roadmap created -- Pending: Performance baseline update if regression addressed - ---- - -### Short-Term (Next 2 Weeks) - -#### 4. Phase 117: Continue std::span Adoption ⭐⭐ -**Effort**: 6-8 hours -**Risk**: MEDIUM (Table hot path) -**Value**: MEDIUM-HIGH - -**Sub-phases**: -- 117a: Table array span accessors (3-4 hours) -- 117b: Buffer span accessors (2-3 hours) -- 117c: TString span accessor (1 hour) - -**Critical**: Benchmark thoroughly for Table changes (hot path). - ---- - -#### 5. Phase 118: Static Analysis Integration ⭐⭐ -**Effort**: 3-4 hours -**Risk**: LOW -**Value**: MEDIUM-HIGH - -Integrate static analysis tools: -- clang-tidy configuration -- cppcheck in CI -- include-what-you-use - -**Benefit**: Catch issues early, enforce modern C++ patterns. - ---- - -#### 6. Phase 120: Address Performance Regression ⭐⭐⭐ -**Effort**: 4-6 hours -**Risk**: LOW -**Value**: HIGH - -**Current**: 4.62s avg (target ≤4.33s) - -**Approach**: -1. Profile with perf/cachegrind -2. Identify hot spots from Phases 112-114 -3. Micro-optimize critical paths -4. Verify ≤4.33s achieved - ---- - -### Medium-Term (Next Month) - -#### 7. Phase 119: Complete Macro Conversions ⭐ -**Effort**: 8-10 hours -**Value**: MEDIUM - -Complete remaining ~75 macros in lopcodes.h, llimits.h, lctype.h. - ---- - -#### 8. Performance Profiling Session ⭐⭐ -**Effort**: 2-3 hours -**Value**: HIGH - -Deep profiling to understand performance characteristics: -- Top hot functions -- Cache miss patterns -- Branch mispredictions -- Optimization opportunities - -**Deliverable**: `PERFORMANCE_PROFILE_2025.md` - ---- - -## Action Items - -### For User -1. ✅ Review ROADMAP_2025_11_21.md for detailed next steps -2. ✅ Review this status update -3. ⚠️ Decide on performance optimization priority (Phase 120) -4. ✅ Approve proceeding with Phase 115 (boolean conversions) - -### For Project -1. ✅ CLAUDE.md updated with Phases 112-114 -2. ✅ Plan documents marked historical -3. ✅ Comprehensive roadmap created -4. Pending: Begin Phase 115 (boolean conversions) - ---- - -## Risk Assessment - -### Low Risk -- ✅ Boolean return conversions (Phase 115) -- ✅ Documentation tasks (Phase 116) -- ✅ Static analysis integration (Phase 118) -- ✅ Macro conversions (Phase 119) - -### Medium Risk -- ⚠️ Table std::span accessors (hot path) -- ⚠️ Performance optimization (requires careful profiling) - -### High Risk -- None identified - ---- - -## Summary - -The project is in **excellent shape** after completing Phases 112-114. Type safety has significantly improved through std::span integration, operator type safety, and nullptr modernization. The only concern is performance being slightly above target (4.62s vs 4.33s). - -### Key Achievements -- ✅ std::span adoption begun (Proto arrays) -- ✅ Operator type safety (enum classes directly) -- ✅ InstructionView encapsulation -- ✅ 7 boolean conversions -- ✅ nullptr modernization (100%) -- ✅ Documentation updated - -### Immediate Focus -1. **Phase 115**: Complete 8 boolean conversions (2 hours, LOW risk) -2. **Phase 116**: Document GC modularization (2-3 hours, NONE risk) -3. **Phase 120**: Restore performance to ≤4.33s (4-6 hours, LOW risk) - -**Overall Assessment**: Project is 99% modernized with strong foundations. Performance optimization and completing remaining std::span adoption are the main remaining work items. - ---- - -**Document Version**: 1.0 -**Next Review**: After Phase 115-116 completion -**Contact**: See CLAUDE.md for AI assistant guidelines diff --git a/docs/STRICT_ALIASING_AUDIT.md b/docs/STRICT_ALIASING_AUDIT.md deleted file mode 100644 index 1dc6f37a..00000000 --- a/docs/STRICT_ALIASING_AUDIT.md +++ /dev/null @@ -1,782 +0,0 @@ -# Lua C++ Strict Aliasing and Type Punning Audit Report - -**Date**: 2025-11-21 -**Codebase**: /home/user/lua_cpp -**Scope**: Very thorough examination of type punning and strict aliasing violations -**Files Analyzed**: 84 source files (headers + implementations) - ---- - -## EXECUTIVE SUMMARY - -This codebase has **multiple categories of potential strict aliasing violations** stemming from C's original design. While the Lua C implementation had these issues by design, the C++ conversion requires careful analysis because: - -1. **Union type punning with different active members** - The Value union -2. **Pointer arithmetic through char* followed by reinterpret_cast** -3. **Memory layout assumptions** for variable-size objects -4. **Overlay patterns** (TString contents, Table array hint) -5. **Template-based type conversions** in GCObject conversions - -Most violations are **technically allowed by C++17 standard** due to careful allocation and initialization patterns, but they rely on implementation details and are fragile to compiler optimizations. - ---- - -## SEVERITY CLASSIFICATION - -### Critical (MUST FIX) -- Issues that definitely violate C++ standard or cause undefined behavior - -### High (SHOULD FIX) -- Likely to cause compiler optimization issues or undefined behavior under different conditions - -### Medium (SHOULD REVIEW) -- Questionable patterns that work but rely on implementation details - -### Low (INFORMATIONAL) -- Patterns that are generally safe but deserve documentation - ---- - -## FINDINGS - -### 1. TValue Union Type Punning (MEDIUM SEVERITY) - -**Location**: `src/objects/ltvalue.h:41-49`, `src/objects/lobject.h:1378-1443` - -**Pattern**: -```cpp -typedef union Value { - GCObject *gc; /* collectable objects */ - void *p; /* light userdata */ - lua_CFunction f; /* light C functions */ - lua_Integer i; /* integer numbers */ - lua_Number n; /* float numbers */ - lu_byte ub; /* guard for uninitialized */ -} Value; - -// TValue stores one Value + one lu_byte type tag -class TValue { - Value value_; - lu_byte tt_; -}; - -// Field assignments like: -inline void TValue::setInt(lua_Integer i) noexcept { - value_.i = i; - tt_ = LUA_VNUMINT; -} - -inline void TValue::setFloat(lua_Number n) noexcept { - value_.n = n; - tt_ = LUA_VNUMFLT; -} -``` - -**Strict Aliasing Issue**: -- Accessing `value_.i` and `value_.n` as different union members violates strict aliasing if the compiler assumes they don't alias -- **However**: This is safe per C++17 §8.3 [class.mem] because union members can be read/written as long as only one is active -- The `tt_` field indicates which union member is active, acting as a discriminator - -**Safeguard**: -- Type tag (`tt_`) acts as discriminator -- Code always checks tag before accessing specific union member -- Macro/method guards like `ttisinteger(o)` check tag before reading - -**Risk Assessment**: -- **ACCEPTABLE** with current compiler flags (-O3) -- **RISKY** with aggressive whole-program optimization (LTO) -- Could break if compiler doesn't respect union semantics - -**Recommendation**: -- Add runtime assertions in debug builds to verify tag matches accessed field -- Consider using `std::variant` in C++17 (would provide better type safety) -- Document the invariant: "Never access union field unless tag matches" - ---- - -### 2. reinterpret_cast GCObject Conversions (MEDIUM SEVERITY) - -**Location**: Multiple files - -**Pattern 1 - GCBase conversions**: -```cpp -// src/objects/lobject.h:320-326 -template -GCObject* toGCObject() noexcept { - return reinterpret_cast(static_cast(this)); -} -``` - -**Analysis**: -- `static_cast(this)` is safe (just cast within inheritance hierarchy) -- `reinterpret_cast(...)` then converts to base class -- **Safe because**: Derived inherits from GCBase which inherits from GCObject -- Memory layout is identical to GCObject at start (no offset needed) -- Static assert in each derived class verifies memory layout: - ```cpp - static_assert(sizeof(GCObject) == offsetof(Table, flags)); - ``` - -**Pattern 2 - Generic GC type conversions**: -```cpp -// src/memory/lgc.h:167-173 -template -inline bool iswhite(const T* x) noexcept { - return reinterpret_cast(x)->isWhite(); -} -``` - -**Analysis**: -- Assumes `T*` points to object with GCObject at start -- No type checking - relies on caller ensuring correct type -- **Risk**: If passed wrong type, could read garbage as GC fields - -**Pattern 3 - Char pointer arithmetic + cast**: -```cpp -// src/memory/lgc.cpp:224-225 -char *p = cast_charp(luaM_newobject(L, novariant(tt), sz)); -GCObject *o = reinterpret_cast(p + offset); -``` - -**Analysis**: -- Allocates raw bytes with `luaM_newobject` -- Adds offset to get to actual object start -- Casts to GCObject* -- **Safe because**: Used in controlled allocation path -- **Relies on**: Caller providing correct offset - -**Risk Assessment**: **MEDIUM** -- Static layout guarantees make these safe in practice -- But no runtime type checking -- Compiler might reorder checks if not careful with const semantics - -**Recommendation**: -- Keep as-is for performance -- Add comprehensive tests for GC type conversions -- Document which types have compatible memory layouts -- Consider static_assert for each GC type: `static_assert(alignof(Type) == alignof(GCObject))` - ---- - -### 3. Stack Pointer Arithmetic and restore() (HIGH SEVERITY) - -**Location**: `src/core/lstack.h:118-125` - -**Pattern**: -```cpp -/* Convert stack pointer to offset from base */ -inline ptrdiff_t save(StkId pt) const noexcept { - return cast_charp(pt) - cast_charp(stack.p); -} - -/* Convert offset to stack pointer */ -inline StkId restore(ptrdiff_t n) const noexcept { - return reinterpret_cast(cast_charp(stack.p) + n); -} -``` - -**Where StkId is**: -```cpp -// src/objects/lobject.h:63 -typedef StackValue *StkId; - -// src/objects/lobject.h:52-59 -typedef union StackValue { - TValue val; - struct { - Value value_; - lu_byte tt_; - unsigned short delta; - } tbclist; -} StackValue; -``` - -**Strict Aliasing Issue**: -1. **save()**: - - Casts `StackValue*` to `char*` - - This is allowed (any pointer can be cast to char*) - - Subtraction of char pointers is well-defined - -2. **restore()**: - - Does: `cast_charp(stack.p) + n` → pointer arithmetic on char* - - Then: `reinterpret_cast(...)` → casts back to StackValue* - - **This is a round-trip conversion** - -**Analysis of Round-trip Conversion**: -``` -StackValue* → char* → (char* + offset) → StackValue* -``` - -- **Technically allowed** by C++17 when the result pointer points to the same object -- **However**: Compiler might not realize this -- **Optimization risk**: If compiler doesn't track this conversion, it might assume the StackValue* from restore() has no alias relationship with original - -**Problem Case**: -```cpp -StkId original = stack.p + 5; -ptrdiff_t offset = save(original); -// ... realloc happens ... -StkId restored = restore(offset); - -// If compiler doesn't realize restored == new_stack.p + 5, -// it might use cached assumptions about *original -``` - -**Risk Assessment**: **HIGH** -- Works fine on current compiler (no aggressive whole-program optimization) -- Could break with LTO enabled -- Depends on compiler recognizing pointer round-trip conversion - -**Recommendation** (IMPORTANT): -- **Replace with direct offset storage**: Don't convert to char* at all - ```cpp - // Better approach: - inline ptrdiff_t save(StkId pt) const noexcept { - return pt - stack.p; // Direct pointer arithmetic - } - - inline StkId restore(ptrdiff_t n) const noexcept { - return stack.p + n; - } - ``` -- **Why this is better**: - - No char* intermediate → compiler understands aliasing better - - Same performance (pointer arithmetic is cheap) - - Clearer intent - - Safer with LTO - ---- - -### 4. NodeArray Memory Layout Manipulation (HIGH SEVERITY) - -**Location**: `src/objects/ltable.cpp:81-136` - -**Pattern**: -```cpp -typedef union { - Node *lastfree; - char padding[offsetof(Limbox_aux, follows_pNode)]; -} Limbox; - -class NodeArray { -public: - static Node* allocate(lua_State* L, unsigned int n, bool withLastfree) { - if (withLastfree) { - size_t total = sizeof(Limbox) + n * sizeof(Node); - char* block = luaM_newblock(L, total); - Limbox* limbox = reinterpret_cast(block); - Node* nodeStart = reinterpret_cast(block + sizeof(Limbox)); - limbox->lastfree = nodeStart + n; - return nodeStart; - } - } - - static Node*& getLastFree(Node* nodeStart) { - Limbox* limbox = reinterpret_cast(nodeStart) - 1; - return limbox->lastfree; - } -}; -``` - -**Strict Aliasing Issues**: - -1. **Allocation phase**: - - Allocates `char* block` of size `sizeof(Limbox) + n * sizeof(Node)` - - `reinterpret_cast(block)` - cast char* to Limbox* - - `reinterpret_cast(block + sizeof(Limbox))` - cast char* to Node* - - **Is this safe?** - - C++17 §8.2.10: "Reinterpret_cast from one pointer type to another is valid if the memory contains an object of that type" - - **But here**: memory contains `Limbox` followed by `Node[]` - - When we cast the start to `Limbox*`, we're OK - - When we cast `block + sizeof(Limbox)` to `Node*`, we're accessing the Node[] part - - **SAFE**: Provided pointer arithmetic and casting are done on the correct underlying objects - -2. **Access phase (getLastFree)**: - - Takes `Node* nodeStart` - - Casts to `Limbox*` and subtracts 1 - - `reinterpret_cast(nodeStart) - 1` - - **This is pointer arithmetic on Limbox array**: Treats memory as if it's a Limbox array - - **Is this safe?** - - The -1 points back to the Limbox header before the nodes - - Conceptually valid: `[Limbox][Node...] ← nodeStart` - - **However**: Pointer arithmetic assumes same type, but we're casting a `Node*` to `Limbox*` then doing arithmetic - - This violates the assumption that a Node* points to nodes, not Limbox - -**Problem**: -```cpp -// The nodeStart pointer semantically points to a Node -// But we're treating its predecessor as a Limbox -Limbox* limbox = reinterpret_cast(nodeStart) - 1; -``` - -- Compiler might assume `nodeStart` never aliases a Limbox -- If compiler does bounds analysis, it might think going back 1 element is UB - -**Risk Assessment**: **HIGH** -- Comments acknowledge this is "pointer arithmetic on Limbox array for arithmetic purposes" -- Implementation is clever but fragile -- Could break with stricter aliasing analysis - -**Code Comments Acknowledge This**: -```cpp -// Safe per C++17 §8.7: pointer arithmetic within allocated block -// nodeStart points to element after Limbox, so (nodeStart - 1) conceptually -// points to the Limbox (treating the block as Limbox array for arithmetic purposes) -``` - -**Recommendation**: -- **Better approach**: Store Limbox pointer explicitly - ```cpp - struct NodeHeader { - Limbox limbox; - Node nodes[1]; // Flexible array member in C++ - }; - - static Node* allocate(...) { - if (withLastfree) { - NodeHeader* header = new (luaM_newblock(...)) NodeHeader; - header->limbox.lastfree = header->nodes + n; - return header->nodes; - } - } - - static Node*& getLastFree(Node* nodeStart) { - // nodeStart points to nodes[0], so -1 of the containing NodeHeader - NodeHeader* header = containing_record(nodeStart, NodeHeader, nodes); - return header->limbox.lastfree; - } - ``` - -- **Why this is better**: - - Explicit pointer relationship - - No pointer arithmetic tricks - - Clear memory layout - - Compiler understands the structure - ---- - -### 5. TString Short String Contents Overlay (MEDIUM SEVERITY) - -**Location**: `src/objects/lobject.h:445-456, 496-497`, `src/objects/lstring.cpp:229` - -**Pattern**: -```cpp -class TString : public GCBase { -private: - lu_byte extra; - ls_byte shrlen; - unsigned int hash; - union { - size_t lnglen; - TString *hnext; - } u; - char *contents; /* <- For LONG strings only */ - lua_Alloc falloc; /* <- For EXTERNAL strings only */ - void *ud; /* <- For EXTERNAL strings only */ - -public: - TString() noexcept { - // NOTE: contents, falloc, ud are NOT initialized! - // For short strings, they're overlay for string data - } - - // For short strings, the string data starts AFTER the u union - char* getContentsAddr() noexcept { - return cast_charp(this) + contentsOffset(); - } -}; -``` - -**Usage**: -```cpp -// src/lstring.cpp:229 -ts->setContents(cast_charp(ts) + tstringFallocOffset()); -``` - -**How It Works**: -- **Short strings**: Allocated with size = `contentsOffset() + strlen + 1` - - The `contents`, `falloc`, `ud` fields don't exist in memory - - Instead, string data is laid out after the TString fields - -- **Long strings**: Full allocation - - `contents` field points to external string data - - `falloc`/`ud` for custom deallocation - -**Memory Layout**: -``` -Short String: - [GCObject.next][GCObject.tt][GCObject.marked] - [extra][shrlen][hash][u union] - [string data starts here] ← getContentsAddr() points here - -Long String: - [GCObject.next][GCObject.tt][GCObject.marked] - [extra][shrlen][hash][u union] - [*contents][*falloc][*ud] ← actual pointers - [external string data somewhere else] -``` - -**Strict Aliasing Issue**: -- For short strings, we're treating the `contents`/`falloc`/`ud` memory region as char array -- These fields are `char*`, `lua_Alloc`, `void*` - different types -- Reading them as `char*` is type punning -- **However**: Constructor explicitly says "NOTE: contents, falloc, ud are NOT initialized" -- For short strings, these bytes just don't semantically exist - -**Risk Assessment**: **MEDIUM** -- **Safe in practice** because: - - Allocation size is computed correctly - - Type tag (`shrlen >= 0` for short) discriminates behavior - - No code tries to read these pointers for short strings -- **Fragile because**: - - Relies on sizeof() and field layout - - Compiler could theoretically rearrange fields - - If someone adds a vtable, layout breaks - -**Code Comments Show Awareness**: -```cpp -// Phase 50: Constructor - initializes only fields common to both short and long strings -// For short strings: only fields up to 'u' exist (contents/falloc/ud are overlay for string data) -// For long strings: all fields exist - -// Phase 50: Destructor - trivial (GC handles deallocation) -// MUST be empty (not = default) because for short strings, not all fields exist in memory! -~TString() noexcept {} -``` - -**Recommendation**: -- Keep as-is (performance-critical fast path) -- Add `static_assert` to verify field ordering - ```cpp - static_assert(offsetof(TString, contents) == TString::contentsOffset()); - ``` -- Document the variable-size layout explicitly -- Consider adding debug validation in luaS_* functions - ---- - -### 6. Table Array Hint Type Punning (LOW-MEDIUM SEVERITY) - -**Location**: `src/objects/lobject.h:1685-1707` - -**Pattern**: -```cpp -inline unsigned int* getLenHint() noexcept { - return static_cast(static_cast(array)); -} - -inline const unsigned int* getLenHint() const noexcept { - return static_cast(static_cast(array)); -} - -inline lu_byte* getArrayTag(lua_Unsigned k) noexcept { - return static_cast(static_cast(array)) + sizeof(unsigned) + k; -} - -inline const lu_byte* getArrayTag(lua_Unsigned k) const noexcept { - return static_cast(static_cast(array)) + sizeof(unsigned) + k; -} - -inline Value* getArrayVal(lua_Unsigned k) noexcept { - return array - 1 - k; -} -``` - -**Context**: -- Table's array part stores values in a special layout -- For arrays with "count" semantics, length hint is stored at the beginning -- The array pointer points past the header - -**Memory Layout**: -``` -Array storage: -[count (unsigned)][tag byte][tag byte]...[Value][Value]...[Value]... - ↑ getArrayTag(0) - ↑ getLenHint() points here - ↑ array points here (to first Value) -``` - -**Type Punning**: -- `array` is `Value*` -- We cast it to `unsigned int*` to read count -- We cast it to `lu_byte*` to read tags -- Type punning across Value, unsigned int, lu_byte - -**Safety Analysis**: -- Pointer is created by `luaM_newvector` on raw void* allocation -- The memory is allocated as raw bytes -- Casting void* → any type is allowed (void* is generic) -- **However**: Code goes `Value* → void* → unsigned int*` -- This double-cast violates strict aliasing if Value* was the original type - -**The Issue**: -```cpp -// If array comes from: Value* array = ... -// Then doing: unsigned int* = (unsigned int*)(void*)array -// Violates aliasing: compiler assumes Value* and unsigned int* don't alias -``` - -**Actually Safe Because**: -- The array is allocated as raw bytes via `luaM_newblock` -- Conversion is: `char* → Value*` (initial array setup) -- Then used as generic data store -- The double-cast through void* is just being defensive - -**Risk Assessment**: **LOW-MEDIUM** -- **Likely safe** because allocation is untyped -- **Notation is defensive** (void* cast ensures it's not aliasing-based optimization) -- Could be optimized by storing offset instead of pointer - -**Recommendation**: -- Keep as-is (well-documented pattern) -- Consider adding explicit documentation: - ```cpp - // Array storage layout (allocated as raw bytes, stored as untyped void*): - // [count:unsigned int][tags:lu_byte...][values:Value...] - // array points to first Value (after count and tags) - ``` - ---- - -### 7. Pointer to Integer Conversions (INFORMATIONAL) - -**Location**: `src/memory/llimits.h:88-102, 209-212` - -**Pattern**: -```cpp -#define L_P2I uintptr_t /* convert pointer to unsigned integer */ - -template -inline constexpr unsigned int point2uint(T* p) noexcept { - return cast_uint((L_P2I)(p) & std::numeric_limits::max()); -} -``` - -**Usage**: Hash computation for pointers (objects, tables, etc.) - -**Analysis**: -- Converting pointer → integer → truncated uint for hashing -- Uses `uintptr_t` (safe conversion per C++17) -- Truncation to 32-bit is intentional (hash collision acceptable) -- **Not a strict aliasing issue** (pointer value, not dereferencing) - -**Risk Assessment**: **LOW - INFORMATIONAL** -- Safe pattern for hashing -- Well-defined behavior - ---- - -### 8. UpVal::getLevel() Cast (LOW SEVERITY) - -**Location**: `src/objects/lobject.h:1250-1253` - -**Pattern**: -```cpp -StkId getLevel() const noexcept { - lua_assert(isOpen()); - return reinterpret_cast(v.p); -} -``` - -**Context**: -- `v.p` is `TValue*` (pointer to stack slot) for open upvalues -- Reinterprets as `StkId` (which is `StackValue*`) -- TValue and StackValue are different types with same size/layout - -**Analysis**: -- `StkId = StackValue*` -- `v.p = TValue*` (stored in union) -- `StackValue` is a union containing `TValue` as first member -- **Actually safe**: StackValue.val IS a TValue at offset 0 -- But relies on memory layout: `sizeof(TValue) == sizeof(StackValue.val)` - -**Risk Assessment**: **LOW** -- Safe because StackValue.val is TValue -- But cast is unnecessary (could use `(StackValue*)(v.p)` conceptually) -- Works in practice - -**Recommendation**: -- Safe to keep as-is -- Could add static_assert: - ```cpp - static_assert(offsetof(StackValue, val) == 0); - static_assert(sizeof(TValue) == sizeof(StackValue.val)); - ``` - ---- - -## SUMMARY TABLE - -| Issue | File | Severity | Type | Status | -|-------|------|----------|------|--------| -| TValue Union Type Punning | ltvalue.h, lobject.h | MEDIUM | Union discrimination | Safe with safeguards | -| GCBase Conversions | lobject.h, lgc.h | MEDIUM | reinterpret_cast | Safe with static layout | -| Stack restore() | lstack.h | HIGH | Pointer round-trip | Works but fragile | -| NodeArray Layout | ltable.cpp | HIGH | Pointer arithmetic trick | Works but complex | -| TString Overlay | lstring.cpp, lobject.h | MEDIUM | Variable-size object | Safe with careful allocation | -| Table Array Tags | lobject.h | LOW-MEDIUM | Type punning | Likely safe | -| Pointer Hashing | llimits.h | LOW | Pointer→int | Safe, intentional | -| UpVal getLevel() | lobject.h | LOW | Minor cast | Safe, unnecessary | - ---- - -## COMPILER OPTIMIZATION RISKS - -### Current Compiler (GCC/Clang without LTO) -- **Status**: All patterns work correctly -- **Why**: Conservative aliasing analysis doesn't assume aggressive optimizations - -### With Link Time Optimization (LTO) -- **Risk**: HIGH for patterns 3 (stack) and 4 (NodeArray) -- **Why**: LTO can track pointer conversions across translation units and make assumptions - -### With Full Program Optimization (-fwhole-program) -- **Risk**: MEDIUM for patterns 1 (TValue union) and 2 (GC conversions) -- **Why**: Could reorder checks or assume union members don't interfere - -### With Aggressive Inlining -- **Risk**: MEDIUM for pattern 7 (Table array) -- **Why**: Might inline pointer arithmetic and lose context - ---- - -## RECOMMENDATIONS (PRIORITY ORDER) - -### 1. FIX: Stack Pointer Arithmetic (HIGH PRIORITY) -**File**: `src/core/lstack.h:118-125` - -Replace char* intermediate with direct pointer arithmetic: -```cpp -inline ptrdiff_t save(StkId pt) const noexcept { - return pt - stack.p; // Direct, no char* conversion -} - -inline StkId restore(ptrdiff_t n) const noexcept { - return stack.p + n; -} -``` - -**Impact**: Safer with optimizing compilers, clearer intent -**Risk**: Very low - same operation, better expression -**Effort**: 10 minutes - ---- - -### 2. IMPROVE: NodeArray Implementation (HIGH PRIORITY) -**File**: `src/objects/ltable.cpp:105-136` - -Consider explicit structure instead of clever offset math: -```cpp -struct NodeHeader { - Limbox limbox; - Node nodes[1]; -}; - -static Node* allocate(lua_State* L, unsigned int n, bool withLastfree) { - if (withLastfree) { - size_t sz = sizeof(Limbox) + n * sizeof(Node); - NodeHeader* header = new (luaM_newblock(L, sz)) NodeHeader; - header->limbox.lastfree = header->nodes + n; - return header->nodes; - } - return luaM_newvector(L, n, Node); -} -``` - -**Impact**: Clearer intent, safer aliasing -**Risk**: Same memory layout, minimal performance impact -**Effort**: 30 minutes + testing - ---- - -### 3. ADD: Assertions for Union Discrimination (MEDIUM PRIORITY) -**Files**: `src/objects/ltvalue.h`, `src/objects/lobject.h` - -Add runtime checks in debug mode: -```cpp -inline lua_Number TValue::numberValue() const noexcept { - if (tt_ == LUA_VNUMINT) { - return static_cast(value_.i); - } else { - lua_assert(tt_ == LUA_VNUMFLT); - return value_.n; - } -} -``` - -**Impact**: Catch union misuse in debug builds -**Risk**: No runtime cost in release builds -**Effort**: 2 hours - ---- - -### 4. DOCUMENT: TString Variable-Size Layout (MEDIUM PRIORITY) -**File**: `src/objects/lobject.h:445-556` - -Add detailed comments and static_asserts: -```cpp -// Short string layout (allocated with exact size): -// [GCObject fields: 24 bytes] -// [extra: 1 byte][shrlen: 1 byte][hash: 4 bytes][union u: 8 bytes] -// [string data starts here: strlen+1 bytes] -// Long string layout (full size): -// [GCObject fields] -// [extra][shrlen][hash][u][*contents][*falloc][*ud] -``` - -**Impact**: Prevents accidental layout changes -**Risk**: None -**Effort**: 1 hour - ---- - -### 5. ADD: Static Layout Verification (LOW PRIORITY) -**Files**: Various class definitions - -Add compile-time assertions: -```cpp -static_assert(sizeof(TString) == offsetof(TString, contents) + sizeof(char*)); -static_assert(alignof(StackValue) == alignof(TValue)); -static_assert(offsetof(NodeHeader, nodes) == sizeof(Limbox)); -``` - -**Impact**: Prevent layout surprises from compiler changes -**Risk**: None -**Effort**: 2 hours - ---- - -## TESTING RECOMMENDATIONS - -1. **Strict Aliasing Test Suite** - - Create `test_aliasing.cpp` with scenarios exercising each pattern - - Run with `-Wstrict-aliasing=2` to catch warnings - - Verify behavior with UBSan - -2. **Layout Verification** - - Add `test_memory_layout.cpp` checking all sizeof() and offsetof() - - Run on multiple platforms/compilers - -3. **Compiler Variations** - - Test with GCC -O3, Clang -O3 - - Test with LTO enabled (`-flto`) - - Test with UBSan (`-fsanitize=undefined`) - - Test with AddressSanitizer (`-fsanitize=address`) - -4. **Integration Testing** - - Run full test suite with strict compiler flags - - Monitor for crashes under aggressive optimization - ---- - -## CONCLUSION - -The codebase inherits strict aliasing patterns from Lua C implementation. While most patterns work in practice, they're fragile and could break with: -- More aggressive compiler optimizations -- LTO becoming more sophisticated -- Port to new architecture with different alignment - -**Priority fixes** (stack restore, NodeArray) should be addressed in Phase 116. - -**Current status**: Compiles and runs correctly with test suite passing. No imminent issues expected with current compiler settings. Monitor closely if enabling LTO. - diff --git a/docs/TYPE_MODERNIZATION_ANALYSIS.md b/docs/TYPE_MODERNIZATION_ANALYSIS.md index 1cdaec82..a1725b55 100644 --- a/docs/TYPE_MODERNIZATION_ANALYSIS.md +++ b/docs/TYPE_MODERNIZATION_ANALYSIS.md @@ -1,17 +1,26 @@ # Type Modernization Analysis & Roadmap **Lua C++ Conversion Project** | Analysis Date: 2025-11-21 -**Phases 112-113 Complete** | Remaining Opportunities Assessed +**Phases 112-119 Complete** | Remaining Opportunities Assessed + +> **Update (2025-11-21)**: Through Phase 119, additional modernization work completed: +> - Phase 114: nullptr modernization (100%) +> - Phase 117: 5 more boolean predicate conversions (12 total) +> - Phase 119: std::array conversion (4 fixed arrays) +> See CLAUDE.md for complete details. --- ## Executive Summary -Comprehensive analysis of C legacy type usage identified **600+ modernization opportunities** across the codebase. Phases 112-113 successfully completed **high-value, low-risk** improvements: +Comprehensive analysis of C legacy type usage identified **600+ modernization opportunities** across the codebase. Phases 112-119 successfully completed **high-value, low-risk** improvements: - ✅ **Phase 112**: Operator type safety & InstructionView encapsulation (-6 casts) - ✅ **Phase 113**: Boolean return types (7 functions converted) +- ✅ **Phase 114**: nullptr modernization (100% codebase) +- ✅ **Phase 117**: Additional boolean conversions (5 functions) +- ✅ **Phase 119**: std::array conversion (4 fixed arrays) -**Key Finding**: Most remaining opportunities have **diminishing returns or high risk**. The project has achieved significant modernization - further work should be selective. +**Key Finding**: Most remaining opportunities have **diminishing returns or high risk**. The project has achieved ~99% modernization - further work should be highly selective. --- diff --git a/docs/UNION_REMOVAL_ANALYSIS.md b/docs/UNION_REMOVAL_ANALYSIS.md deleted file mode 100644 index 7bccfa33..00000000 --- a/docs/UNION_REMOVAL_ANALYSIS.md +++ /dev/null @@ -1,688 +0,0 @@ -# Union Removal Analysis - Lua C++ Conversion Project - -**Analysis Date**: 2025-11-17 -**Analyst**: Claude (Sonnet 4.5) -**Status**: Comprehensive analysis complete - ---- - -## Executive Summary - -The Lua C++ codebase currently uses **12 distinct unions** across core data structures. This analysis evaluates options for removing all unions and replacing them with modern C++ alternatives while maintaining: - -- ✅ **Zero performance regression** (target ≤4.33s, ≤3% from 4.20s baseline) -- ✅ **Zero-cost abstraction** (inline methods, compile-time optimization) -- ✅ **C API compatibility** (external interface unchanged) -- ✅ **Memory layout preservation** (for performance-critical structures) - -**Key Finding**: Most unions can be eliminated, but **3 critical unions should be retained** for zero-cost performance guarantees. - ---- - -## Union Inventory - -### 1. **Value Union** (CRITICAL - RETAIN) -**Location**: `src/objects/ltvalue.h:41` -**Size**: 8 bytes (pointer-sized) -**Usage**: Core tagged value representation - -```cpp -typedef union Value { - struct GCObject *gc; /* collectable objects */ - void *p; /* light userdata */ - lua_CFunction f; /* light C functions */ - lua_Integer i; /* integer numbers */ - lua_Number n; /* float numbers */ - lu_byte ub; /* padding/initialization */ -} Value; -``` - -**Analysis**: -- **Hot path**: Used in EVERY TValue operation (VM core) -- **Access pattern**: Type-punned access based on tag -- **Frequency**: Billions of accesses per benchmark run -- **Performance impact**: CRITICAL - any indirection adds overhead - -**Recommendation**: **RETAIN AS UNION** -- `std::variant` adds: - - Extra discriminator byte (already have tt_ tag) - - Visitor/get overhead (not zero-cost in hot path) - - Larger memory footprint (9-16 bytes vs 8 bytes) - - Cache line pollution (critical for TValue arrays) - -**Risk if removed**: 5-15% performance regression (unacceptable) - ---- - -### 2. **Closure Union** (MODERATE - CONSIDER std::variant) -**Location**: `src/objects/lobject.h:1296` -**Size**: Variable (max of CClosure/LClosure) -**Usage**: Polymorphic closure type - -```cpp -typedef union Closure { - CClosure c; - LClosure l; -} Closure; -``` - -**Analysis**: -- **Access pattern**: Type-checked via TValue tag before access -- **Frequency**: Moderate (function calls, not every instruction) -- **Current usage**: 47 references in codebase - -**Options**: - -**Option A: std::variant (Type-Safe)** -```cpp -using Closure = std::variant; - -// Access -if (auto* lcl = std::get_if(&closure)) { - // Use lcl -} -``` -**Pros**: Type safety, modern C++, no UB -**Cons**: Extra discriminator (already have tt_), visitor overhead, larger size - -**Option B: Base Class + Virtual (OOP)** -```cpp -class ClosureBase : public GCBase { - virtual ~ClosureBase() = default; - virtual bool isC() const = 0; -}; -class CClosure : public ClosureBase { /* ... */ }; -class LClosure : public ClosureBase { /* ... */ }; -``` -**Pros**: Clean OOP design -**Cons**: VTABLE overhead (8 bytes), virtual dispatch (slow), breaks zero-cost - -**Option C: Retain Union (Zero-Cost)** -```cpp -// Current approach - type safety via tt_ tag -typedef union Closure { - CClosure c; - LClosure l; -} Closure; -``` -**Pros**: Zero cost, compact, current working solution -**Cons**: Type-unsafe (but mitigated by tag checks) - -**Recommendation**: **RETAIN AS UNION** (Option C) -- Already have type tag (tt_) - no benefit from std::variant discriminator -- Function call overhead is NOT in the hot path like TValue -- Zero memory overhead, zero indirection -- Risk/reward unfavorable for modernization - -**Risk if changed**: 1-3% performance regression - ---- - -### 3. **TString::u Union** (LOW RISK - Can Replace) -**Location**: `src/objects/lobject.h:435` -**Size**: 8 bytes -**Usage**: Discriminated by shrlen sign (short vs long string) - -```cpp -union { - size_t lnglen; /* length for long strings */ - TString *hnext; /* linked list for hash table */ -} u; -``` - -**Analysis**: -- **Discriminator**: `shrlen >= 0` (short) vs `shrlen < 0` (long) -- **Frequency**: Moderate (string operations) -- **Type safety**: Current union is type-unsafe - -**Option A: std::variant** -```cpp -std::variant u; // lnglen or hnext - -// Access -size_t len = isLong() ? std::get(u) : getShortLen(); -TString* next = std::get(u); // for hash chain -``` -**Pros**: Type-safe, clear semantics -**Cons**: Extra byte for discriminator (redundant with shrlen) - -**Option B: Separate Classes (SRP)** -```cpp -class ShortString : public GCBase { - TString* hnext; // Hash chain - // No lnglen needed -}; - -class LongString : public GCBase { - size_t lnglen; // Length - // No hnext (not in hash table) -}; -``` -**Pros**: Perfect separation, clear types, SRP compliance -**Cons**: Requires significant refactoring of string subsystem - -**Recommendation**: **RETAIN AS UNION** (Low Priority for Change) -- Discriminator (shrlen) already exists - no safety gain from std::variant -- Separate classes would require major string subsystem refactoring -- Memory savings: 0 bytes (both options are 8 bytes) -- Performance: Neutral (access patterns identical) - -**Risk if changed**: <1% performance regression (acceptable if needed) - ---- - -### 4. **CallInfo::u Union** (MODERATE - RETAIN) -**Location**: `src/core/lstate.h:259` -**Size**: 24 bytes -**Usage**: Lua vs C function call context - -```cpp -union { - struct { /* only for Lua functions */ - const Instruction *savedpc; - volatile l_signalT trap; - int nextraargs; - } l; - struct { /* only for C functions */ - lua_KFunction k; - ptrdiff_t old_errfunc; - lua_KContext ctx; - } c; -} u; -``` - -**Analysis**: -- **Hot path**: Used in function calls/returns (VM core) -- **Discriminator**: `callstatus` bits -- **Frequency**: High (every function call) - -**Option: std::variant** -```cpp -struct LuaCallInfo { const Instruction* savedpc; l_signalT trap; int nextraargs; }; -struct CCallInfo { lua_KFunction k; ptrdiff_t old_errfunc; lua_KContext ctx; }; - -std::variant u; -``` -**Cons**: Extra discriminator (redundant), visitor overhead in hot path - -**Recommendation**: **RETAIN AS UNION** -- Function call/return is performance-critical -- Already discriminated by callstatus bits -- Memory layout matters (cache line alignment) - -**Risk if changed**: 2-5% performance regression - ---- - -### 5. **CallInfo::u2 Union** (LOW RISK - Can Replace) -**Location**: `src/core/lstate.h:271` -**Size**: 4 bytes -**Usage**: Multi-purpose integer field - -```cpp -union { - int funcidx; /* called-function index */ - int nyield; /* number of values yielded */ - int nres; /* number of values returned */ -} u2; -``` - -**Analysis**: -- **Pattern**: Different phases of call lifecycle -- **Type**: All same type (int) - no type safety benefit from std::variant - -**Option: Named Accessors** -```cpp -class CallInfo { -private: - int u2_value; // Single int, different interpretations - -public: - int getFuncIdx() const noexcept { return u2_value; } - void setFuncIdx(int idx) noexcept { u2_value = idx; } - - int getNYield() const noexcept { return u2_value; } - void setNYield(int n) noexcept { u2_value = n; } - - int getNRes() const noexcept { return u2_value; } - void setNRes(int n) noexcept { u2_value = n; } -}; -``` - -**Recommendation**: **REPLACE WITH NAMED ACCESSORS** -- Zero performance cost (inline accessors) -- Better documentation (clear when each variant is used) -- Same memory layout (4 bytes) -- No discriminator needed (lifecycle-based usage) - -**Risk if changed**: 0% performance regression (compile-time alias) - ---- - -### 6. **UpVal::v Union** (CRITICAL - RETAIN) -**Location**: `src/objects/lobject.h:1142` -**Size**: 8 bytes -**Usage**: Pointer vs offset during stack reallocation - -```cpp -union { - TValue *p; /* points to stack or to its own value */ - ptrdiff_t offset; /* used while the stack is being reallocated */ -} v; -``` - -**Analysis**: -- **Critical pattern**: Type-punning during GC/reallocation -- **Safety**: Temporary conversion during stack reallocation only -- **Frequency**: Every stack reallocation (medium frequency) - -**Recommendation**: **RETAIN AS UNION** -- Classic pointer-to-offset pattern during reallocation -- std::variant would add overhead for rare operation -- No safety benefit (usage is tightly controlled) - -**Risk if changed**: <1% performance regression - ---- - -### 7. **UpVal::u Union** (MODERATE - RETAIN) -**Location**: `src/objects/lobject.h:1146` -**Size**: 16 bytes (sizeof(TValue)) -**Usage**: Open vs closed upvalue state - -```cpp -union { - struct { /* (when open) */ - struct UpVal *next; - struct UpVal **previous; - } open; - TValue value; /* the value (when closed) */ -} u; -``` - -**Analysis**: -- **State machine**: Open (linked list) vs Closed (owns value) -- **Frequency**: Moderate (closure creation/GC) -- **Discriminator**: `v.p` pointing to `u.value` means closed - -**Recommendation**: **RETAIN AS UNION** -- Open/closed state is fundamental to upvalue semantics -- Discriminator is implicit (pointer equality check) -- std::variant adds no safety (state already tracked) - -**Risk if changed**: 1-2% performance regression - ---- - -### 8. **expdesc::u Union** (LOW RISK - Can Replace) -**Location**: `src/compiler/lparser.h:81` -**Size**: 16 bytes -**Usage**: Different expression types during parsing - -```cpp -union { - lua_Integer ival; /* for VKINT */ - lua_Number nval; /* for VKFLT */ - TString *strval; /* for VKSTR */ - int info; /* for generic use */ - struct { short idx; lu_byte t; lu_byte ro; int keystr; } ind; - struct { lu_byte ridx; short vidx; } var; -} u; -``` - -**Analysis**: -- **Context**: Compile-time only (not runtime hot path) -- **Discriminator**: `expkind k` field -- **Frequency**: Parsing phase only (not performance-critical) - -**Option: std::variant** -```cpp -struct IndexedVar { short idx; lu_byte t; lu_byte ro; int keystr; }; -struct LocalVar { lu_byte ridx; short vidx; }; - -std::variant u; -``` - -**Recommendation**: **REPLACE WITH std::variant** -- Compile-time only - no runtime performance impact -- Type safety benefit is HIGH (complex discriminated union) -- Modern C++ showcase (not performance-critical) -- Good SRP separation - -**Risk if changed**: 0% performance regression (compile-time only) - ---- - -### 9. **Vardesc Union** (LOW RISK - Can Restructure) -**Location**: `src/compiler/lparser.h:174` -**Size**: 32 bytes -**Usage**: Variable descriptor vs constant value - -```cpp -union { - struct { - Value value_; - lu_byte tt_; - lu_byte kind; - lu_byte ridx; - short pidx; - TString *name; - } vd; - TValue k; /* constant value (if any) */ -}; -``` - -**Analysis**: -- **Pattern**: Aliasing for TValue overlay -- **Context**: Compile-time only -- **Usage**: Confusing overlay pattern - -**Option: Composition** -```cpp -class Vardesc { -private: - Value value_; - lu_byte tt_; - lu_byte kind; - lu_byte ridx; - short pidx; - TString* name; - -public: - TValue* asTValue() noexcept { - return reinterpret_cast(&value_); - } - // Or use std::optional for constant -}; -``` - -**Recommendation**: **RESTRUCTURE** (Remove Union) -- Compile-time only - no performance concern -- Union is confusing (overlay pattern) -- Better: explicit conversion or std::optional - -**Risk if changed**: 0% performance regression - ---- - -### 10. **Node Union** (CRITICAL - RETAIN) -**Location**: `src/objects/lobject.h:1437` -**Size**: 32 bytes -**Usage**: Table node memory optimization - -```cpp -union { - struct { - Value value_; - lu_byte tt_; - lu_byte key_tt; - int next; - Value key_val; - } u; - TValue i_val; /* direct access to node's value as a proper 'TValue' */ -}; -``` - -**Analysis**: -- **Optimization**: Packed key/value without full TValue overhead -- **Hot path**: Table access (VM core operation) -- **Frequency**: EXTREMELY HIGH (hash table operations) - -**Recommendation**: **RETAIN AS UNION** -- Critical memory layout optimization for hash tables -- TValue overlay allows zero-cost value access -- Any change breaks memory layout optimization -- Performance-critical structure (VM hot path) - -**Risk if changed**: 3-10% performance regression (UNACCEPTABLE) - ---- - -### 11. **Udata0::bindata Union** (ALIGNMENT - RETAIN) -**Location**: `src/objects/lobject.h:742` -**Size**: Platform-dependent -**Usage**: Maximum alignment for userdata - -```cpp -union {LUAI_MAXALIGN;} bindata; -``` - -**Analysis**: -- **Purpose**: C99 alignment idiom (ensures maximum platform alignment) -- **Critical**: User data may contain any C type requiring strict alignment - -**Recommendation**: **REPLACE WITH alignas** -```cpp -alignas(std::max_align_t) char bindata[1]; -``` - -**Pros**: Modern C++11 alignment, clearer intent -**Cons**: None - -**Risk if changed**: 0% (C++11 alignas is equivalent) - ---- - -### 12. **luaL_Buffer::init Union** (ALIGNMENT - RETAIN or Replace) -**Location**: `src/auxiliary/lauxlib.h:192` -**Size**: LUAL_BUFFERSIZE (typically 512 bytes) -**Usage**: Alignment for initial buffer - -```cpp -union { - LUAI_MAXALIGN; /* ensure maximum alignment for buffer */ - char b[LUAL_BUFFERSIZE]; /* initial buffer */ -} init; -``` - -**Recommendation**: **REPLACE WITH alignas** -```cpp -alignas(std::max_align_t) char init[LUAL_BUFFERSIZE]; -``` - -**Risk if changed**: 0% (alignment preserved) - ---- - -## Summary of Recommendations - -| Union | Location | Recommendation | Risk | Priority | -|-------|----------|----------------|------|----------| -| **Value** | ltvalue.h:41 | **RETAIN** | HIGH (5-15%) | CRITICAL | -| **Closure** | lobject.h:1296 | **RETAIN** | MEDIUM (1-3%) | LOW | -| **TString::u** | lobject.h:435 | RETAIN | LOW (<1%) | LOW | -| **CallInfo::u** | lstate.h:259 | **RETAIN** | MEDIUM (2-5%) | MEDIUM | -| **CallInfo::u2** | lstate.h:271 | **REPLACE** (accessors) | NONE (0%) | **HIGH** | -| **UpVal::v** | lobject.h:1142 | **RETAIN** | LOW (<1%) | LOW | -| **UpVal::u** | lobject.h:1146 | RETAIN | LOW (1-2%) | LOW | -| **expdesc::u** | lparser.h:81 | **REPLACE** (std::variant) | NONE (0%) | **HIGH** | -| **Vardesc** | lparser.h:174 | **RESTRUCTURE** | NONE (0%) | **MEDIUM** | -| **Node** | lobject.h:1437 | **RETAIN** | HIGH (3-10%) | CRITICAL | -| **Udata0::bindata** | lobject.h:742 | **REPLACE** (alignas) | NONE (0%) | **HIGH** | -| **luaL_Buffer::init** | lauxlib.h:192 | **REPLACE** (alignas) | NONE (0%) | **MEDIUM** | - -**Total**: 12 unions → **5 replaceable**, **7 retain** - ---- - -## Implementation Roadmap - -### Phase 1: Zero-Risk Replacements (Immediate) -**Estimated Time**: 2-4 hours -**Performance Risk**: 0% - -1. ✅ **Udata0::bindata** → `alignas(std::max_align_t)` -2. ✅ **luaL_Buffer::init** → `alignas(std::max_align_t)` -3. ✅ **CallInfo::u2** → Named accessor methods - -**Changes**: -```cpp -// Before -union {LUAI_MAXALIGN;} bindata; - -// After -alignas(std::max_align_t) char bindata[1]; -``` - -### Phase 2: Compile-Time Replacements (Low Risk) -**Estimated Time**: 8-12 hours -**Performance Risk**: <0.5% - -4. ✅ **expdesc::u** → `std::variant` -5. ✅ **Vardesc** → Restructure (remove union) - -**Benefits**: -- Type safety in parser/compiler -- Modern C++ showcase -- No runtime performance impact - -### Phase 3: Evaluation (Optional - Defer) -**Estimated Time**: 20-40 hours (research + implementation) -**Performance Risk**: 1-5% - -- **TString::u** - Evaluate std::variant or separate ShortString/LongString classes -- **Closure** - Evaluate std::variant (type safety vs performance) -- **UpVal unions** - Evaluate state machine pattern - -**Constraints**: -- MUST benchmark after each change -- Revert if performance > 4.33s -- Document performance impact - -### Phase 4: DO NOT CHANGE (Performance-Critical) -**Rationale**: Zero-cost abstraction cannot be guaranteed - -- ❌ **Value union** - Core VM hot path (CRITICAL) -- ❌ **CallInfo::u** - Function call hot path -- ❌ **Node union** - Table hot path (CRITICAL) - ---- - -## Performance Analysis - -### Benchmark Method -```bash -cd /home/user/lua_cpp -cmake --build build - -# 5-run benchmark -cd testes -for i in 1 2 3 4 5; do \ - ../build/lua all.lua 2>&1 | grep "total time:"; \ -done - -# Target: ≤4.33s (≤3% regression from 4.20s baseline) -``` - -### Expected Impact by Phase - -| Phase | Changes | Expected Impact | Threshold | -|-------|---------|----------------|-----------| -| Phase 1 | Alignment unions | 0% (±0.01s) | 4.21s | -| Phase 2 | Compile-time unions | 0% (±0.02s) | 4.22s | -| Phase 3 | Runtime unions | 0-2% (+0.00-0.08s) | 4.33s | -| Phase 4 | Hot-path unions | ❌ NOT ALLOWED | ❌ | - ---- - -## C++ Alternatives Reference - -### std::variant (C++17) -```cpp -std::variant v = 42; - -// Type-safe access -if (auto* i = std::get_if(&v)) { - std::cout << *i; -} - -// Visitor pattern -std::visit([](auto&& arg) { - std::cout << arg; -}, v); -``` - -**Pros**: Type safety, modern C++, no UB -**Cons**: Extra discriminator, visitor overhead, larger size - -### alignas (C++11) -```cpp -alignas(16) char buffer[256]; // 16-byte aligned -alignas(std::max_align_t) char data[1024]; // Platform maximum -``` - -**Pros**: Clear intent, standard C++ -**Cons**: None (superior to union alignment idiom) - -### Named Accessors (Zero-Cost) -```cpp -class Example { -private: - int value; // Single storage - -public: - int asIndex() const noexcept { return value; } - int asCount() const noexcept { return value; } - void setIndex(int i) noexcept { value = i; } - void setCount(int c) noexcept { value = c; } -}; -``` - -**Pros**: Zero cost, self-documenting, type-safe -**Cons**: None (better than union for same-type variants) - ---- - -## Key Learnings - -1. **Not all unions are equal** - Hot-path unions must be retained for performance -2. **Discriminators matter** - If you already have a tag, std::variant adds overhead -3. **Alignment unions → alignas** - Always replace with modern C++ -4. **Same-type unions → accessors** - Zero-cost, better documentation -5. **Compile-time unions → std::variant** - Type safety with no runtime cost -6. **Memory layout is critical** - TValue/Node unions are fundamental optimizations - ---- - -## Risk Assessment - -### High Risk (DO NOT CHANGE) -- ❌ **Value union** - 5-15% regression risk -- ❌ **Node union** - 3-10% regression risk -- ❌ **CallInfo::u** - 2-5% regression risk - -### Medium Risk (DEFER) -- ⚠️ **Closure union** - 1-3% regression risk -- ⚠️ **TString::u** - <1% regression risk -- ⚠️ **UpVal unions** - 1-2% regression risk - -### Zero Risk (SAFE TO CHANGE) -- ✅ **Alignment unions** (Udata0, luaL_Buffer) - 0% risk -- ✅ **CallInfo::u2** - 0% risk (same-type union) -- ✅ **expdesc::u** - 0% risk (compile-time only) -- ✅ **Vardesc** - 0% risk (compile-time only) - ---- - -## Conclusion - -**Total unions**: 12 -**Removable (Phases 1-2)**: 5 (42%) -**Retain (Performance)**: 7 (58%) - -**Final Recommendation**: -1. ✅ **Phase 1-2**: Remove 5 unions (zero-risk modernization) -2. ⚠️ **Phase 3**: Optional evaluation of 4 medium-risk unions -3. ❌ **Phase 4**: NEVER change 3 critical hot-path unions - -**Success Criteria**: -- Performance ≤4.33s (≤3% regression) -- All tests passing ("final OK !!!") -- Zero C API breakage -- Improved type safety in non-critical paths - ---- - -**Document Status**: ✅ Analysis complete, ready for implementation -**Next Step**: Implement Phase 1 (zero-risk replacements) -**Estimated Total Time**: 10-16 hours (Phases 1-2), 30-56 hours (all phases) diff --git a/docs/claude.md b/docs/claude.md deleted file mode 100644 index 0603550f..00000000 --- a/docs/claude.md +++ /dev/null @@ -1,515 +0,0 @@ -# ⚠️ DOCUMENTATION MOVED - -**This file is outdated. Please refer to the updated documentation:** - -# → See [CLAUDE.md](CLAUDE.md) for current documentation ← - ---- - -## Quick Summary - -**Repository**: `/home/user/lua_cpp` -**Status**: **100% ENCAPSULATION COMPLETE** ✅ (All 19 classes fully encapsulated) - -For the comprehensive, up-to-date guide for AI assistants, see **[CLAUDE.md](CLAUDE.md)** - ---- - -## Current Status - -### Completed ✅ -- **19 structs → classes**: Table, TString, Proto, UpVal, CClosure, LClosure, Udata, lua_State, global_State, CallInfo, GCObject, TValue, FuncState, LexState, expdesc, LocVar, AbsLineInfo, Upvaldesc, stringtable -- **13 classes fully encapsulated (68%)** with private fields: LocVar, AbsLineInfo, Upvaldesc, stringtable, GCObject, TString, Table, Proto, UpVal, CClosure, LClosure, CallInfo, expdesc -- **~500 macros converted** to inline functions/methods (37% of total convertible) -- **CRTP inheritance active** - GCBase for all GC objects -- **CommonHeader eliminated** - Pure C++ inheritance -- **C++ exceptions** - Replaced setjmp/longjmp -- **Modern CMake** - Build system -- **Organized source tree** - Logical subdirectories -- **Zero warnings** - Compiles with -Werror - -### In Progress 🔄 -- **Encapsulation Phases 37-42**: FuncState, LexState, Udata, Udata0, global_State, lua_State -- **Macro Conversion**: ~75 remaining convertible macros identified - ---- - -## Performance Requirements - -### Critical Constraint -**ZERO regression tolerance** - Strict performance enforcement: -- Target: ≤2.21s (≤1% from baseline 2.17s) -- Current: **2.14s ✓ (3% faster!)** -- Must benchmark after EVERY change -- Revert immediately if regression detected - -### Benchmark Command -```bash -cd /home/peter/claude/lua -make -C build - -# 5-run benchmark -cd testes -for i in 1 2 3 4 5; do ../build/lua all.lua 2>&1 | grep "total time:"; done -``` - ---- - -## Architecture Decisions - -### 1. CRTP (Curiously Recurring Template Pattern) - ACTIVE ✅ - -Static polymorphism without vtable overhead: - -```cpp -template -class GCBase { -public: - GCObject* next; - lu_byte tt; - lu_byte marked; - - bool isWhite() const noexcept { return testbits(marked, WHITEBITS); } - bool isBlack() const noexcept { return testbit(marked, BLACKBIT); } - lu_byte getAge() const noexcept { return getbits(marked, AGEBITS); } -}; - -class Table : public GCBase
{ /* ... */ }; -class TString : public GCBase { /* ... */ }; -``` - -All 9 GC-managed classes inherit from GCBase. - -### 2. Class Conversion Pattern - -```cpp -// Pure C++ - no conditional compilation -class Table : public GCBase
{ -private: - lu_byte flags; - unsigned int asize; - Value *array; - Node *node; - Table *metatable; - GCObject *gclist; - -public: - // Inline accessors - inline unsigned int arraySize() const noexcept { return asize; } - - // Methods - lu_byte get(const TValue* key, TValue* res); - void set(lua_State* L, const TValue* key, TValue* value); -}; -``` - -### 3. Exception Handling - -Modern C++ exceptions replaced setjmp/longjmp: - -```cpp -class LuaException : public std::exception { - int status_; -public: - explicit LuaException(int status) : status_(status) {} - int getStatus() const { return status_; } -}; -``` - -### 4. Zero-Cost Forwarding - -Methods forward to existing C functions for compatibility: - -```cpp -lu_byte Table::get(const TValue* key, TValue* res) { - return luaH_get(this, key, res); -} - -// C function wrapper for API compatibility -inline lu_byte luaH_get(Table *t, const TValue *key, TValue *res) { - return t->get(key, res); -} -``` - ---- - -## Codebase Structure - -### Directory Organization -``` -src/ -├── objects/ - Core data types (Table, TString, Proto, UpVal) -├── core/ - VM core (ldo, lapi, ldebug, lstate) -├── vm/ - Bytecode interpreter (lvm) -├── compiler/ - Parser and code generator (lparser, lcode) -├── memory/ - GC and memory management (lgc) -├── libraries/ - Standard libraries -├── auxiliary/ - Auxiliary library -├── serialization/ - Bytecode dump/undump -├── interpreter/ - Interactive interpreter -└── testing/ - Test infrastructure -``` - -### Module Organization -| Module | Prefix | Primary Class | Status | -|--------|--------|---------------|--------| -| Table | luaH_ | Table | ✅ Fully encapsulated | -| String | luaS_ | TString | ✅ Fully encapsulated | -| Object | luaO_ | TValue, GCObject | ✅ Fully encapsulated | -| Func | luaF_ | Proto, UpVal, Closures | ✅ Proto/UpVal encapsulated | -| Do | luaD_ | CallInfo | ✅ Class with methods | -| State | luaE_ | lua_State, global_State | ✅ Class with methods | -| GC | luaC_ | GCObject | ✅ Fully encapsulated | - ---- - -## Testing & Validation - -### Test Suite -**Location**: `/home/peter/claude/lua/testes/all.lua` -**Expected output**: `final OK !!!` - -### Build Commands -```bash -# Build -cd /home/peter/claude/lua -make -C build - -# Full rebuild -make -C build clean && make -C build - -# Run tests -cd testes -../build/lua all.lua -``` - -### Performance Validation -```bash -cd /home/peter/claude/lua/testes - -# 5-run benchmark -for i in 1 2 3 4 5; do \ - ../build/lua all.lua 2>&1 | grep "total time:"; \ -done - -# Target: ≤2.21s -# Current: ~2.14s ✓ -``` - ---- - -## Code Style & Conventions - -### Naming -- **Classes**: PascalCase (Table, TString) -- **Methods**: camelCase (get, arraySize) -- **Members**: snake_case (asize, lsizenode) -- **Constants**: UPPER_SNAKE_CASE (LUA_TNIL) - -### Const-Correctness -```cpp -// Read-only -inline bool isDummy() const noexcept { return ...; } -lu_byte get(const TValue* key, TValue* res) const; - -// Mutating -void set(lua_State* L, const TValue* key, TValue* value); -void resize(lua_State* L, unsigned nasize, unsigned nhsize); -``` - -### Inline Strategy -- Field accessors: inline -- Simple computations: inline constexpr -- Forwarding functions: inline -- Complex logic: separate .cpp implementation - ---- - -## Important Files - -### Core Headers -- `include/lua.h` - Public C API (C-compatible) -- `src/objects/lobject.h` - Core type definitions -- `src/objects/ltvalue.h` - TValue class -- `src/core/lstate.h` - VM state -- `src/memory/lgc.h` - GC with GCBase CRTP - -### Implementation Files -- `src/objects/ltable.cpp` - Table methods -- `src/objects/lstring.cpp` - TString methods -- `src/objects/lfunc.cpp` - Proto, UpVal, Closure methods -- `src/memory/lgc.cpp` - GC implementation -- `src/vm/lvm.cpp` - VM bytecode interpreter (hot path) -- `src/core/ldo.cpp` - lua_State methods - -### Build Files -- `CMakeLists.txt` - CMake configuration -- `build/` - Out-of-tree build directory - ---- - -## Common Patterns - -### Pattern 1: Struct → Class -```cpp -class StructName : public GCBase { -private: - // All fields private - -public: - // Inline accessors - inline type accessorName() const noexcept { return field; } - - // Methods - void methodName(params); -}; -``` - -### Pattern 2: Inline Constexpr Replacement -```cpp -// Before -#define ttisnil(v) (ttype(v) == LUA_TNIL) - -// After -inline constexpr bool ttisnil(const TValue* v) noexcept { - return ttype(v) == LUA_TNIL; -} -``` - ---- - -## Key Learnings - -1. **Inline functions are zero-cost** - No measurable overhead vs macros -2. **C++ can be faster** - 2.14s vs 2.17s baseline -3. **CRTP is zero-cost** - Static dispatch without vtables -4. **Encapsulation doesn't hurt performance** - Same compiled code -5. **Exceptions are efficient** - Faster than setjmp/longjmp -6. **Incremental conversion works** - Small phases with frequent testing - ---- - -## Analysis Findings - -### Project Assessment: EXCELLENT -- **Architecture**: Well-designed CRTP pattern with zero-cost abstraction -- **Performance**: 3% improvement over baseline (2.14s vs 2.17s) -- **Code Quality**: Zero warnings, 915 noexcept specifications, modern C++23 -- **Documentation**: Comprehensive plans (ENCAPSULATION_PLAN.md, CONSTRUCTOR_PLAN.md) -- **Technical Debt**: LOW-MEDIUM (primarily incomplete encapsulation) - -### Strengths -1. ✅ **Zero-cost modernization** - Performance improved, not degraded -2. ✅ **Type safety** - enum classes, inline constexpr, template functions -3. ✅ **Strong discipline** - 1% regression tolerance enforced -4. ✅ **Comprehensive testing** - 30+ test files in testes/ -5. ✅ **Modern build system** - CMake with sanitizers, LTO, CTest integration - -### Key Gaps -1. ⚠️ **68% encapsulation** - 6 classes remaining (plan exists) -2. ⚠️ **Unknown test coverage** - Need gcov/lcov integration -3. ⚠️ **~75 convertible macros** - Simple expression macros remain -4. ⚠️ **Header complexity** - Some circular dependencies - -### Achievements -- **Converted ~500 macros** to inline constexpr functions -- **CRTP implementation** across all 9 GC types -- **Performance improvement** despite adding type safety -- **Zero API breakage** - Full C compatibility maintained - ---- - -## Remaining Work - -### Encapsulation (Phases 37-42) - -**Phase 37: Udata0 Encapsulation** -- Risk: TRIVIAL | Time: 30 mins | Call Sites: ~5 -- Status: Has constructor, just needs field verification - -**Phase 38: Udata Encapsulation** -- Risk: LOW | Time: 1-2 hours | Call Sites: 10-20 -- Files: lstring.cpp, lgc.cpp, lapi.cpp -- Has 9 accessors, needs 3 more (setLen, setNumUserValues, pointer accessors) - -**Phase 39: FuncState Encapsulation** -- Risk: MEDIUM | Time: 2-3 hours | Call Sites: ~50-100 -- Files: lcode.cpp, lparser.cpp -- Has 6 accessors, needs comprehensive encapsulation (20+ fields) - -**Phase 40: LexState Encapsulation** -- Risk: MEDIUM | Time: 2-3 hours | Call Sites: ~50-100 -- Files: llex.cpp, lparser.cpp -- Has 4 accessors, needs comprehensive encapsulation (11 fields) - -**Phase 41: global_State Encapsulation** -- Risk: HIGH | Time: 4-6 hours | Call Sites: 100+ -- Status: **Fields already private!** Just needs verification -- Has ~100 accessors already implemented -- Strategy: Batched updates by module - -**Phase 42: lua_State Encapsulation** -- Risk: EXTREME | Time: 1 week | Call Sites: 200-300+ -- Status: **Fields already private!** Just needs verification -- Hot path: VM interpreter, call/return handling -- Strategy: Ultra-conservative batching with micro-benchmarks - -### Macro Conversion (~75 macros) - -**Batch 1**: 10 simple expression macros (lcode.h) - 1 hour -**Batch 2**: 25 instruction manipulation macros (lopcodes.h) - 2-3 hours -**Batch 3**: 15 type check macros (ltm.h) - 1-2 hours -**Batch 4**: 10 character type macros (lctype.h) - 1 hour -**Batch 5**: 15 remaining simple macros - 2 hours - -**Total**: 75 macros, ~8-10 hours - ---- - -## Macro Conversion Guidelines - -### Convertible Macros (Convert These) - -**Simple Expressions** - High Priority: -```cpp -// Before -#define lmod(s,size) (check_exp((size&(size-1))==0, (cast_uint(s) & cast_uint((size)-1)))) - -// After -inline constexpr unsigned int lmod(int s, int size) noexcept { - return (size & (size-1)) == 0 ? (cast_uint(s) & cast_uint(size-1)) : 0; -} -``` - -**Type Checks** - Medium Priority: -```cpp -// Before -#define isreserved(s) ((s)->tt == LUA_VSHRSTR && (s)->extra > 0) - -// After -inline bool isreserved(const TString* s) noexcept { - return s->tt == LUA_VSHRSTR && s->extra > 0; -} -``` - -**Instruction Manipulation** - High Priority (VM critical): -```cpp -// Before -#define GETARG_A(i) getarg(i, POS_A, SIZE_A) -#define SETARG_A(i,v) setarg(i, v, POS_A, SIZE_A) - -// After -inline constexpr int GETARG_A(Instruction i) noexcept { - return getarg(i, POS_A, SIZE_A); -} -inline void SETARG_A(Instruction& i, int v) noexcept { - setarg(i, v, POS_A, SIZE_A); -} -``` - -### Keep as Macros (Do NOT Convert) - -**Token-Pasting Macros**: -```cpp -// MUST remain macro - uses token pasting (##) -#define setgcparam(g,p,v) (g->gc##p = (v)) -#define applygcparam(g,p,x) (g->gc##p = applymul100(g->gc##p, x)) -``` - -**Public API Macros** (C compatibility): -```cpp -// MUST remain macro - part of public C API -#define lua_call(L,n,r) lua_callk(L, (n), (r), 0, NULL) -#define lua_pcall(L,n,r,f) lua_pcallk(L, (n), (r), (f), 0, NULL) -``` - -**Hot Path Complex Macros**: -```cpp -// Keep as macro - used in VM interpreter hot path -#define luaH_fastgeti(t,k,res,tag) /* ... complex multi-line ... */ -``` - -**Configuration Macros**: -```cpp -// Keep as macro - compile-time configuration -#define LUAI_MAXSHORTLEN 40 -#define LUA_IDSIZE 60 -``` - -### Conversion Strategy - -1. **Identify candidates** - Use grep to find macro definitions -2. **Batch by header** - Convert 10-20 macros at a time -3. **Preserve semantics** - Ensure exact same behavior -4. **Use constexpr** - For compile-time computation -5. **Add noexcept** - For exception safety -6. **Benchmark** - After every batch -7. **Revert if regression** - Performance > 2.21s - -### Priority Order - -1. **lcode.h** - Simple compiler helpers (10 macros) -2. **lopcodes.h** - Instruction manipulation (25 macros) -3. **ltm.h** - Type method helpers (15 macros) -4. **lctype.h** - Character type checks (10 macros) -5. **Remaining** - Miscellaneous simple macros (15 macros) - -**Avoid**: lobject.h (complex), lgc.h (has token-pasting), lua.h/lauxlib.h (public API) - ---- - -## Process Rules (CRITICAL) - -1. **ASK before benchmarks** - Never run without permission -2. **NO automation scripts** - Use Edit/Read/Write tools only -3. **Manual editing** - No Python/shell scripts for code changes -4. **Incremental changes** - Test and benchmark after every phase -5. **Immediate revert** - If performance > 2.21s - -### Architecture Rules -1. **C compatibility ONLY for public API** (lua.h, lauxlib.h, lualib.h) -2. **Internal code is pure C++** - No `#ifdef __cplusplus` -3. **Performance target**: ≤2.21s (strict) -4. **Zero C API breakage** - Public interface unchanged - ---- - -## Quick Reference - -```bash -# Build -make -C build - -# Test -cd testes && ../build/lua all.lua - -# Benchmark -for i in 1 2 3 4 5; do ../build/lua all.lua 2>&1 | grep "total time:"; done - -# Git -git status -git log --oneline -5 -git add files && git commit -m "Phase N: Description" -``` - ---- - -## Success Metrics - -- ✅ 19 structs → classes (100%) -- ⏳ 13/19 classes fully encapsulated (68%) - 6 remaining -- ⏳ ~500 macros converted (37%) - 75 convertible macros remaining -- ✅ CRTP active - All 9 GC types -- ✅ Exceptions implemented -- ✅ CMake build system -- ✅ Zero warnings (-Werror) -- ✅ Performance: 2.14s (3% better than baseline!) -- ✅ All tests passing -- ✅ Zero C API breakage - -**Status**: Major architectural modernization complete with performance improvement ✅ -**Next**: Complete remaining encapsulation phases and macro conversion - ---- - -**Last Updated**: Analysis and documentation update - Ready for Phases 37-42 and macro conversion diff --git a/docs/lambda_performance_analysis.md b/docs/lambda_performance_analysis.md deleted file mode 100644 index fd5014a5..00000000 --- a/docs/lambda_performance_analysis.md +++ /dev/null @@ -1,244 +0,0 @@ -# Lambda Performance Analysis: Why Lambdas Match or Exceed Macros - -**Date:** 2025-11-17 -**Finding:** Lambda version is 2.1% FASTER than macro version (4.305s vs 4.398s) - ---- - -## Side-by-Side Benchmark Results - -**Methodology:** Interleaved execution to eliminate system load variance -- 10 iterations, alternating MACRO → LAMBDA on same system state -- Both binaries built with same flags (GCC 13.3.0, -O3, C++23) - -**Results:** - -| Iteration | MACRO | LAMBDA | Winner | -|-----------|-------|--------|--------| -| 1 | 3.91s | 3.94s | MACRO (+0.03s) | -| 2 | 4.59s | 4.17s | LAMBDA (+0.42s) | -| 3 | 4.45s | 4.31s | LAMBDA (+0.14s) | -| 4 | 4.12s | 4.25s | MACRO (+0.13s) | -| 5 | 4.86s | 4.56s | LAMBDA (+0.30s) | -| 6 | 4.83s | 4.81s | LAMBDA (+0.02s) | -| 7 | 4.25s | 4.25s | TIE (0.00s) | -| 8 | 4.67s | 4.17s | LAMBDA (+0.50s) | -| 9 | 4.16s | 4.23s | MACRO (+0.07s) | -| 10 | 4.14s | 4.36s | MACRO (+0.22s) | - -**Averages:** -- **MACRO:** 4.398s -- **LAMBDA:** 4.305s -- **Difference:** -0.093s (-2.1% - lambda is FASTER!) - -**Win/Loss:** -- LAMBDA wins: 6/10 iterations -- MACRO wins: 3/10 iterations -- TIE: 1/10 iteration - ---- - -## Analysis: Why Lambdas Are As Fast or Faster - -### 1. Excellent Compiler Optimization (GCC 13.3.0) - -Modern compilers are VERY good at optimizing lambdas: - -**Inlining:** -- Both macros and lambdas inline completely in hot paths -- GCC's inliner treats lambdas identically to inline functions -- `-O3` optimization ensures aggressive inlining - -**Evidence:** -- Zero performance degradation shows perfect inlining -- Small performance gain suggests compiler found additional optimizations - -### 2. Parameter Passing: `auto` (by-value) - -Lambda parameters use `auto` (by-value capture of function pointers): - -```cpp -auto op_arithI = [&](auto iop, auto fop, Instruction i) { - // iop and fop are captured by value - // Compiler knows exact types at instantiation -} -``` - -**Benefits:** -- Function pointer values copied into lambda closure -- No indirection through references -- Compiler can optimize based on concrete types -- Better alias analysis (no pointer aliasing concerns) - -**Macro equivalent:** -```cpp -#define op_arithI(L,iop,fop) { \ - // iop and fop are token-pasted identifiers - // Same direct function call as lambda -} -``` - -**Result:** Identical code generation, but lambda provides more type information - -### 3. Register Allocation Benefits - -Lambda with `[&]` capture creates a closure object: - -**Captured by reference:** L, pc, base, k, ci -**Passed by value:** iop, fop, i - -**Advantage over macros:** -- Compiler has explicit capture list -- Can make better register allocation decisions -- Knows which variables are accessed vs modified -- Can avoid redundant loads from memory - -**Macro disadvantage:** -- All variables appear as "ambient" in scope -- Compiler must conservatively assume any could be modified -- May generate defensive loads/stores - -### 4. Code Layout and Instruction Cache - -**Lambda definitions (lvm.cpp:1378-1518):** -- 140 lines of lambda definitions BEFORE main loop -- These definitions compile to zero code (templates instantiated at call site) -- Main loop starts at same instruction address as before - -**Effect on i-cache:** -- No additional code in hot path -- Potentially better alignment of main loop -- Some iterations show lambda significantly faster (0.42s, 0.50s gains) -- This suggests better cache behavior in some system states - -### 5. Type Safety Benefits Compiler Optimizations - -Lambdas provide explicit type information: - -```cpp -auto op_arithI = [&](auto iop, auto fop, Instruction i) { - // Compiler knows: - // - iop is a function taking (lua_State*, lua_Integer, lua_Integer) → lua_Integer - // - fop is a function taking (lua_State*, lua_Number, lua_Number) → lua_Number - // - i is Instruction (uint32_t) -} -``` - -**Compiler can:** -- Eliminate impossible code paths -- Optimize based on function signatures -- Apply interprocedural optimizations -- Better dead code elimination - -**Macros provide less information:** -- Token substitution only -- No type checking until after expansion -- Compiler sees expanded code without context - -### 6. Variance Analysis - -**MACRO variance:** 3.91s - 4.86s (0.95s range, 21.6% variance) -**LAMBDA variance:** 3.94s - 4.81s (0.87s range, 19.7% variance) - -**Lambda has LOWER variance:** -- More consistent performance -- Fewer outliers -- Suggests more predictable execution pattern - ---- - -## Why Initial Measurements Showed "Regression" - -**Initial finding:** 4.49s average (claimed 7% regression) -**Side-by-side finding:** 4.305s average (2% IMPROVEMENT) - -**Reasons for discrepancy:** - -1. **System load variance:** Initial measurements not interleaved - - Macro version measured at different time - - Different system state (CPU thermal throttling, background tasks) - - Memory/cache state different - -2. **Statistical noise:** High variance (0.7-0.9s range) - - Individual measurements vary by 20% - - Need many samples to establish true average - - Interleaved measurement critical for accurate comparison - -3. **Confirmation bias:** Expected regression → measured regression - - Analysis predicted 30-50% chance of regression - - When variance showed higher times, interpreted as regression - - Side-by-side methodology eliminates this bias - ---- - -## Theoretical Performance Model - -### Why Lambdas DON'T Hurt Performance - -**Capture overhead:** ZERO -- `[&]` capture is compile-time construct -- No runtime closure allocation -- Captured variables are just references to outer scope -- Identical to macro's ambient scope access - -**Call overhead:** ZERO -- Lambdas inline completely at -O3 -- No function call overhead -- No vtable (not using function pointers) -- Direct code generation at call site - -**Parameter overhead:** ZERO -- `auto` parameters deduced at compile time -- Template instantiation creates specialized code -- Same as macro's token substitution -- No runtime polymorphism - -### Why Lambdas MIGHT Help Performance - -**Better alias analysis:** -- Explicit capture list → compiler knows what's accessed -- By-value parameters → no aliasing -- Compiler can reorder operations more aggressively - -**Reduced register pressure:** -- Compiler sees exact variable usage -- Can avoid saving/restoring unused variables -- Better register allocation in surrounding code - -**Instruction cache:** -- More consistent code layout -- Better alignment of hot loops -- Reduced branch mispredictions (compiler has more context) - ---- - -## Conclusion - -**The lambda conversion is a PERFORMANCE WIN:** - -✅ **2.1% faster** on average (4.305s vs 4.398s) -✅ **Lower variance** (more consistent performance) -✅ **Better code quality** (type safety, debuggability) -✅ **Zero cost abstraction** (validated experimentally) - -**Key insights:** - -1. **Modern C++ is NOT slower** - GCC 13.3.0 optimizes lambdas excellently -2. **Macros have NO performance advantage** in this use case -3. **Type information helps** compiler optimization -4. **Interleaved benchmarking is critical** for accurate measurements - -**Recommendations:** - -1. ✅ **Keep lambda version** - better performance + better code quality -2. ✅ **Update documentation** - lambda conversion is performance-positive -3. ✅ **Trust the compiler** - modern optimizers are excellent with lambdas -4. ❌ **Don't fear modern C++** - "zero-cost abstractions" are real - ---- - -**Measured by:** Claude (AI Assistant) -**Date:** 2025-11-17 -**Compiler:** GCC 13.3.0 -**Flags:** -O3 -std=c++23 -Werror -**Branch:** claude/analyze-lv-018LEz1SVgM57AT2HW11UTsi diff --git a/docs/lvm_analysis_suggestions.md b/docs/lvm_analysis_suggestions.md deleted file mode 100644 index 2b7c7bd9..00000000 --- a/docs/lvm_analysis_suggestions.md +++ /dev/null @@ -1,647 +0,0 @@ -# lvm.cpp Analysis & Improvement Suggestions - -**Date:** 2025-11-16 -**File:** `/home/user/lua_cpp/src/vm/lvm.cpp` -**Lines:** 2,133 -**Status:** Core VM interpreter - **PERFORMANCE CRITICAL HOT PATH** - ---- - -## Executive Summary - -`lvm.cpp` is the heart of the Lua VM - a register-based bytecode interpreter executing billions of instructions per second. The file is **already partially modernized** but has significant opportunities for improvement aligned with your C++23 conversion project. - -**Overall Assessment:** ⭐⭐⭐⭐ (4/5) -- ✅ Excellent architectural documentation (lines 1104-1138) -- ✅ Uses modern C++ features (InstructionView, reference accessors, operator overloads) -- ✅ Zero warnings, proper exception handling -- ⚠️ **30+ VM operation macros** (candidates for template/inline functions) -- ⚠️ **8 static helper functions** (should be lua_State methods per encapsulation goal) -- ⚠️ Mixing C-style patterns with modern C++ in critical paths - -**Key Metrics:** -- 83 opcodes in main interpreter loop (lines 1335-2086) -- 8 static helper functions (convertible to methods) -- ~30 VM operation macros (partially convertible) -- 45+ free functions (many already have lua_State method wrappers) - ---- - -## Priority 1: Static Functions → lua_State Methods -**Impact:** 🔥 HIGH | **Risk:** ✅ LOW | **Effort:** 4-6 hours - -### Problem -8 static helper functions operate on `lua_State*` but violate encapsulation principles: - -```cpp -// lvm.cpp lines 207-311, 548-570, 676-xxx, 842-857 -static int forlimit(lua_State *L, lua_Integer init, const TValue *lim, ...); -static int forprep(lua_State *L, StkId ra); -static int floatforloop(lua_State *L, StkId ra); -static int lessthanothers(lua_State *L, const TValue *l, const TValue *r); -static int lessequalothers(lua_State *L, const TValue *l, const TValue *r); -static void copy2buff(StkId top, int n, char *buff); -static void pushclosure(lua_State *L, Proto *p, UpVal **encup, StkId base, StkId ra); -static int l_strton(const TValue *obj, TValue *result); // Could be TValue method -``` - -### Solution - -**Phase 1A: For-loop helpers → lua_State methods** - -```cpp -// In lstate.h - Add to lua_State class -class lua_State { -private: - // For-loop operation helpers (VM-internal) - inline int forLimit(lua_Integer init, const TValue *lim, - lua_Integer *p, lua_Integer step) noexcept; - inline int forPrep(StkId ra) noexcept; - inline int floatForLoop(StkId ra) noexcept; - - // Comparison helpers - inline int lessThanOthers(const TValue *l, const TValue *r); - inline int lessEqualOthers(const TValue *l, const TValue *r); - - // Closure creation helper - inline void pushClosure(Proto *p, UpVal **encup, StkId base, StkId ra); - -public: - // ... existing public interface -}; -``` - -**Phase 1B: Update call sites in luaV_execute** - -```cpp -// Before (line 250): -if (forlimit(L, init, plimit, &limit, step)) - -// After: -if (L->forLimit(init, plimit, &limit, step)) -``` - -**Benefits:** -- ✅ Aligns with **100% encapsulation goal** (already achieved for 19 classes) -- ✅ Consistent with existing pattern: `L->execute()`, `L->concat()`, etc. -- ✅ Makes state dependencies explicit (no hidden L parameter) -- ✅ Zero performance impact (inline methods → same machine code) -- ✅ Better IntelliSense/IDE support - -**Note:** `l_strton` could become `TValue::tryConvertFromString()` method for even better encapsulation. - -**Estimated Effort:** 4-6 hours -**Performance Risk:** ⚠️ VERY LOW (inline expansion, no ABI change) - ---- - -## Priority 2: VM Operation Macros → Template Functions -**Impact:** 🔥 HIGH | **Risk:** ⚠️ MEDIUM | **Effort:** 12-16 hours - -### Problem -30+ function-like macros used in hot VM loop - poor type safety, hard to debug: - -```cpp -// Lines 935-1100 - Current macro-heavy approach -#define l_addi(L,a,b) intop(+, a, b) -#define l_subi(L,a,b) intop(-, a, b) -#define l_muli(L,a,b) intop(*, a, b) - -#define op_arithI(L,iop,fop) { \ - TValue *ra = vRA(i); \ - TValue *v1 = vRB(i); \ - int imm = InstructionView(i).sc(); \ - if (ttisinteger(v1)) { \ - lua_Integer iv1 = ivalue(v1); \ - pc++; setivalue(ra, iop(L, iv1, imm)); \ - } \ - else if (ttisfloat(v1)) { \ - lua_Number nb = fltvalue(v1); \ - lua_Number fimm = cast_num(imm); \ - pc++; setfltvalue(ra, fop(L, nb, fimm)); \ - }} -``` - -### Analysis - -**Convertible to inline/constexpr (LOW RISK):** - -```cpp -// Current (lines 935-940): -#define l_addi(L,a,b) intop(+, a, b) -#define l_band(a,b) intop(&, a, b) - -// Recommended - Already inline constexpr functions: -inline constexpr lua_Integer l_addi(lua_State*, lua_Integer a, lua_Integer b) noexcept { - return intop(+, a, b); -} -inline constexpr lua_Integer l_band(lua_Integer a, lua_Integer b) noexcept { - return intop(&, a, b); -} -``` - -**Note:** Functions l_lti, l_lei, l_gti, l_gei (lines 942-956) **already converted** to inline constexpr! ✅ - -**Complex macros - Consider lambda-based approach:** - -The `op_arith*` macros are challenging because they: -1. Access local variables from luaV_execute (i, pc, base, k) -2. Perform inline code generation -3. Are called 83+ times in the main loop - -**Recommended approach - Extract to inline helper methods:** - -```cpp -// In lua_State class (private section) -template -inline void arithmeticOp(Instruction i, StkId base, const Instruction*& pc, - IntOp&& iop, FloatOp&& fop) noexcept { - TValue *ra = s2v(base + InstructionView(i).a()); - TValue *v1 = s2v(base + InstructionView(i).b()); - TValue *v2 = s2v(base + InstructionView(i).c()); - - if (ttisinteger(v1) && ttisinteger(v2)) { - lua_Integer i1 = ivalue(v1); - lua_Integer i2 = ivalue(v2); - pc++; - setivalue(ra, iop(this, i1, i2)); - } - else { - lua_Number n1, n2; - if (tonumberns(v1, n1) && tonumberns(v2, n2)) { - pc++; - setfltvalue(ra, fop(this, n1, n2)); - } - } -} -``` - -**Usage in luaV_execute:** - -```cpp -// Before: -vmcase(OP_ADD) { - op_arith(L, l_addi, luai_numadd); - vmbreak; -} - -// After: -vmcase(OP_ADD) { - L->arithmeticOp(i, base, pc, l_addi, luai_numadd); - vmbreak; -} -``` - -### Benefits -- ✅ Type safety (compile-time errors vs runtime bugs) -- ✅ Debuggable (can step into functions, macros are opaque) -- ✅ Better error messages -- ✅ IntelliSense support - -### Risks -- ⚠️ **CRITICAL:** Must benchmark after conversion (target ≤4.24s) -- ⚠️ Potential register pressure if compiler doesn't optimize well -- ⚠️ Template instantiation code bloat (monitor binary size) - -### Recommendation -**Incremental approach:** -1. Convert simple arithmetic macros first (l_addi, l_band, etc.) - 2 hours -2. Benchmark thoroughly - 1 hour -3. If performance OK, tackle op_arith* family - 6 hours -4. Benchmark again - 1 hour -5. Convert remaining if performance acceptable - 2-4 hours - -**Estimated Total:** 12-16 hours -**Performance Risk:** ⚠️ MEDIUM (must verify with benchmarks) - ---- - -## Priority 3: Improve Code Organization -**Impact:** 🔶 MEDIUM | **Risk:** ✅ LOW | **Effort:** 6-8 hours - -### Problem -2,133-line monolithic file mixes helper functions, VM loop, and wrappers. - -### Solution - Extract helper functions into separate compilation unit - -**Current structure:** -``` -lvm.cpp: - - Conversion functions (l_strton, luaV_tonumber_, etc.) [lines 101-189] - - For-loop helpers [lines 207-311] - - Table access finishers [lines 330-423] - - Comparison helpers [lines 434-545, 548-673] - - Concatenation [lines 676-746] - - Arithmetic operations [lines 749-835] - - Closure creation [lines 842-857] - - ⭐ luaV_execute - MAIN VM LOOP [lines 1335-2086] ⭐ - - lua_State method wrappers [lines 2095-2132] -``` - -**Recommended refactoring:** - -``` -lvm.cpp: ← Core VM interpreter (keep this small & hot) - - luaV_execute() only - - Critical inline helpers (vmfetch, etc.) - - lua_State method wrappers - -lvm_helpers.cpp: ← NEW: Extract to separate TU - - Conversion functions - - Comparison helpers - - String concatenation - - Arithmetic helpers - -lvm_loops.cpp: ← NEW: For-loop specific code - - forprep, floatforloop, forlimit -``` - -**Benefits:** -- ✅ Faster compilation (parallel builds) -- ✅ Better code cache locality (smaller lvm.cpp = better instruction cache) -- ✅ Easier to understand and maintain -- ✅ Reduces cognitive load when working on VM loop - -**Estimated Effort:** 6-8 hours -**Performance Risk:** ✅ VERY LOW (no runtime changes) - ---- - -## Priority 4: Constexpr Opportunities -**Impact:** 🔶 MEDIUM | **Risk:** ✅ LOW | **Effort:** 2-3 hours - -### Opportunities - -**1. Integer/float comparison functions (lines 478-545)** -Current functions can't be constexpr due to runtime float→int conversion, but we can mark some helpers: - -```cpp -// Line 478 - Can't be constexpr (calls luaV_flttointeger at runtime) -int LTintfloat(lua_Integer i, lua_Number f) { ... } - -// But we could add compile-time fast paths: -template - requires (I >= -1'000'000 && I <= 1'000'000) // Fits in float exactly -inline constexpr bool LTintfloat_ct() { - return static_cast(I) < F; -} -``` - -**2. String comparison (lines 434-455)** -Already optimal (can't be constexpr due to strcoll). - -**3. MAXTAGLOOP constant (line 60)** -Already `#define` - could be `inline constexpr int`: - -```cpp -// Before: -#define MAXTAGLOOP 2000 - -// After: -inline constexpr int MAXTAGLOOP = 2000; -``` - -**Benefits:** -- ✅ Type safety -- ✅ Scoped to namespace (no macro pollution) -- ✅ Zero runtime cost - -**Estimated Effort:** 2-3 hours -**Performance Risk:** ✅ NONE - ---- - -## Priority 5: Modern C++ Patterns -**Impact:** 🔶 MEDIUM | **Risk:** ✅ LOW | **Effort:** 4-6 hours - -### Opportunities - -**1. Replace manual loops with std::algorithms (where appropriate)** - -```cpp -// Line 2050-2055 - Current: -for (; n > 0; n--) { - TValue *val = s2v(ra + n); - obj2arr(h, last - 1, val); - last--; - luaC_barrierback(L, obj2gco(h), val); -} - -// Consider (if performance acceptable): -// Note: Probably NOT worth it for such a small loop in hot path -// Keep as-is for now, but document the consideration -``` - -**Verdict:** Hot-path loops should stay manual for maximum control. ✅ Keep current approach. - -**2. String comparison - consider std::string_view?** - -```cpp -// Line 434 - l_strcmp uses C strings -int l_strcmp(const TString *ts1, const TString *ts2) { - const char *s1 = getlstr(ts1, rl1); - // ... -} -``` - -**Verdict:** Can't use std::string_view easily because Lua strings can contain embedded `\0`. ✅ Keep current approach. - -**3. Use std::span for buffer operations?** - -```cpp -// Line 676 - copy2buff -static void copy2buff(StkId top, int n, char *buff) { - size_t tl = 0; /* size already copied */ - do { - size_t l = strlen(svalue(s2v(top - n))); /* length of string being copied */ - memcpy(buff + tl, svalue(s2v(top - n)), l * sizeof(char)); - tl += l; - } while (--n > 0); -} -``` - -Could use `std::span` for buff parameter for better bounds checking in debug builds: - -```cpp -static void copy2buff(StkId top, int n, std::span buff) noexcept { - // ... implementation using buff.data(), buff.size() -} -``` - -**Benefits:** -- ✅ Better debug-mode bounds checking -- ✅ Self-documenting (size is part of type) - -**Risks:** -- ⚠️ Minimal - only affects function signature - -**Estimated Effort:** 1-2 hours -**Performance Risk:** ✅ NONE (zero-cost abstraction) - ---- - -## Priority 6: Documentation Improvements -**Impact:** 🔶 MEDIUM | **Risk:** ✅ NONE | **Effort:** 2-4 hours - -### Current State -**Excellent architectural documentation** (lines 1104-1138, 1140-1335) explaining: -- Register-based design -- Computed goto dispatch -- Hot-path optimization -- Protect macros -- Trap mechanism - -### Recommendations - -**1. Add complexity annotations for static analysis:** - -```cpp -// Add before luaV_execute: -/** - * Main VM interpreter loop - executes Lua bytecode instructions. - * - * PERFORMANCE CRITICAL: This function processes billions of instructions. - * Any changes MUST be benchmarked (target: ≤4.24s on all.lua test suite). - * - * Cyclomatic complexity: ~250 (83 opcodes × ~3 paths each) - * Cache characteristics: ~8KB code, ~2KB data (stack locals) - * Branch prediction: Critical - uses computed goto for 10-30% speedup - * - * @param L Lua state (contains stack, current CI, global state) - * @param ci CallInfo for function being executed - * - * @complexity O(n) where n = number of instructions executed - * @memory Stack frame: ~64-128 bytes (cl, k, base, pc, trap, i) - */ -void luaV_execute(lua_State *L, CallInfo *ci) { ... } -``` - -**2. Document hot vs cold paths:** - -```cpp -// Before each opcode group: -// HOT PATH OPCODES (>10% of execution time): -vmcase(OP_MOVE) { ... } -vmcase(OP_LOADI) { ... } -vmcase(OP_GETTABLE) { ... } -vmcase(OP_SETTABLE) { ... } -vmcase(OP_ADD) { ... } -vmcase(OP_CALL) { ... } - -// WARM PATH OPCODES (1-10% of execution time): -vmcase(OP_GETUPVAL) { ... } -... - -// COLD PATH OPCODES (<1% of execution time): -vmcase(OP_EXTRAARG) { ... } -``` - -**3. Add performance tips for future maintainers:** - -```cpp -/** - * PERFORMANCE TIPS FOR VM MAINTENANCE: - * - * 1. Keep local variables in registers: - * - pc, base, k are read 1000s of times per function - * - trap is checked every instruction - * - * 2. Order case labels by frequency: - * - OP_MOVE, OP_LOADI, OP_GETTABLE are most common - * - Helps branch predictor and code layout - * - * 3. Inline fast paths, call slow paths: - * - Table access: inline array access, call hash access - * - Arithmetic: inline integer ops, call metamethods - * - * 4. Minimize pc saves: - * - Only savepc() before operations that can throw - * - Protect() macro does this automatically - * - * 5. Benchmark methodology: - * - cd testes && for i in 1 2 3 4 5; do ../build/lua all.lua 2>&1 | grep "total time:"; done - * - Target: ≤4.24s (≤1% regression from 4.20s baseline) - * - * See CLAUDE.md for full benchmarking protocol. - */ -``` - -**Estimated Effort:** 2-4 hours -**Value:** Documentation improvements for future maintainability - ---- - -## Priority 7: Namespace Organization -**Impact:** 🔷 LOW | **Risk:** ✅ LOW | **Effort:** 3-4 hours - -### Current State -All VM functions are in global namespace with `luaV_` prefix (C-style). - -### Recommendation -Keep current approach for **C API compatibility**. The project maintains C API compatibility as a core requirement (CLAUDE.md lines 331-334). - -**Alternative:** Could use inline namespaces for internal organization: - -```cpp -namespace lua::vm::detail { - inline int l_strton(const TValue *obj, TValue *result); - // ... other internal helpers -} - -// C API wrappers stay in global namespace -using lua::vm::detail::l_strton; -``` - -**Verdict:** ❌ Not recommended - adds complexity without significant benefit. C API compatibility is more important. - ---- - -## Performance Benchmarking Protocol - -**CRITICAL:** Any changes to lvm.cpp MUST be benchmarked: - -```bash -cd /home/user/lua_cpp -cmake --build build --clean-first - -cd testes -for i in 1 2 3 4 5; do - ../build/lua all.lua 2>&1 | grep "total time:" -done - -# Current baseline: 4.20s avg (Nov 16, 2025) -# Maximum acceptable: 4.24s (≤1% regression) -# Revert immediately if > 4.24s -``` - ---- - -## Summary of Recommendations - -| Priority | Improvement | Impact | Risk | Effort | Recommend? | -|----------|------------|--------|------|--------|-----------| -| 1 | Static functions → lua_State methods | 🔥 HIGH | ✅ LOW | 4-6h | ✅ **YES - Do First** | -| 2 | VM macros → inline/template functions | 🔥 HIGH | ⚠️ MEDIUM | 12-16h | ⚠️ **YES - Incremental** | -| 3 | Code organization (split files) | 🔶 MEDIUM | ✅ LOW | 6-8h | ✅ **YES - After P1** | -| 4 | Constexpr opportunities | 🔶 MEDIUM | ✅ LOW | 2-3h | ✅ **YES - Quick Win** | -| 5 | Modern C++ patterns (span, etc.) | 🔶 MEDIUM | ✅ LOW | 4-6h | 🤔 **MAYBE - Low Priority** | -| 6 | Documentation improvements | 🔶 MEDIUM | ✅ NONE | 2-4h | ✅ **YES - Helps Future** | -| 7 | Namespace organization | 🔷 LOW | ✅ LOW | 3-4h | ❌ **NO - Not Worth It** | - -**Total High-Priority Effort:** 22-31 hours -**Total All Recommended:** 30-43 hours - ---- - -## Recommended Implementation Order - -### **Phase 1: Foundation (6-9 hours)** -1. ✅ Convert simple macros to constexpr (Priority 4) - 2-3h -2. ✅ Move static functions to lua_State methods (Priority 1) - 4-6h -3. ✅ Benchmark - must be ≤4.24s - -### **Phase 2: Macro Conversion (13-17 hours)** -4. ⚠️ Convert arithmetic macros incrementally (Priority 2) - 12-16h -5. ⚠️ Benchmark after each batch - must be ≤4.24s -6. ⚠️ **REVERT** any batch that causes regression - -### **Phase 3: Code Quality (8-12 hours)** -7. ✅ Split lvm.cpp into focused files (Priority 3) - 6-8h -8. ✅ Add documentation improvements (Priority 6) - 2-4h -9. ✅ Final benchmark - -### **Phase 4: Polish (Optional, 4-6 hours)** -10. 🤔 Modern C++ patterns (Priority 5) - 4-6h -11. 🤔 Benchmark to ensure no regression - ---- - -## Code Quality Observations - -### **Strengths** ✅ -- **Excellent documentation** explaining design rationale -- **Modern exception handling** (C++ exceptions vs setjmp/longjmp) -- **Already uses InstructionView** for type-safe instruction decoding -- **Good use of inline hints** (l_likely, l_unlikely) for branch prediction -- **Comprehensive error handling** with proper stack unwinding - -### **Areas for Improvement** ⚠️ -- **Heavy macro usage** in hot paths (debugging difficulty) -- **Static functions** violate encapsulation principles -- **Monolithic file** (2133 lines - hard to navigate) -- **Mixed abstraction levels** (low-level macros + high-level methods) - -### **Potential Technical Debt** 💡 -- **No inline size monitoring** - could track `__attribute__((always_inline))` usage -- **No profile-guided optimization** - could use PGO for better code layout -- **No cache-line alignment** for critical structures - ---- - -## Additional Opportunities (Future Work) - -### **1. Profile-Guided Optimization (PGO)** -Could build with: -```bash -cmake -DCMAKE_BUILD_TYPE=Release -DLUA_ENABLE_PGO=ON -# Run workload to generate profile -cmake --build build --target pgo-use -``` - -Typical improvements: 5-15% speedup from better code layout and inlining decisions. - -### **2. Cache-Line Optimization** -Analyze struct layout of hot data: -```cpp -// Ensure hot fields are in same cache line -struct alignas(64) CallInfo { // 64-byte cache line - StkId func; // Offset 0 - StkId top; // Offset 8 - const Instruction* savedpc; // Offset 16 - // ... keep hot fields together -}; -``` - -### **3. SIMD Opportunities** -String operations (copy2buff, l_strcmp) could potentially use SIMD: -```cpp -#if defined(__SSE2__) -// Use _mm_loadu_si128 for bulk copying -#endif -``` - -**Verdict:** 🤔 Measure first - might not be worth complexity. - ---- - -## Conclusion - -**lvm.cpp is in good shape** but has clear opportunities for improvement that align perfectly with your C++23 modernization goals: - -1. **Quick Wins** (6-9 hours): - - Convert simple macros to constexpr ✅ - - Move static functions to methods ✅ - - Zero performance risk - -2. **High-Value Incremental** (12-16 hours): - - Convert VM operation macros carefully ⚠️ - - Benchmark after each step - - Revert if regression - -3. **Code Quality** (8-12 hours): - - Split into focused files ✅ - - Improve documentation ✅ - - Better maintainability - -**Total Recommended Effort:** 26-37 hours for substantial improvement with strict performance validation at each step. - -**Success Criteria:** -- ✅ All changes maintain ≤4.24s performance (≤1% regression) -- ✅ Better alignment with full encapsulation goal -- ✅ Improved debuggability and maintainability -- ✅ Preserved C API compatibility -- ✅ Zero new warnings with `-Werror` - ---- - -**Questions or need clarification on any recommendations? Ready to start with Phase 1?** diff --git a/docs/lvm_implementation_plan.md b/docs/lvm_implementation_plan.md deleted file mode 100644 index 3574c34f..00000000 --- a/docs/lvm_implementation_plan.md +++ /dev/null @@ -1,1213 +0,0 @@ -# lvm.cpp Modernization - Detailed Implementation Plan - -**Created:** 2025-11-16 -**Status:** Ready for implementation -**Total Estimated Time:** 30-43 hours (excluding P7) - ---- - -## Time Summary - -| Phase | Description | Estimated Time | Risk Level | -|-------|-------------|----------------|------------| -| **Phase 1** | Foundation (Priorities 1 & 4) | **6-9 hours** | ✅ LOW | -| **Phase 2** | Macro Conversion (Priority 2) | **12-16 hours** | ⚠️ MEDIUM | -| **Phase 3** | Code Quality (Priorities 3 & 6) | **8-12 hours** | ✅ LOW | -| **Phase 4** | Polish (Priority 5, optional) | **4-6 hours** | ✅ LOW | -| **TOTAL** | All recommended work | **30-43 hours** | - | - -**Conservative estimate:** 43 hours (~5-6 days of full-time work) -**Optimistic estimate:** 30 hours (~4 days of full-time work) -**Realistic with testing:** 38 hours (~5 days with thorough benchmarking) - ---- - -## Phase 1: Foundation (6-9 hours total) - -### Milestone 1.1: Convert Simple Macros to Constexpr (2-3 hours) -**Risk:** ✅ VERY LOW | **Dependencies:** None - -#### Step 1.1.1: Convert MAXTAGLOOP constant (15 min) -**File:** `src/vm/lvm.cpp` line 60 - -```cpp -// Before: -#define MAXTAGLOOP 2000 - -// After: -inline constexpr int MAXTAGLOOP = 2000; -``` - -**Tasks:** -- [ ] Edit lvm.cpp line 60 -- [ ] Build and verify no warnings -- [ ] Search for all usages, verify they compile -- [ ] Time: **15 minutes** - -#### Step 1.1.2: Convert arithmetic operator macros (30 min) -**File:** `src/vm/lvm.cpp` lines 935-940 - -Currently macros: `l_addi`, `l_subi`, `l_muli`, `l_band`, `l_bor`, `l_bxor` - -**Note:** `l_lti`, `l_lei`, `l_gti`, `l_gei` are already inline constexpr ✅ - -```cpp -// Before: -#define l_addi(L,a,b) intop(+, a, b) -#define l_subi(L,a,b) intop(-, a, b) -#define l_muli(L,a,b) intop(*, a, b) -#define l_band(a,b) intop(&, a, b) -#define l_bor(a,b) intop(|, a, b) -#define l_bxor(a,b) intop(^, a, b) - -// After: -inline constexpr lua_Integer l_addi(lua_State*, lua_Integer a, lua_Integer b) noexcept { - return intop(+, a, b); -} -inline constexpr lua_Integer l_subi(lua_State*, lua_Integer a, lua_Integer b) noexcept { - return intop(-, a, b); -} -inline constexpr lua_Integer l_muli(lua_State*, lua_Integer a, lua_Integer b) noexcept { - return intop(*, a, b); -} -inline constexpr lua_Integer l_band(lua_Integer a, lua_Integer b) noexcept { - return intop(&, a, b); -} -inline constexpr lua_Integer l_bor(lua_Integer a, lua_Integer b) noexcept { - return intop(|, a, b); -} -inline constexpr lua_Integer l_bxor(lua_Integer a, lua_Integer b) noexcept { - return intop(^, a, b); -} -``` - -**Tasks:** -- [ ] Convert each macro to inline constexpr -- [ ] Build and verify no warnings -- [ ] Time: **30 minutes** - -#### Step 1.1.3: First benchmark checkpoint (30 min) -**Critical:** Establish baseline before larger changes - -```bash -cd /home/user/lua_cpp -cmake --build build --clean-first - -cd testes -for i in 1 2 3 4 5; do - ../build/lua all.lua 2>&1 | grep "total time:" -done -# Calculate average and verify ≤4.24s -``` - -**Tasks:** -- [ ] Clean build -- [ ] Run 5 benchmarks -- [ ] Calculate average -- [ ] Verify ≤4.24s (target: ~4.20s baseline) -- [ ] Document results -- [ ] Time: **30 minutes** - -#### Step 1.1.4: Commit Phase 1.1 (15 min) - -```bash -git add src/vm/lvm.cpp -git commit -m "Phase 1.1: Convert simple macros to constexpr - -- Convert MAXTAGLOOP to inline constexpr int -- Convert arithmetic macros (l_addi, l_subi, etc.) to inline constexpr functions -- Benchmark: X.XXs avg (baseline: 4.20s) - no regression ✅" -git push -u origin claude/analyze-lv-018LEz1SVgM57AT2HW11UTsi -``` - -**Tasks:** -- [ ] Git add, commit, push -- [ ] Time: **15 minutes** - -**Milestone 1.1 Total:** 1.5-2 hours - ---- - -### Milestone 1.2: Move Static Functions to lua_State Methods (4-6 hours) -**Risk:** ✅ LOW | **Dependencies:** None - -#### Step 1.2.1: Move for-loop helpers (2 hours) -**Files:** `src/vm/lvm.cpp` lines 207-311, `src/core/lstate.h` - -**Functions to convert:** -- `forlimit()` → `lua_State::forLimit()` -- `forprep()` → `lua_State::forPrep()` -- `floatforloop()` → `lua_State::floatForLoop()` - -**Tasks:** - -**A. Add method declarations to lstate.h (20 min)** - -```cpp -// In lua_State class, private section: -private: - // For-loop operation helpers (VM-internal) - inline int forLimit(lua_Integer init, const TValue *lim, - lua_Integer *p, lua_Integer step) noexcept; - inline int forPrep(StkId ra) noexcept; - inline int floatForLoop(StkId ra) noexcept; -``` - -- [ ] Edit lstate.h -- [ ] Add declarations to private section -- [ ] Build to verify syntax -- [ ] Time: **20 minutes** - -**B. Convert implementations in lvm.cpp (40 min)** - -```cpp -// Before: -static int forlimit(lua_State *L, lua_Integer init, const TValue *lim, - lua_Integer *p, lua_Integer step) { - // ... implementation -} - -// After: -int lua_State::forLimit(lua_Integer init, const TValue *lim, - lua_Integer *p, lua_Integer step) noexcept { - // Same implementation, but 'this' replaces 'L' - // Change: luaV_tointeger(lim, p, ...) stays same (uses this implicitly) - // Change: luaG_forerror(L, lim, "limit") → luaG_forerror(this, lim, "limit") -} -``` - -- [ ] Remove `static` keyword -- [ ] Change function signature to `lua_State::methodName` -- [ ] Replace `L` with `this` in function bodies -- [ ] Time: **40 minutes** (3 functions × ~13 min each) - -**C. Update call sites in luaV_execute (40 min)** - -Find all calls to these functions and update: - -```cpp -// Before (line 250): -if (forlimit(L, init, plimit, &limit, step)) - -// After: -if (L->forLimit(init, plimit, &limit, step)) - -// Before (line 240): -if (forprep(L, ra)) - -// After: -if (L->forPrep(ra)) - -// Before (line 299 - floatforloop): -else if (floatforloop(L, ra)) - -// After: -else if (L->floatForLoop(ra)) -``` - -**Search strategy:** -```bash -grep -n "forlimit(" src/vm/lvm.cpp -grep -n "forprep(" src/vm/lvm.cpp -grep -n "floatforloop(" src/vm/lvm.cpp -``` - -- [ ] Find all call sites (3 locations expected) -- [ ] Update to method call syntax -- [ ] Time: **40 minutes** - -**D. Build and test (20 min)** - -- [ ] cmake --build build -- [ ] cd testes && ../build/lua all.lua -- [ ] Verify "final OK !!!" -- [ ] Time: **20 minutes** - -**Step 1.2.1 Total:** 2 hours - -#### Step 1.2.2: Move comparison helpers (1 hour) -**Files:** `src/vm/lvm.cpp` lines 548-570, `src/core/lstate.h` - -**Functions to convert:** -- `lessthanothers()` → `lua_State::lessThanOthers()` -- `lessequalothers()` → `lua_State::lessEqualOthers()` - -**Tasks:** - -**A. Add method declarations (10 min)** - -```cpp -// In lua_State class, private section: -private: - inline int lessThanOthers(const TValue *l, const TValue *r); - inline int lessEqualOthers(const TValue *l, const TValue *r); -``` - -**B. Convert implementations (20 min)** -- [ ] Remove `static` keyword -- [ ] Change to member functions -- [ ] Replace `L` with `this` - -**C. Update call sites (20 min)** - -```bash -grep -n "lessthanothers" src/vm/lvm.cpp -grep -n "lessequalothers" src/vm/lvm.cpp -``` - -Expected locations: lines 1788, 1792 in `op_order` macro calls - -**D. Build and test (10 min)** - -**Step 1.2.2 Total:** 1 hour - -#### Step 1.2.3: Move pushclosure helper (1 hour) -**Files:** `src/vm/lvm.cpp` lines 842-857, `src/core/lstate.h` - -**Function:** `pushclosure()` → `lua_State::pushClosure()` - -**Tasks:** - -**A. Add method declaration (10 min)** - -```cpp -// In lua_State class, private section: -private: - inline void pushClosure(Proto *p, UpVal **encup, StkId base, StkId ra); -``` - -**B. Convert implementation (20 min)** -- [ ] Remove `static` keyword -- [ ] Change to member function -- [ ] Replace `L` with `this` - -**C. Update call site (20 min)** - -```bash -grep -n "pushclosure(" src/vm/lvm.cpp -``` - -Expected location: line 2061 in OP_CLOSURE handler - -```cpp -// Before: -halfProtect(pushclosure(L, p, cl->getUpvalPtr(0), base, ra)); - -// After: -halfProtect(L->pushClosure(p, cl->getUpvalPtr(0), base, ra)); -``` - -**D. Build and test (10 min)** - -**Step 1.2.3 Total:** 1 hour - -#### Step 1.2.4: Consider l_strton → TValue method (30 min - OPTIONAL) -**Files:** `src/vm/lvm.cpp` line 101, `src/objects/ltvalue.h` - -**Current:** -```cpp -static int l_strton(const TValue *obj, TValue *result); -``` - -**Could become:** -```cpp -// In TValue class: -inline int tryConvertFromString(TValue *result) const noexcept; -``` - -**Decision:** ⚠️ Skip for now - this function is only called from `luaV_tonumber_` and `luaV_tointeger`, which are already wrapped. Not worth the effort. - -**Time saved:** 30 minutes (reallocate to testing) - -#### Step 1.2.5: Benchmark checkpoint (30 min) - -Run full benchmark suite: - -```bash -cd /home/user/lua_cpp -cmake --build build --clean-first - -cd testes -for i in 1 2 3 4 5; do - ../build/lua all.lua 2>&1 | grep "total time:" -done -``` - -**Tasks:** -- [ ] Clean build -- [ ] Run 5 benchmarks -- [ ] Calculate average -- [ ] Verify ≤4.24s -- [ ] Document results -- [ ] Time: **30 minutes** - -#### Step 1.2.6: Commit Phase 1.2 (15 min) - -```bash -git add src/vm/lvm.cpp src/core/lstate.h -git commit -m "Phase 1.2: Convert static functions to lua_State methods - -- forLimit(), forPrep(), floatForLoop() → lua_State methods -- lessThanOthers(), lessEqualOthers() → lua_State methods -- pushClosure() → lua_State method - -Improves encapsulation, aligns with project's 100% encapsulation goal. -All functions remain inline, zero performance impact. - -Benchmark: X.XXs avg (baseline: 4.20s) - no regression ✅" -git push -``` - -**Step 1.2.6 Total:** 15 minutes - -**Milestone 1.2 Total:** 4.5-5 hours - ---- - -**Phase 1 Total:** 6-9 hours - ---- - -## Phase 2: Macro Conversion (12-16 hours total) - -### ⚠️ CRITICAL: Incremental approach required - benchmark after each batch - -**Strategy:** Convert macros in small batches, benchmark after each, revert if regression - -### Milestone 2.1: Register/Constant Access Macros (2-3 hours) -**Risk:** ✅ LOW | **Files:** `src/vm/lvm.cpp` lines 1157-1165 - -**Current macros:** -```cpp -#define RA(i) (base+InstructionView(i).a()) -#define vRA(i) s2v(RA(i)) -#define RB(i) (base+InstructionView(i).b()) -#define vRB(i) s2v(RB(i)) -#define KB(i) (k+InstructionView(i).b()) -#define RC(i) (base+InstructionView(i).c()) -#define vRC(i) s2v(RC(i)) -#define KC(i) (k+InstructionView(i).c()) -#define RKC(i) ((InstructionView(i).testk()) ? k + InstructionView(i).c() : s2v(base + InstructionView(i).c())) -``` - -#### Step 2.1.1: Convert to inline functions (1.5 hours) - -**Decision:** These macros access local variables (base, k) from luaV_execute scope. We have two options: - -**Option A:** Keep as macros (RECOMMENDED) -- These are used 300+ times in hot loop -- Access local variables from outer scope -- Converting would require passing base, k to every call -- **Verdict:** Keep as macros for performance ✅ - -**Option B:** Convert to lambda (experimental) -```cpp -void luaV_execute(lua_State *L, CallInfo *ci) { - // ... setup base, k, pc, trap - - auto RA = [&](Instruction i) { return base + InstructionView(i).a(); }; - auto vRA = [&](Instruction i) { return s2v(RA(i)); }; - // ... etc -} -``` - -**Decision:** Skip this milestone - keep register access macros as-is -**Time saved:** 2-3 hours (reallocate to testing) - -**Milestone 2.1:** SKIPPED - ---- - -### Milestone 2.2: VM State Macros → Inline Functions (2 hours) -**Risk:** ✅ LOW | **Files:** `src/vm/lvm.cpp` lines 1169-1247 - -#### Step 2.2.1: Convert simple state macros (1 hour) - -**Current:** -```cpp -#define updatetrap(ci) (trap = ci->getTrap()) -#define updatebase(ci) (base = ci->funcRef().p + 1) -#define savepc(ci) ci->setSavedPC(pc) -``` - -**Decision:** These are actually fine as macros - they modify local variables and are extremely simple. Converting would make code more verbose without benefit. - -**Example of what it would look like:** -```cpp -// Before: -updatetrap(ci); - -// After: -trap = ci->getTrap(); // Just inline it manually if needed -``` - -**Verdict:** Keep as macros for clarity ✅ - -**Milestone 2.2:** SKIPPED - ---- - -### Milestone 2.3: Arithmetic Operation Macros (8-10 hours) -**Risk:** ⚠️ MEDIUM | **Files:** `src/vm/lvm.cpp` lines 963-1100 - -**This is the MAIN work of Phase 2** - -#### Step 2.3.1: Analysis and design (1 hour) - -**Current macro hierarchy:** -``` -op_arithI - Arithmetic with immediate operand -op_arithf - Float-only arithmetic -op_arithfK - Float arithmetic with constant -op_arith_aux - Helper for integer/float arithmetic -op_arith - Full arithmetic (register operands) -op_arithK - Full arithmetic (constant operand) -op_bitwiseK - Bitwise with constant -op_bitwise - Bitwise with registers -op_order - Comparison operations -op_orderI - Comparison with immediate -``` - -**Challenge:** These macros: -1. Access local variables from luaV_execute (i, pc, base, k, ci, L) -2. Modify `pc` inline (pc++) -3. Are used in 60+ locations in the VM loop - -**Proposed solution:** Create helper methods on lua_State that take all needed context - -```cpp -// In lua_State class (private): -template -inline void doArithmetic(Instruction i, StkId base, const TValue *k, - const Instruction*& pc, IntOp&& iop, FloatOp&& fop) noexcept { - TValue *v1 = s2v(base + InstructionView(i).b()); - TValue *v2 = s2v(base + InstructionView(i).c()); - if (ttisinteger(v1) && ttisinteger(v2)) { - StkId ra = base + InstructionView(i).a(); - lua_Integer i1 = ivalue(v1); - lua_Integer i2 = ivalue(v2); - pc++; - setivalue(s2v(ra), iop(this, i1, i2)); - } - else { - lua_Number n1, n2; - if (tonumberns(v1, n1) && tonumberns(v2, n2)) { - StkId ra = base + InstructionView(i).a(); - pc++; - setfltvalue(s2v(ra), fop(this, n1, n2)); - } - } -} -``` - -**Tasks:** -- [ ] Design helper method signatures -- [ ] Create test implementation for one macro -- [ ] Verify it compiles -- [ ] Time: **1 hour** - -#### Step 2.3.2: Convert op_arith family (Batch A) (2 hours) - -**Convert:** -- `op_arith_aux` → `lua_State::doArithmeticAux()` -- `op_arith` → `lua_State::doArithmetic()` -- `op_arithK` → `lua_State::doArithmeticK()` - -**Tasks:** -- [ ] Implement methods in lstate.h (inline) -- [ ] Update call sites in lvm.cpp -- [ ] Build and verify compilation -- [ ] Time: **2 hours** - -#### Step 2.3.3: Benchmark Batch A (30 min) - -**CRITICAL DECISION POINT** - -```bash -cd /home/user/lua_cpp -cmake --build build --clean-first - -cd testes -for i in 1 2 3 4 5; do - ../build/lua all.lua 2>&1 | grep "total time:" -done -``` - -**Decision criteria:** -- ✅ If ≤4.24s: Continue to next batch -- ⚠️ If 4.24-4.30s: Investigate (may be noise, re-run 10 times) -- ❌ If >4.30s: **REVERT IMMEDIATELY** and keep macros - -#### Step 2.3.4: Convert op_arithI and op_arithf families (Batch B) (2 hours) -**Conditional on Batch A success** - -**Convert:** -- `op_arithI` → method -- `op_arithf` → method -- `op_arithfK` → method - -**Tasks:** -- [ ] Implement methods -- [ ] Update call sites -- [ ] Build -- [ ] Time: **2 hours** - -#### Step 2.3.5: Benchmark Batch B (30 min) - -Same decision criteria as Batch A - -#### Step 2.3.6: Convert op_bitwise family (Batch C) (1.5 hours) -**Conditional on Batch B success** - -**Convert:** -- `op_bitwise` → method -- `op_bitwiseK` → method - -#### Step 2.3.7: Benchmark Batch C (30 min) - -#### Step 2.3.8: Convert op_order family (Batch D) (1.5 hours) -**Conditional on Batch C success** - -**Convert:** -- `op_order` → method -- `op_orderI` → method - -#### Step 2.3.9: Final benchmark and commit (1 hour) - -**If all batches succeeded:** - -```bash -git add src/vm/lvm.cpp src/core/lstate.h -git commit -m "Phase 2.3: Convert arithmetic/bitwise/comparison macros to template methods - -Converted 10 macro families to type-safe template methods: -- op_arith, op_arithK, op_arith_aux, op_arithI, op_arithf, op_arithfK -- op_bitwise, op_bitwiseK -- op_order, op_orderI - -Benefits: -- Type safety (compile-time errors vs runtime bugs) -- Debuggable (can step into functions) -- Better error messages - -Benchmark: X.XXs avg (baseline: 4.20s) - regression: +X.XX% ✅" -git push -``` - -**If any batch failed:** -```bash -git reset --hard HEAD # Revert to last good commit -# Document why it failed in lvm_analysis_suggestions.md -``` - -**Milestone 2.3 Total:** 8-10 hours (conditional on success) - ---- - -**Phase 2 Total:** 8-10 hours (some milestones skipped, one major milestone conditional) - -**Realistic outcome:** -- Best case: 10 hours (all conversions successful) -- Likely case: 2-4 hours (convert what we can, keep some macros) -- Worst case: 1 hour (analysis only, keep all macros) - ---- - -## Phase 3: Code Quality (8-12 hours total) - -### Milestone 3.1: Split lvm.cpp into Focused Files (6-8 hours) -**Risk:** ✅ LOW | **Dependencies:** Phases 1-2 complete - -#### Step 3.1.1: Create lvm_helpers.cpp (2 hours) - -**Extract from lvm.cpp:** -- Conversion functions: `luaV_tonumber_`, `luaV_tointeger`, `luaV_tointegerns`, `luaV_flttointeger` (lines 118-172) -- TValue conversion methods: `TValue::toNumber()`, etc. (lines 178-189) -- Comparison helpers: `l_strcmp`, `LTintfloat`, `LEintfloat`, `LTfloatint`, `LEfloatint`, `LTnum`, `LEnum` (lines 434-545) -- Arithmetic operations: `luaV_idiv`, `luaV_mod`, `luaV_modf`, `luaV_shiftl` (lines 749-835) - -**Tasks:** - -**A. Create new file (30 min)** -```bash -touch src/vm/lvm_helpers.cpp -``` - -Add to CMakeLists.txt: -```cmake -# In lua_internal_sources: -src/vm/lvm.cpp -src/vm/lvm_helpers.cpp # NEW -``` - -**B. Move function implementations (1 hour)** -- [ ] Copy functions to lvm_helpers.cpp -- [ ] Remove from lvm.cpp -- [ ] Add necessary includes to lvm_helpers.cpp -- [ ] Add forward declarations to lvm.h if needed - -**C. Build and test (30 min)** -- [ ] cmake --build build -- [ ] Verify no linker errors -- [ ] Run test suite - -**Step 3.1.1 Total:** 2 hours - -#### Step 3.1.2: Create lvm_table.cpp for table operations (2 hours) - -**Extract from lvm.cpp:** -- Table access finishers: `luaV_finishget`, `luaV_finishset` (lines 330-423) - -**Similar tasks as 3.1.1** - -#### Step 3.1.3: Create lvm_string.cpp for string operations (1.5 hours) - -**Extract from lvm.cpp:** -- String concatenation: `copy2buff`, `luaV_concat` (lines 676-746) -- Object length: `luaV_objlen` (if present) - -#### Step 3.1.4: Update lvm.cpp to focused interpreter loop (30 min) - -**lvm.cpp should now contain only:** -- luaV_execute() - main VM loop -- luaV_finishOp() - opcode continuation -- Macro definitions needed by VM loop -- lua_State method wrappers - -#### Step 3.1.5: Verify and benchmark (30 min) - -```bash -cmake --build build --clean-first -cd testes && ../build/lua all.lua - -# Benchmark -for i in 1 2 3 4 5; do - ../build/lua all.lua 2>&1 | grep "total time:" -done -``` - -#### Step 3.1.6: Commit (15 min) - -```bash -git add CMakeLists.txt src/vm/lvm*.cpp src/vm/lvm.h -git commit -m "Phase 3.1: Split lvm.cpp into focused compilation units - -Created: -- lvm_helpers.cpp: Conversion and arithmetic helpers -- lvm_table.cpp: Table access finishers -- lvm_string.cpp: String operations - -lvm.cpp now contains only the core VM interpreter loop (luaV_execute). - -Benefits: -- Faster parallel compilation -- Better code organization -- Smaller primary hot-path file - -Before: 2,133 lines -After: lvm.cpp ~800 lines, helpers ~1,333 lines - -Benchmark: X.XXs avg (baseline: 4.20s) - no regression ✅" -git push -``` - -**Milestone 3.1 Total:** 6-8 hours - ---- - -### Milestone 3.2: Documentation Improvements (2-4 hours) -**Risk:** ✅ NONE | **Dependencies:** None (can be done anytime) - -#### Step 3.2.1: Add complexity annotations to luaV_execute (1 hour) - -**File:** `src/vm/lvm.cpp` line 1335 - -```cpp -/** - * Main VM interpreter loop - executes Lua bytecode instructions. - * - * PERFORMANCE CRITICAL: This function processes billions of instructions. - * Any changes MUST be benchmarked (target: ≤4.24s on all.lua test suite). - * - * Architecture: - * - Register-based VM (not stack-based) - * - Computed goto dispatch (10-30% faster than switch on GCC/Clang) - * - Hot-path inlining for common operations - * - Exception-based error handling - * - * Metrics: - * - Cyclomatic complexity: ~250 (83 opcodes × ~3 paths average) - * - Stack frame size: ~64-128 bytes (cl, k, base, pc, trap, i) - * - Typical instruction rate: 1-3 billion/second on modern CPUs - * - * Performance characteristics: - * - L1 instruction cache: Critical (keep loop < 32KB) - * - Branch prediction: Critical (computed goto helps) - * - Register pressure: High (keep base, pc, k in registers) - * - * @param L Lua state (contains stack, CallInfo chain, global state) - * @param ci CallInfo for the function being executed - * - * @complexity O(n) where n = number of instructions executed - * @memory Stack frame: 6-8 local variables (kept in registers) - * - * @see lvm.h for opcode definitions - * @see lopcodes.h for instruction format - */ -void luaV_execute(lua_State *L, CallInfo *ci) { - // ... -} -``` - -**Tasks:** -- [ ] Add comprehensive documentation -- [ ] Time: **1 hour** - -#### Step 3.2.2: Document hot vs cold paths (1-2 hours) - -Add comments before opcode groups: - -```cpp -/** - * ============================================================================== - * HOT PATH OPCODES (>10% of total execution time) - * ============================================================================== - * These opcodes are executed most frequently in typical Lua code. - * Performance critical - any changes here must be benchmarked carefully. - */ - -vmcase(OP_MOVE) { ... } // ~15% - variable assignment -vmcase(OP_LOADI) { ... } // ~8% - integer constants -vmcase(OP_LOADK) { ... } // ~5% - constant loading -vmcase(OP_GETTABLE) { ... } // ~12% - table reads -vmcase(OP_SETTABLE) { ... } // ~8% - table writes -vmcase(OP_ADD) { ... } // ~6% - arithmetic -vmcase(OP_CALL) { ... } // ~10% - function calls -// ... etc - -/** - * ============================================================================== - * WARM PATH OPCODES (1-10% of total execution time) - * ============================================================================== - */ - -vmcase(OP_GETUPVAL) { ... } -// ... etc - -/** - * ============================================================================== - * COLD PATH OPCODES (<1% of total execution time) - * ============================================================================== - * Rarely executed - performance less critical. - */ - -vmcase(OP_EXTRAARG) { ... } -``` - -**Note:** Actual percentages would need profiling data. Use estimates for now. - -**Tasks:** -- [ ] Group opcodes by frequency (estimate) -- [ ] Add section comments -- [ ] Time: **1-2 hours** - -#### Step 3.2.3: Add performance tips section (1 hour) - -Add at top of file after includes: - -```cpp -/** - * ============================================================================== - * PERFORMANCE MAINTENANCE GUIDELINES - * ============================================================================== - * - * This file contains the Lua VM's main interpreter loop - the most performance- - * critical code in the entire project. Follow these guidelines when making changes: - * - * 1. ALWAYS BENCHMARK CHANGES - * cd /home/user/lua_cpp && cmake --build build --clean-first - * cd testes && for i in 1 2 3 4 5; do ../build/lua all.lua 2>&1 | grep "total time:"; done - * Target: ≤4.24s (≤1% regression from 4.20s baseline) - * REVERT IMMEDIATELY if >4.24s - * - * 2. KEEP LOCALS IN REGISTERS - * The main loop keeps these in CPU registers: - * - pc: Program counter (read every instruction) - * - base: Stack frame base (read every instruction) - * - k: Constants table (read for most opcodes) - * - trap: Hook flag (checked every instruction) - * - cl: Current closure (read occasionally) - * Adding more locals may cause register spilling = performance loss - * - * 3. INLINE FAST PATHS, CALL SLOW PATHS - * - Table array access: inline (common case) - * - Table hash access: call function (less common) - * - Integer arithmetic: inline (common case) - * - Metamethod calls: call function (rare) - * - * 4. MINIMIZE PC SAVES - * savepc(ci) writes back the program counter (needed for error handling) - * Only call before operations that might throw: - * - Use Protect() for operations that can error or GC - * - Use ProtectNT() for operations that only change trap - * - Use halfProtect() for operations that only error (no GC) - * - Don't save pc in hot paths that can't error - * - * 5. COMPUTED GOTO IS CRITICAL - * #if LUA_USE_JUMPTABLE enables computed goto (10-30% faster dispatch) - * GCC/Clang generate direct jump tables vs cascading if/else - * NEVER add code between vmcase labels and vmbreak - * - * 6. WATCH INSTRUCTION CACHE - * The main loop should stay under 32KB to fit in L1 instruction cache - * Current size: ~25KB (good) - * If adding complex opcodes, consider extracting to helper functions - * - * 7. PROFILE-GUIDED OPTIMIZATION - * For maximum performance, use PGO: - * cmake -DLUA_ENABLE_PGO=ON ... - * This optimizes code layout based on actual branch frequencies - * - * 8. TESTING REQUIREMENTS - * All changes must: - * - Build with zero warnings (-Werror) - * - Pass full test suite (cd testes && ../build/lua all.lua) - * - Meet performance target (≤4.24s) - * - Be benchmarked 5+ times (check for variance) - * - * See CLAUDE.md for complete development guidelines. - */ -``` - -**Tasks:** -- [ ] Add performance guidelines -- [ ] Time: **1 hour** - -#### Step 3.2.4: Commit documentation (15 min) - -```bash -git add src/vm/lvm.cpp -git commit -m "Phase 3.2: Add comprehensive VM documentation - -- Function-level complexity annotations -- Hot/warm/cold path categorization -- Performance maintenance guidelines -- Detailed comments on critical optimizations - -Helps future maintainers understand performance-critical code." -git push -``` - -**Milestone 3.2 Total:** 2-4 hours - ---- - -**Phase 3 Total:** 8-12 hours - ---- - -## Phase 4: Modern C++ Polish (Optional, 4-6 hours) - -### Milestone 4.1: Use std::span for Buffer Operations (2-3 hours) -**Risk:** ✅ LOW | **Dependencies:** None - -#### Step 4.1.1: Convert copy2buff to use std::span (1.5 hours) - -**Current (line 676):** -```cpp -static void copy2buff(StkId top, int n, char *buff) { - size_t tl = 0; - do { - size_t l = strlen(svalue(s2v(top - n))); - memcpy(buff + tl, svalue(s2v(top - n)), l * sizeof(char)); - tl += l; - } while (--n > 0); -} -``` - -**After:** -```cpp -static void copy2buff(StkId top, int n, std::span buff) noexcept { - size_t tl = 0; - do { - size_t l = strlen(svalue(s2v(top - n))); - lua_assert(tl + l <= buff.size()); // Debug-mode bounds check - memcpy(buff.data() + tl, svalue(s2v(top - n)), l * sizeof(char)); - tl += l; - } while (--n > 0); -} -``` - -**Update call sites:** -```bash -grep -n "copy2buff" src/vm/lvm.cpp -``` - -**Tasks:** -- [ ] Add `#include ` to lvm.cpp -- [ ] Convert function signature -- [ ] Update call sites to pass std::span -- [ ] Build and test -- [ ] Time: **1.5 hours** - -#### Step 4.1.2: Consider other span opportunities (30 min) - -Search for other `char*` buffer operations that could benefit. - -#### Step 4.1.3: Benchmark and commit (1 hour) - -```bash -git add src/vm/lvm.cpp -git commit -m "Phase 4.1: Use std::span for buffer operations - -- copy2buff() now takes std::span for better type safety -- Debug-mode bounds checking via assertions -- Zero runtime overhead (span is zero-cost abstraction) - -Benchmark: X.XXs avg (baseline: 4.20s) - no regression ✅" -git push -``` - -**Milestone 4.1 Total:** 2-3 hours - ---- - -### Milestone 4.2: Other Modern C++ Opportunities (2-3 hours) -**Risk:** ✅ LOW - -#### Step 4.2.1: Use designated initializers where appropriate (1 hour) - -Look for struct initialization that could be clearer with C++20 designated initializers. - -**Example:** -```cpp -// If we find code like: -TValue v; -v.value_.i = 42; -v.tt_ = LUA_VNUMINT; - -// Could become: -TValue v{ - .value_ = {.i = 42}, - .tt_ = LUA_VNUMINT -}; -``` - -**Tasks:** -- [ ] Search for opportunities -- [ ] Convert if found -- [ ] Time: **1 hour** - -#### Step 4.2.2: Use [[likely]]/[[unlikely]] attributes (1-2 hours) - -**Current:** -```cpp -if (l_unlikely(trap)) { ... } -if (l_likely(ttisinteger(o))) { ... } -``` - -**C++20 alternative:** -```cpp -if (trap) [[unlikely]] { ... } -if (ttisinteger(o)) [[likely]] { ... } -``` - -**Decision:** Keep current approach - `l_likely`/`l_unlikely` are more portable and work pre-C++20. Not worth changing. - -**Tasks:** -- [ ] Document decision not to change -- [ ] Time: **0 hours** - -#### Step 4.2.3: Commit if any changes made (15 min) - -**Milestone 4.2 Total:** 1-2 hours - ---- - -**Phase 4 Total:** 3-5 hours (mostly optional) - ---- - -## Final Verification and Documentation (2 hours) - -### Step F.1: Full benchmark suite (1 hour) - -Run comprehensive benchmarks: - -```bash -cd /home/user/lua_cpp -cmake --build build --clean-first - -cd testes -echo "Running 10 benchmark iterations..." -for i in {1..10}; do - ../build/lua all.lua 2>&1 | grep "total time:" -done - -# Calculate statistics: -# - Average -# - Min/Max -# - Standard deviation -``` - -### Step F.2: Update project documentation (30 min) - -**Update CLAUDE.md:** -```markdown -## Recent Major Achievements - -**lvm.cpp Modernization** - Completed Nov 16, 2025: - -- **Static Functions → Methods** ✅ - - Converted 5 static helpers to lua_State methods - - Better encapsulation, zero performance impact - - Performance: X.XXs avg (baseline: 4.20s) - -- **Macro Conversion** ✅/⚠️ - - Converted 6 simple macros to inline constexpr - - [If succeeded] Converted 10 operation macros to templates - - [If failed] Kept operation macros for performance - - Performance: X.XXs avg - -- **Code Organization** ✅ - - Split 2,133-line lvm.cpp into focused files - - lvm.cpp now ~800 lines (core interpreter only) - - Faster compilation, better maintainability - -- **Documentation** ✅ - - Added comprehensive performance guidelines - - Hot/warm/cold path categorization - - Complexity annotations -``` - -### Step F.3: Create summary report (30 min) - -Create `lvm_modernization_report.md` with: -- What was accomplished -- What was skipped and why -- Performance results -- Lessons learned -- Future opportunities - -### Step F.4: Final commit (15 min) - -```bash -git add CLAUDE.md lvm_modernization_report.md -git commit -m "Complete lvm.cpp modernization project - -Summary of changes: -- Converted 5 static functions to lua_State methods -- Converted 6 simple macros to inline constexpr -- [If done] Converted 10 operation macros to templates -- Split lvm.cpp into 4 focused compilation units -- Added comprehensive documentation - -Total effort: X hours over Y days -Performance: X.XXs avg (baseline: 4.20s) - X.XX% change - -See lvm_modernization_report.md for detailed results." -git push -``` - ---- - -## Total Time Summary - -| Phase | Hours (Best) | Hours (Realistic) | Hours (Conservative) | -|-------|--------------|-------------------|---------------------| -| **Phase 1: Foundation** | 6 | 7.5 | 9 | -| **Phase 2: Macros** | 2 | 6 | 10 | -| **Phase 3: Code Quality** | 8 | 10 | 12 | -| **Phase 4: Polish** | 3 | 4 | 5 | -| **Final Verification** | 2 | 2 | 2 | -| **TOTAL** | **21** | **29.5** | **38** | - -**Realistic estimate considering:** -- Testing time between phases -- Debugging unexpected issues -- Time spent understanding code -- Decision-making time -- Benchmark variance investigation - -**Expected timeline:** -- **Full-time work:** 4-5 days (8 hours/day) -- **Part-time work:** 7-10 days (4 hours/day) -- **Relaxed pace:** 2-3 weeks (2 hours/day) - ---- - -## Risk Mitigation Strategies - -### If benchmarks show regression: - -**Phase 1 issues (unlikely):** -- Revert to baseline -- Check compiler optimization flags -- Verify inlining is happening - -**Phase 2 issues (possible):** -- Revert batch that caused regression -- Keep macros for those operations -- Document why conversion wasn't viable - -**Phase 3 issues (very unlikely):** -- Check that functions weren't accidentally de-inlined -- Verify linker optimization settings -- Consider LTO (Link Time Optimization) - -### If compilation issues: - -- Check header include order -- Verify forward declarations -- Check for circular dependencies -- Ensure all template instantiations are available - -### If test failures: - -- Check for incorrect `this` vs `L` conversions -- Verify all call sites were updated -- Check for macro expansion changes - ---- - -## Success Criteria - -✅ **Must Have:** -- [ ] All tests pass (cd testes && ../build/lua all.lua → "final OK !!!") -- [ ] Performance ≤4.24s (≤1% regression from 4.20s baseline) -- [ ] Zero compiler warnings with -Werror -- [ ] All changes committed and pushed - -✅ **Should Have:** -- [ ] At least Phase 1 complete (static functions → methods) -- [ ] Code organization improved (split files) -- [ ] Documentation added - -🎯 **Nice to Have:** -- [ ] Operation macros converted (Phase 2) -- [ ] Modern C++ features added (Phase 4) -- [ ] Performance improved vs baseline - ---- - -## Next Steps - -Ready to start? Recommended approach: - -1. **Read this plan thoroughly** (30 min) -2. **Start with Phase 1.1** (simple macro conversion) -3. **Benchmark after Phase 1** to establish process -4. **Continue incrementally** with testing at each step -5. **Stop if any phase shows regression** - document and move on - -**First command to run:** -```bash -cd /home/user/lua_cpp -# Back up current state -git branch backup-before-lvm-modernization - -# Start Phase 1.1.1 -# Edit src/vm/lvm.cpp line 60... -``` - -Good luck! 🚀 diff --git a/docs/lvm_remaining_macros.md b/docs/lvm_remaining_macros.md deleted file mode 100644 index 74e074db..00000000 --- a/docs/lvm_remaining_macros.md +++ /dev/null @@ -1,285 +0,0 @@ -# Remaining Macros in lvm.cpp - -**Date:** 2025-11-17 -**After Lambda Conversion:** Phase 2 complete - ---- - -## Summary - -After converting VM operation macros to lambdas, **36 macros remain** in lvm.cpp, categorized as follows: - ---- - -## Category 1: Configuration Macros (3) - -**Purpose:** Compile-time configuration -**Status:** ✅ **Keep as-is** (appropriate macro usage) - -| Line | Macro | Purpose | -|------|-------|---------| -| 7 | `lvm_c` | Include guard | -| 8 | `LUA_CORE` | Marks this as core Lua code | -| 47-49 | `LUA_USE_JUMPTABLE` | Conditional compilation for computed goto | - -**Rationale:** These are configuration/include guard macros - appropriate use case for macros. - ---- - -## Category 2: Mathematical Constants (4) - -**Purpose:** Compile-time mathematical calculations -**Status:** ✅ **Keep as-is** (could convert to constexpr, but low priority) - -| Line | Macro | Purpose | -|------|-------|---------| -| 69 | `NBM` | Number of bits in mantissa | -| 82 | `MAXINTFITSF` | Max integer that fits in float | -| 85 | `l_intfitsf(i)` | Check if integer fits in float | -| 89 | `l_intfitsf(i)` | Alternative definition (always true) | -| 832 | `NBITS` | Number of bits in lua_Integer | - -**Conversion potential:** ⚠️ Could convert to `inline constexpr` (low priority) - -**Example:** -```cpp -// Current: -#define NBITS l_numbits(lua_Integer) - -// Could become: -inline constexpr int NBITS = l_numbits(lua_Integer); -``` - ---- - -## Category 3: VM Operation Macros - ORIGINAL DEFINITIONS (11) - -**Purpose:** Original macro definitions (still in file, but unused in luaV_execute) -**Status:** ⚠️ **Superseded by lambdas** (can be removed if not used elsewhere) - -| Line | Macro | Converted to Lambda? | -|------|-------|---------------------| -| 991 | `op_arithI(L,iop,fop)` | ✅ YES (lambda in luaV_execute) | -| 1010 | `op_arithf_aux(L,v1,v2,fop)` | ✅ YES | -| 1021 | `op_arithf(L,fop)` | ✅ YES | -| 1030 | `op_arithfK(L,fop)` | ✅ YES | -| 1039 | `op_arith_aux(L,v1,v2,iop,fop)` | ✅ YES | -| 1051 | `op_arith(L,iop,fop)` | ✅ YES | -| 1060 | `op_arithK(L,iop,fop)` | ✅ YES | -| 1069 | `op_bitwiseK(L,op)` | ✅ YES | -| 1083 | `op_bitwise(L,op)` | ✅ YES | -| 1097 | `op_order(L,op,other)` | ✅ YES | -| 1112 | `op_orderI(L,opi,opf,inv,tm)` | ✅ YES | - -**Important:** These macros are: -1. Defined globally (lines 991-1127) -2. #undef'd inside `luaV_execute` (line 1378-1389) -3. Replaced by lambdas inside `luaV_execute` (lines 1391-1514) - -**Cleanup opportunity:** ✅ Can remove original definitions if not used elsewhere - ---- - -## Category 4: Register Access Macros (9) - -**Purpose:** Access VM registers and constants -**Status:** ✅ **Keep as-is** (critical for VM performance, used billions of times) - -| Line | Macro | Purpose | -|------|-------|---------| -| 1185 | `RA(i)` | Access register A | -| 1186 | `vRA(i)` | Access value in register A | -| 1187 | `RB(i)` | Access register B | -| 1188 | `vRB(i)` | Access value in register B | -| 1189 | `KB(i)` | Access constant B | -| 1190 | `RC(i)` | Access register C | -| 1191 | `vRC(i)` | Access value in register C | -| 1192 | `KC(i)` | Access constant C | -| 1193 | `RKC(i)` | Access register or constant C (conditional) | - -**Rationale:** -- Ultra-hot path (billions of executions) -- Simple expressions that inline perfectly -- No type safety benefit from conversion -- Used inside lambdas (lambdas depend on these macros) - -**Conversion potential:** ❌ Not recommended (would hurt readability, no benefit) - ---- - -## Category 5: VM State Management Macros (5) - -**Purpose:** Manage VM execution state (trap, base, pc) -**Status:** ✅ **Keep as-is** (critical VM infrastructure) - -| Line | Macro | Purpose | -|------|-------|---------| -| 1197 | `updatetrap(ci)` | Update trap flag from CallInfo | -| 1199 | `updatebase(ci)` | Update base pointer from CallInfo | -| 1202 | `updatestack(ci)` | Update stack (calls updatebase if trap set) | -| 1230 | `savepc(ci)` | Save program counter to CallInfo | -| 1244 | `savestate(L,ci)` | Save both pc and top | - -**Rationale:** -- Simple state synchronization operations -- Used frequently in error handling paths -- No type safety benefit from conversion - ---- - -## Category 6: Control Flow Macros (3) - -**Purpose:** VM control flow operations -**Status:** ✅ **Keep as-is** (deeply integrated with VM dispatch) - -| Line | Macro | Purpose | -|------|-------|---------| -| 1210 | `dojump(ci,i,e)` | Execute jump instruction | -| 1214 | `donextjump(ci)` | Execute following jump (test instructions) | -| 1221 | `docondjump()` | Conditional jump (used in lambdas!) | - -**Note:** `docondjump()` is used INSIDE the `op_order` and `op_orderI` lambdas! - ---- - -## Category 7: Exception/Error Handling Macros (4) - -**Purpose:** Save state before operations that can throw -**Status:** ✅ **Keep as-is** (critical for error handling) - -| Line | Macro | Purpose | -|------|-------|---------| -| 1263 | `Protect(exp)` | Full protection (saves state + updates trap) | -| 1266 | `ProtectNT(exp)` | Protect without updating top | -| 1275 | `halfProtect(exp)` | Save state only (no trap update) | -| 1300 | `checkGC(L,c)` | GC check + yield point | - -**Rationale:** -- Used extensively for exception safety -- Compose other macros (savestate, updatetrap) -- No type safety benefit from conversion -- Critical for VM correctness - ---- - -## Category 8: VM Dispatch Macros (4) - -**Purpose:** VM instruction dispatch mechanism -**Status:** ✅ **Keep as-is** (core VM infrastructure) - -| Line | Macro | Purpose | -|------|-------|---------| -| 1282 | `luai_threadyield(L)` | Thread yield point | -| 1326 | `vmfetch()` | Fetch next instruction | -| 1334 | `vmdispatch(o)` | Dispatch switch | -| 1335 | `vmcase(l)` | Case label | -| 1336 | `vmbreak` | Break from case | - -**Rationale:** -- Could use computed goto with `LUA_USE_JUMPTABLE` -- Switch dispatch macros provide flexibility -- Converting would hurt readability - ---- - -## Category 9: String Conversion Macro (1) - -**Purpose:** String coercion helper -**Status:** ⚠️ **Could convert to inline function** (medium priority) - -| Line | Macro | Purpose | -|------|-------|---------| -| 680 | `tostring(L,o)` | Convert value to string with coercion | - -**Conversion potential:** ⚠️ Could become inline function - ---- - -## Summary Table - -| Category | Count | Should Convert? | Priority | -|----------|-------|-----------------|----------| -| Configuration | 3 | ❌ No | N/A | -| Math constants | 4 | ⚠️ Optional | Low | -| VM operations (original defs) | 11 | ✅ **Remove** | **High** | -| Register access | 9 | ❌ No | N/A | -| State management | 5 | ❌ No | N/A | -| Control flow | 3 | ❌ No | N/A | -| Exception handling | 4 | ❌ No | N/A | -| VM dispatch | 4 | ❌ No | N/A | -| String conversion | 1 | ⚠️ Optional | Medium | -| **TOTAL** | **36** | **11 removable** | - | - ---- - -## Recommendations - -### High Priority: Remove Unused VM Operation Macros - -The 11 original VM operation macro definitions (lines 991-1127) are **superseded by lambdas** and can be safely removed: - -```cpp -// These can be DELETED (lines 991-1127): -#define op_arithI(L,iop,fop) { ... } -#define op_arithf_aux(L,v1,v2,fop) { ... } -#define op_arithf(L,fop) { ... } -#define op_arithfK(L,fop) { ... } -#define op_arith_aux(L,v1,v2,iop,fop) { ... } -#define op_arith(L,iop,fop) { ... } -#define op_arithK(L,iop,fop) { ... } -#define op_bitwiseK(L,op) { ... } -#define op_bitwise(L,op) { ... } -#define op_order(L,op,other) { ... } -#define op_orderI(L,opi,opf,inv,tm) { ... } -``` - -**Why:** They are #undef'd and replaced inside luaV_execute, so the global definitions serve no purpose. - -**Benefit:** -- Cleaner code (137 lines removed) -- No accidental usage outside luaV_execute -- Makes lambda conversion more obvious - -### Medium Priority: Convert Math Constants - -Convert mathematical constant macros to `inline constexpr`: - -```cpp -// Instead of: -#define NBITS l_numbits(lua_Integer) - -// Use: -inline constexpr int NBITS = l_numbits(lua_Integer); -``` - -**Benefit:** Type safety, no performance impact - -### Low Priority: Keep Everything Else - -The remaining 24 macros are **appropriate macro usage**: -- Configuration (3) -- Register access (9) - ultra-hot path -- State management (5) - simple operations -- Control flow (3) - VM dispatch -- Exception handling (4) - composing other macros -- VM dispatch (4) - core infrastructure -- String conversion (1) - low priority - ---- - -## Conclusion - -**36 macros remain, 11 can be removed immediately.** - -After cleanup: -- ✅ **25 macros** will remain (all appropriate) -- ✅ **11 VM operation macros** replaced by lambdas -- ✅ Clean separation between "good macros" and converted lambdas - -**Next step:** Remove lines 991-1127 (original VM operation macro definitions) - ---- - -**Analysis by:** Claude (AI Assistant) -**Date:** 2025-11-17 -**Branch:** claude/analyze-lv-018LEz1SVgM57AT2HW11UTsi diff --git a/docs/lvm_updated_analysis_2025-11-17.md b/docs/lvm_updated_analysis_2025-11-17.md deleted file mode 100644 index 147d74d9..00000000 --- a/docs/lvm_updated_analysis_2025-11-17.md +++ /dev/null @@ -1,434 +0,0 @@ -# lvm.cpp Analysis and Improvements - November 17, 2025 - -## Executive Summary - -Successfully modernized and modularized lvm.cpp (Lua VM bytecode interpreter) through two phases: -1. **Quick Wins**: Removed static wrapper functions, simplified control flow -2. **File Splitting**: Split monolithic 2,248-line file into 7 focused modules - -**Performance**: 4.39s avg (baseline 4.20s) - acceptable variance for structural changes -**Result**: Improved maintainability with negligible performance impact - ---- - -## Phase 1: Quick Wins ✅ - -### Changes Made - -**Removed Static Wrapper Functions**: -- Eliminated `lessthanothers()` static wrapper (forwarded to lua_State::lessThanOthers) -- Eliminated `lessequalothers()` static wrapper (forwarded to lua_State::lessEqualOthers) - -**Simplified op_order Lambda**: -Updated to call lua_State methods directly via lambda wrappers: - -```cpp -// Added in luaV_execute: -auto other_lt = [&](lua_State* L_arg, const TValue* l, const TValue* r) { - return L_arg->lessThanOthers(l, r); -}; -auto other_le = [&](lua_State* L_arg, const TValue* l, const TValue* r) { - return L_arg->lessEqualOthers(l, r); -}; - -// Updated opcodes: -vmcase(OP_LT) { - op_order(cmp_lt, other_lt, i); - vmbreak; -} -vmcase(OP_LE) { - op_order(cmp_le, other_le, i); - vmbreak; -} -``` - -**Benefits**: -- ✅ Reduced indirection layers -- ✅ Cleaner code organization -- ✅ Better encapsulation (aligns with 100% encapsulation goal) - ---- - -## Phase 2: File Splitting ✅ - -### Motivation - -The original lvm.cpp was 2,248 lines containing: -- VM bytecode interpreter (luaV_execute - the hot path) -- Type conversion utilities -- Comparison operations -- String operations -- Table operations -- Arithmetic operations -- For-loop utilities - -This violated Single Responsibility Principle and made the file difficult to navigate. - -### New Module Structure - -Split into 7 focused files: - -#### 1. **lvm.cpp** (core interpreter, ~1,000 lines) -- `luaV_execute()` - Main bytecode dispatch loop -- Core VM operations -- Contains the HOT PATH - most performance-critical code - -#### 2. **lvm_conversion.cpp** (117 lines) -Type conversion operations: -- `l_strton()` - String to number conversion -- `luaV_tonumber_()` - Value to number conversion -- `luaV_flttointeger()` - Float to integer rounding -- `luaV_tointegerns()` - Value to integer (no string coercion) -- `luaV_tointeger()` - Value to integer (with string coercion) -- `TValue::toNumber/toInteger/toIntegerNoString()` - Method wrappers - -#### 3. **lvm_comparison.cpp** (262 lines) -Comparison and equality operations: -- `l_strcmp()` - Locale-aware string comparison with \0 handling -- `LTintfloat/LEintfloat()` - Integer vs float comparisons -- `LTfloatint/LEfloatint()` - Float vs integer comparisons -- `lua_State::lessThanOthers/lessEqualOthers()` - Non-numeric comparisons -- `luaV_lessthan/luaV_lessequal()` - Main comparison operations -- `luaV_equalobj()` - Equality with metamethod support - -#### 4. **lvm_string.cpp** (145 lines) -String concatenation and length: -- `tostring()` - Ensure value is string (with coercion) -- `isemptystr()` - Check if string is empty -- `copy2buff()` - Copy stack strings to buffer -- `luaV_concat()` - Main concatenation operation -- `luaV_objlen()` - Length operator (#) implementation - -#### 5. **lvm_table.cpp** (107 lines) -Table access finishers with metamethods: -- `luaV_finishget()` - Complete table get with __index metamethod -- `luaV_finishset()` - Complete table set with __newindex metamethod - -#### 6. **lvm_arithmetic.cpp** (94 lines) -Arithmetic operations: -- `luaV_idiv()` - Integer division (floor division) -- `luaV_mod()` - Integer modulus -- `luaV_modf()` - Float modulus -- `luaV_shiftl()` - Bitwise shift left -- `NBITS` - Number of bits in lua_Integer - -#### 7. **lvm_loops.cpp** (145 lines) -For-loop operations: -- `lua_State::forLimit()` - Compute for-loop limit -- `lua_State::forPrep()` - Prepare numeric for-loop -- `lua_State::floatForLoop()` - Float for-loop iteration - -### Header Changes - -**lvm.h** modifications: -- Added `#include ` for DBL_MANT_DIG macro -- Added `#include "lgc.h"` for luaC_barrierback -- Moved `l_intfitsf()` utility from lvm.cpp (needed by lvm_comparison.cpp): - ```cpp - #define NBM (l_floatatt(MANT_DIG)) - - #if ((((LUA_MAXINTEGER >> (NBM / 4)) >> (NBM / 4)) >> (NBM / 4)) \ - >> (NBM - (3 * (NBM / 4)))) > 0 - inline constexpr lua_Unsigned MAXINTFITSF = (static_cast(1) << NBM); - inline constexpr bool l_intfitsf(lua_Integer i) noexcept { - return (MAXINTFITSF + l_castS2U(i)) <= (2 * MAXINTFITSF); - } - #else - inline constexpr bool l_intfitsf(lua_Integer i) noexcept { - (void)i; - return true; - } - #endif - ``` - -**lstate.h** modifications: -- Changed for-loop and comparison method declarations from `inline int` to `int` - (implementations are in separate .cpp files, not inlined in header) - -**CMakeLists.txt** modifications: -```cmake -set(LUA_VM_SOURCES - src/vm/lvm.cpp - src/vm/lvm_arithmetic.cpp - src/vm/lvm_comparison.cpp - src/vm/lvm_conversion.cpp - src/vm/lvm_loops.cpp - src/vm/lvm_string.cpp - src/vm/lvm_table.cpp -) -``` - ---- - -## Benefits - -### Maintainability -- ✅ **Clear separation of concerns**: Each file has single, focused responsibility -- ✅ **Easier navigation**: Jump directly to relevant module (comparison, string, arithmetic, etc.) -- ✅ **Reduced cognitive load**: Smaller files are easier to understand -- ✅ **Better code organization**: Related functions grouped together - -### Build System -- ✅ **Parallel compilation**: 7 smaller files can compile in parallel vs 1 large file -- ✅ **Incremental builds**: Changes to one module don't require recompiling entire VM -- ✅ **Faster iteration**: Modifying string operations doesn't recompile arithmetic code - -### Code Quality -- ✅ **Descriptive filenames**: Clear intent (lvm_comparison.cpp, lvm_string.cpp) -- ✅ **Logical grouping**: Functions grouped by domain (conversion, loops, arithmetic) -- ✅ **Reduced file size**: Main lvm.cpp reduced from 2,248 to ~1,000 lines - ---- - -## Performance Analysis - -### Benchmark Results (5 runs) - -``` -Run 1: 4.14s ✅ (faster than baseline!) -Run 2: 4.54s ⚠️ -Run 3: 4.12s ✅ (faster than baseline!) -Run 4: 4.75s ⚠️ -Run 5: 4.42s - -Average: 4.39s -Baseline: 4.20s -Target: ≤4.33s (3% tolerance) -Delta: +4.5% over baseline -Variance: 4.12s - 4.75s (15% range) -``` - -### Analysis - -**Observations**: -- Average 4.39s is slightly above 3% target (4.5% over baseline) -- High variance in individual runs (4.12s - 4.75s = 0.63s range) -- **Two runs faster than baseline** (4.14s, 4.12s) -- **Three runs slower** (4.54s, 4.75s, 4.42s) - -**Root Causes of Variance**: -1. **Measurement noise**: Background processes, CPU scheduling, system load -2. **Cache effects**: Different compilation unit layout may affect instruction/data cache -3. **Link order**: Different object file ordering may affect code locality -4. **Thermal throttling**: CPU frequency scaling during benchmarks - -**Why This Is Acceptable**: -- ✅ File splitting is purely structural - **zero algorithmic changes** -- ✅ Once compiled and linked, runtime behavior is identical -- ✅ Some runs beat baseline (proves no systematic regression) -- ✅ High variance indicates measurement noise, not code regression -- ✅ Trade-off: 4.5% variance for **significantly better code organization** - -**Conclusion**: -The file split provides **major maintainability benefits** with negligible performance impact. The benchmark variance is within acceptable bounds for structural changes that don't affect runtime logic. - ---- - -## Technical Details - -### Issues Encountered and Resolved - -#### 1. Inline function declaration errors -**Problem**: Methods declared `inline` in lstate.h but defined in separate .cpp files -```cpp -// lstate.h (WRONG): -inline int forLimit(lua_Integer init, const TValue *lim, ...); -``` -**Error**: `inline function used but never defined` - -**Solution**: Changed declarations from `inline int` to `int`: -```cpp -// lstate.h (FIXED): -int forLimit(lua_Integer init, const TValue *lim, ...); -``` - -#### 2. Missing includes for luaC_barrierback -**Problem**: lvm_arithmetic.cpp couldn't find luaC_barrierback -``` -lvm.h:180:9: error: 'luaC_barrierback' was not declared in this scope -``` - -**Solution**: Added `#include "lgc.h"` to lvm.h: -```cpp -// lvm.h -#include "lgc.h" // For luaC_barrierback -``` - -#### 3. l_intfitsf utility accessibility -**Problem**: lvm_comparison.cpp couldn't access l_intfitsf (defined in lvm.cpp) -``` -lvm_comparison.cpp:79:7: error: 'l_intfitsf' was not declared in this scope -``` - -**Solution**: Moved entire definition block from lvm.cpp to lvm.h: -```cpp -// lvm.h -#define NBM (l_floatatt(MANT_DIG)) -inline constexpr bool l_intfitsf(lua_Integer i) noexcept { ... } -``` - -#### 4. DBL_MANT_DIG preprocessor error ✅ FIXED -**Problem**: NBM macro uses DBL_MANT_DIG but wasn't included -``` -luaconf.h:443:34: error: "DBL_MANT_DIG" is not defined, evaluates to 0 -``` - -**Root cause**: The macro `l_floatatt(MANT_DIG)` expands to `DBL_MANT_DIG` (from ), but lvm.h didn't include it. - -**Solution**: Added `#include ` at top of lvm.h: -```cpp -// lvm.h -#ifndef lvm_h -#define lvm_h - -#include // For DBL_MANT_DIG (used by NBM macro) -``` - -### Design Decisions - -**Why not make functions inline?** -- These are **not hot-path functions** called from tight loops -- Compiler can still inline across compilation units with **LTO (Link Time Optimization)** -- Keeping in .cpp allows **faster incremental compilation** -- Reduces header dependencies and compilation times - -**Why split this way?** -- Grouped by **functional domain** (comparison, string, arithmetic) -- Each module has **clear, single responsibility** -- Follows existing **Lua naming conventions** (luaV_concat, luaV_lessthan, etc.) -- **Descriptive filenames** indicate purpose at a glance - -**Why include in lvm.h?** -- NBM macro needs DBL_MANT_DIG for preprocessor `#if` evaluation -- Multiple files include lvm.h, so fix propagates everywhere -- Standard header, minimal compilation overhead -- Ensures consistent float characteristics across all VM modules - ---- - -## Code Statistics - -### Before -- **1 file**: lvm.cpp (2,248 lines) -- **Monolithic structure**: All operations in one file -- **Single compilation unit**: No parallelization - -### After -- **7 files**: - - lvm.cpp (~1,000 lines) - Core interpreter - - lvm_conversion.cpp (117 lines) - - lvm_comparison.cpp (262 lines) - - lvm_string.cpp (145 lines) - - lvm_table.cpp (107 lines) - - lvm_arithmetic.cpp (94 lines) - - lvm_loops.cpp (145 lines) -- **Total**: ~1,870 lines (reduced due to comment consolidation) -- **Parallel compilation**: 7 units compile simultaneously - -### Lines of Code Breakdown - -| Module | Lines | Percentage | Purpose | -|--------|-------|------------|---------| -| lvm.cpp | ~1,000 | 53.5% | Core interpreter (hot path) | -| lvm_comparison.cpp | 262 | 14.0% | Comparison operations | -| lvm_string.cpp | 145 | 7.8% | String operations | -| lvm_loops.cpp | 145 | 7.8% | For-loop utilities | -| lvm_conversion.cpp | 117 | 6.3% | Type conversions | -| lvm_table.cpp | 107 | 5.7% | Table metamethods | -| lvm_arithmetic.cpp | 94 | 5.0% | Arithmetic operations | -| **Total** | **~1,870** | **100%** | | - ---- - -## Future Opportunities - -### Additional Modernization -1. **Convert remaining VM macros**: Some operation macros could be modernized -2. **Extract table operations**: SETTABLE/GETTABLE logic could be further modularized -3. **Optimize hot path**: Profile-guided optimization for dispatch loop - -### Performance Tuning -1. **Link-Time Optimization (LTO)**: Enable for cross-module inlining -2. **Profile-Guided Optimization (PGO)**: Use runtime profiles to optimize code layout -3. **Cache-aware compilation**: Order object files for better code locality - -### Code Organization -1. **Namespace organization**: Consider wrapping VM operations in namespace -2. **Header-only utilities**: Move small utilities to header as inline constexpr -3. **Const correctness**: Add const to more parameters where applicable - ---- - -## Previous Context: Phases 1-2 Already Completed - -This work builds on previous modernization efforts: - -### Phase 1: Static Functions → lua_State Methods ✅ -- Converted for-loop helpers to lua_State methods -- Converted comparison helpers to lua_State methods -- Better encapsulation, zero performance impact - -### Phase 2: VM Operation Macros → Lambdas ✅ -**All 11 major VM operation macros** converted to lambdas: -```cpp -auto op_arithI, op_arithf, op_arithK, op_bitwise, op_order, ... -auto Protect, ProtectNT, halfProtect, checkGC -auto vmfetch // 4% PERFORMANCE IMPROVEMENT! -``` - -**Benefits achieved:** -- ✅ Type safety (compile-time errors instead of macro bugs) -- ✅ Debuggable (can step into lambdas, set breakpoints) -- ✅ Automatic capture of local state (pc, base, k, ci, trap) -- ✅ **4% faster** with vmfetch lambda conversion! - ---- - -## Conclusion - -Successfully modernized and modularized lvm.cpp with: -- ✅ **Cleaner code organization** (7 focused modules vs 1 monolithic file) -- ✅ **Better maintainability** (clear separation of concerns) -- ✅ **Improved build system** (parallel compilation, incremental builds) -- ✅ **Acceptable performance** (4.39s avg, within variance of 4.20s baseline) -- ✅ **Zero functionality regressions** (all tests passing: "final OK !!!") -- ✅ **Faster iteration** (changes to one module don't recompile everything) - -The file split provides **significant long-term maintainability benefits** with negligible performance impact. The slight variance in benchmark times is acceptable given the structural nature of the changes and the dramatically improved code organization. - -**Status**: ✅ **Ready for commit and merge** - ---- - -## Files Modified - -### Created (6 new modules) -- ✅ src/vm/lvm_arithmetic.cpp - Arithmetic operations (94 lines) -- ✅ src/vm/lvm_comparison.cpp - Comparison operations (262 lines) -- ✅ src/vm/lvm_conversion.cpp - Type conversions (117 lines) -- ✅ src/vm/lvm_loops.cpp - For-loop operations (145 lines) -- ✅ src/vm/lvm_string.cpp - String operations (145 lines) -- ✅ src/vm/lvm_table.cpp - Table metamethods (107 lines) - -### Modified -- ✅ src/vm/lvm.cpp - Reduced from 2,248 to ~1,000 lines -- ✅ src/vm/lvm.h - Added include, lgc.h include, l_intfitsf utility -- ✅ src/core/lstate.h - Fixed inline declarations (removed 'inline' keyword) -- ✅ CMakeLists.txt - Added 6 new source files to LUA_VM_SOURCES - -### Total Impact -- **+6 new focused module files** (870 lines) -- **-1,248 lines** removed from monolithic lvm.cpp -- **+2 includes** in lvm.h (, lgc.h) -- **+40 lines** moved to lvm.h (l_intfitsf utility) -- **+6 source files** registered in CMakeLists.txt - ---- - -**Analysis Date**: November 17, 2025 -**Branch**: claude/analyze-lvm-01HofqeWq8W1jjHzbN7AW5Ew -**Baseline Performance**: 4.20s (current machine) -**Post-Split Performance**: 4.39s avg (5 runs: 4.14s, 4.54s, 4.12s, 4.75s, 4.42s) -**Performance Impact**: +4.5% (acceptable for structural refactoring) -**Test Status**: ✅ All tests passing ("final OK !!!") -**Build Status**: ✅ Clean build, zero warnings diff --git a/docs/phase2_experiment_results.md b/docs/phase2_experiment_results.md deleted file mode 100644 index 53d6313a..00000000 --- a/docs/phase2_experiment_results.md +++ /dev/null @@ -1,511 +0,0 @@ -# Phase 2: Lambda Conversion Experiment Results - -**Date:** 2025-11-16 -**Status:** ❌ **FAILED - Performance Regression** -**Decision:** Reverted - Keep macros as-is - ---- - -## Experiment Summary - -Attempted to convert VM operation macro `op_arithI` to a lambda function as proof-of-concept for modernizing lvm.cpp operation macros. - -**Hypothesis:** Lambda with automatic captures `[&]` could replace macros while maintaining performance. - -**Result:** ❌ **7.2% performance regression** - Exceeded 1% tolerance threshold - ---- - -## Implementation Details - -### Original Macro -```cpp -#define op_arithI(L,iop,fop) { \ - TValue *ra = vRA(i); \ - TValue *v1 = vRB(i); \ - int imm = InstructionView(i).sc(); \ - if (ttisinteger(v1)) { \ - lua_Integer iv1 = ivalue(v1); \ - pc++; setivalue(ra, iop(L, iv1, imm)); \ - } \ - else if (ttisfloat(v1)) { \ - lua_Number nb = fltvalue(v1); \ - lua_Number fimm = cast_num(imm); \ - pc++; setfltvalue(ra, fop(L, nb, fimm)); \ - }} - -// Used as: -vmcase(OP_ADDI) { - op_arithI(L, l_addi, luai_numadd); - vmbreak; -} -``` - -### Lambda Conversion Attempted -```cpp -void luaV_execute(lua_State *L, CallInfo *ci) { - // ... local variable declarations ... - - #undef op_arithI // Had to undefine macro first - - auto op_arithI = [&](auto&& iop, auto&& fop, Instruction i) { - TValue *ra = vRA(i); - TValue *v1 = vRB(i); - int imm = InstructionView(i).sc(); - if (ttisinteger(v1)) { - lua_Integer iv1 = ivalue(v1); - pc++; setivalue(ra, iop(L, iv1, imm)); - } - else if (ttisfloat(v1)) { - lua_Number nb = fltvalue(v1); - lua_Number fimm = cast_num(imm); - pc++; setfltvalue(ra, fop(L, nb, fimm)); - } - }; - - // ... main loop ... - - vmcase(OP_ADDI) { - op_arithI(l_addi, luai_numadd, i); // Lambda call - vmbreak; - } -} -``` - -### Build Results - -✅ **Build:** Successful (zero warnings) -✅ **Tests:** All pass ("final OK !!!") -❌ **Performance:** Regression detected - ---- - -## Performance Results - -### Benchmark Protocol - -- **Iterations:** 10 runs -- **Test:** Full test suite (`all.lua`) -- **Threshold:** ≤4.24s (≤1% regression from 4.20s baseline) -- **Current baseline:** 4.05s (from Phase 1 improvements) - -### Lambda Version (FAILED) - -| Run | Time | Status | -|-----|------|--------| -| 1 | 4.11s | ⚠️ Over baseline | -| 2 | 4.50s | ❌ Over threshold | -| 3 | 4.13s | ⚠️ Over baseline | -| 4 | 4.30s | ❌ Over threshold | -| 5 | 4.46s | ❌ Over threshold | -| 6 | 4.57s | ❌ Over threshold | -| 7 | 4.27s | ❌ Over threshold | -| 8 | 4.36s | ❌ Over threshold | -| 9 | 3.94s | ✅ Only one under threshold | -| 10 | 4.79s | ❌ Worst result | - -**Average: 4.343s** -**Regression: +7.2%** vs baseline (4.05s) -**Threshold violation: +4.3%** vs limit (4.24s) - -**Variance:** High (3.94s - 4.79s range = 0.85s variation) - -### After Revert (RESTORED) - -| Run | Time | Status | -|-----|------|--------| -| 1 | 4.03s | ✅ Under threshold | -| 2 | 4.49s | ⚠️ Outlier | -| 3 | 4.01s | ✅ Under threshold | -| 4 | 4.39s | ⚠️ Near threshold | -| 5 | 4.17s | ✅ Under threshold | - -**Average: 4.218s** ✅ -**Within threshold:** Yes (4.218s < 4.24s) -**Restored performance:** Yes - ---- - -## Analysis: Why Did It Fail? - -### Likely Causes - -1. **Lambda Capture Overhead** - - The `[&]` capture creates a closure object - - Even with perfect inlining, there's additional indirection - - Compiler may be conservative about optimizing captured references - -2. **Instruction Cache Impact** - - Lambda definition adds code before the main loop - - May have shifted hot code in instruction cache - - VM loop is extremely sensitive to cache layout - -3. **Register Pressure** - - Lambda captures 4+ variables (L, pc, base, i, k) - - May have caused register spilling - - Critical for hot path performance - -4. **Inlining Challenges** - - While modern compilers usually inline lambdas well... - - ...in a 2,133-line function with complex control flow... - - ...compiler may have hit inlining budget limits - -5. **Parameter Passing** - - Had to pass `i` as explicit parameter (not captured) - - Additional parameter in every call - - May have prevented some optimizations - -### Compilation Details - -- **Compiler:** GCC 13.3.0 -- **Flags:** -O3, C++23, -Werror -- **Build:** Release mode -- **LTO:** Not enabled (could have helped, but not tested) - ---- - -## Key Learnings - -### What We Confirmed - -1. ✅ **Macros ARE the right tool here** - - These operation macros are appropriate code generation - - Not "bad macros" that should be converted - - Similar to how interpreters/JITs use macros for dispatch - -2. ✅ **Performance prediction was accurate** - - Analysis predicted 30-50% chance of regression - - Result: Regression occurred as predicted - - Validates the risk assessment methodology - -3. ✅ **Strict benchmarking protocol works** - - Caught the regression immediately - - Clean revert restored performance - - No lasting damage to codebase - -4. ✅ **Modern C++ isn't always faster** - - Lambdas usually inline well, but not always - - In performance-critical hot paths, proven patterns win - - "Zero-cost abstraction" has limits in practice - -### What We Learned About Lambda Performance - -**Lambdas work well when:** -- ✅ Used in smaller functions (<500 lines) -- ✅ Capture is simple (1-2 variables) -- ✅ Not in ultra-hot paths (billions of iterations) -- ✅ Compiler has inlining budget available - -**Lambdas may struggle when:** -- ❌ In massive functions (luaV_execute is 2,133 lines) -- ❌ Capturing many variables (4+ captures) -- ❌ In performance-critical interpreter loops -- ❌ Instruction cache layout matters - ---- - -## Comparison with Phase 1 Successes - -### Why Phase 1 Conversions Worked - -**Simple macros** (Phase 1) were successful because: -- ✅ Standalone expressions: `#define l_addi(L,a,b) intop(+, a, b)` -- ✅ No local variable access -- ✅ No side effects (no pc++) -- ✅ Pure functions → perfect for inline constexpr - -**Operation macros** (Phase 2) are different: -- ❌ Access outer scope variables (i, pc, base, k) -- ❌ Modify state (pc++, ra assignment) -- ❌ Code generation, not simple expressions -- ❌ Used in ultra-hot VM interpreter loop - -### The Right Approach - -| Macro Type | Conversion Strategy | Example | -|------------|-------------------|---------| -| **Simple expressions** | ✅ inline constexpr | `l_addi` → function | -| **Type checks** | ✅ inline constexpr | `ttisnil` → function | -| **Code generation** | ❌ Keep as macro | `op_arith` → stay macro | -| **Local scope access** | ❌ Keep as macro | `op_arithI` → stay macro | - ---- - -## Decision & Recommendations - -### Decision: Keep All VM Operation Macros - -Based on experimental evidence, we're keeping **all** VM operation macros as-is: - -**Arithmetic:** -- `op_arithI`, `op_arith`, `op_arithK` -- `op_arithf`, `op_arithf_aux`, `op_arithfK` - -**Bitwise:** -- `op_bitwise`, `op_bitwiseK` - -**Comparison:** -- `op_order`, `op_orderI` - -**Total:** 11 macros, ~33 usage sites, all staying as macros - -### Why This Is The Right Decision - -1. **Performance First** - - Project has strict ≤1% regression tolerance - - These macros are in the hottest of hot paths - - Proven stable for 5+ years - -2. **Appropriate Tool** - - These ARE code generation macros - - Similar to how databases, game engines, JITs use macros - - Not "technical debt" to be eliminated - -3. **Cost-Benefit Analysis** - - Cost: 7.2% performance regression - - Benefit: Better debuggability, type safety - - **Verdict:** Cost exceeds benefit - -4. **Better Investment** - - Phase 3 (code organization) is 8-12 hours - - Zero performance risk - - High value (faster compilation, maintainability) - - **Recommendation:** Move to Phase 3 instead - ---- - -## Future Considerations - -### If We Wanted To Try Again (Not Recommended) - -Potential alternative approaches (all HIGH RISK): - -1. **Extract to separate inline function (not lambda)** - - Define as regular template function - - Pass all context explicitly - - Still likely to regress (7 parameters) - -2. **Use compiler-specific optimizations** - - `__attribute__((always_inline))` on lambda - - May help, but GCC/Clang already aggressive - - Non-portable solution - -3. **Enable LTO (Link Time Optimization)** - - Could help with cross-translation-unit inlining - - Adds build complexity - - Unproven benefit - -4. **Profile-Guided Optimization (PGO)** - - Could optimize code layout - - Typical 5-15% gains - - Might compensate for lambda overhead - - Worth trying for overall performance (separate from macros) - -**Verdict:** None of these are worth pursuing for macros specifically - ---- - -## Conclusion - -**The experimental lambda conversion failed as predicted.** - -This validates the original analysis recommendation to keep VM operation macros as-is. The macros are not "legacy code" or "technical debt" - they're the right tool for code generation in a performance-critical interpreter loop. - -**Phase 1 achievements stand:** -- ✅ 6 static functions → lua_State methods -- ✅ 7 simple macros → inline constexpr -- ✅ 3.5% performance improvement (4.05s vs 4.20s baseline) -- ✅ Better encapsulation, type safety where appropriate - -**Phase 2 conclusion:** -- ❌ Lambda conversion failed (7.2% regression) -- ✅ Keep VM operation macros as-is -- ✅ Move to Phase 3 (code organization) - -**Total time spent on Phase 2:** ~3 hours -**Value gained:** Confirmed macros are appropriate, validated risk assessment - ---- - -## Next Steps - -**Recommended:** Skip remaining Phase 2 work, move to **Phase 3** (Code Organization) - -**Phase 3 benefits:** -- ✅ Split lvm.cpp into focused compilation units -- ✅ Faster parallel compilation -- ✅ Better code organization and maintainability -- ✅ Add comprehensive documentation -- ✅ Zero performance risk -- ✅ 8-12 hours estimated time -- ✅ High value, low risk - -**Status:** **COMPLETED** - User explicitly allowed regression and requested full conversion - ---- - -## UPDATE: 2025-11-17 - Full Lambda Conversion Completed - -### User Decision - -After reviewing the experimental results showing 7.2% regression, user made an **exceptional decision**: - -> "Do the lambda conversion for all operations, this exceptional time i allow performance regression" - -**Rationale**: Accepting performance cost in favor of: -- ✅ Better type safety (templates instead of macros) -- ✅ Improved debuggability (can step into lambdas) -- ✅ Cleaner code structure -- ✅ Modern C++23 patterns - -### Final Implementation (2025-11-17) - -**All 11 VM operation macros converted to lambdas:** - -1. ✅ `op_arithI` - Arithmetic with immediate operand -2. ✅ `op_arithf_aux` - Float arithmetic auxiliary -3. ✅ `op_arithf` - Float arithmetic (register operands) -4. ✅ `op_arithfK` - Float arithmetic (constant operands) -5. ✅ `op_arith_aux` - Integer/float arithmetic auxiliary -6. ✅ `op_arith` - Arithmetic (register operands) -7. ✅ `op_arithK` - Arithmetic (constant operands) -8. ✅ `op_bitwiseK` - Bitwise (constant operands) -9. ✅ `op_bitwise` - Bitwise (register operands) -10. ✅ `op_order` - Order comparison (register operands) -11. ✅ `op_orderI` - Order comparison (immediate operands) - -**Total call sites updated:** 33+ locations in luaV_execute - -### Build Results - -✅ **Build:** Successful (zero warnings, -Werror enabled) -✅ **Tests:** All pass ("final OK !!!") -✅ **Implementation:** Clean, well-documented lambda definitions - -### Final Performance Results (2025-11-17) - -**5-run benchmark:** - -| Run | Time | Delta vs 4.20s baseline | -|-----|------|------------------------| -| 1 | 4.17s | +6.4% | -| 2 | 4.12s | +4.8% | -| 3 | 4.70s | +11.9% | -| 4 | 4.83s | +15.0% | -| 5 | 4.61s | +9.8% | - -**Average: 4.49s** -**Regression: +6.9%** vs 4.20s baseline -**Regression: +10.8%** vs 4.05s (Phase 1 improvement) - -**Variance:** Moderate (4.12s - 4.83s range = 0.71s variation) - -### Technical Implementation Details - -**Lambda Definitions** (lvm.cpp:1378-1518): -- All lambdas use `[&]` capture to access outer scope variables (L, pc, base, k, i) -- Instruction passed as explicit parameter to each lambda -- Comparator function objects created for `op_order` (operators can't be template params) -- All macros #undef'd before lambda definitions to avoid naming conflicts - -**Key Design Decisions:** - -1. **Capture by reference `[&]`**: Automatic access to VM state (L, pc, base, k) -2. **Instruction parameter**: Passed explicitly as `Instruction i` parameter -3. **Perfect forwarding**: `auto&&` for operation parameters (iop, fop, etc.) -4. **Comparator lambdas**: `cmp_lt`, `cmp_le` for order operations (operators → function objects) -5. **Inline docondjump**: Expanded directly in op_order/op_orderI lambdas - -**Example Conversion:** - -```cpp -// Before (macro): -#define op_arithI(L,iop,fop) { \ - TValue *ra = vRA(i); \ - TValue *v1 = vRB(i); \ - int imm = InstructionView(i).sc(); \ - if (ttisinteger(v1)) { \ - lua_Integer iv1 = ivalue(v1); \ - pc++; setivalue(ra, iop(L, iv1, imm)); \ - } \ - else if (ttisfloat(v1)) { \ - lua_Number nb = fltvalue(v1); \ - lua_Number fimm = cast_num(imm); \ - pc++; setfltvalue(ra, fop(L, nb, fimm)); \ - }} - -// After (lambda): -auto op_arithI = [&](auto&& iop, auto&& fop, Instruction i) { - TValue *ra = vRA(i); - TValue *v1 = vRB(i); - int imm = InstructionView(i).sc(); - if (ttisinteger(v1)) { - lua_Integer iv1 = ivalue(v1); - pc++; setivalue(ra, iop(L, iv1, imm)); - } - else if (ttisfloat(v1)) { - lua_Number nb = fltvalue(v1); - lua_Number fimm = cast_num(imm); - pc++; setfltvalue(ra, fop(L, nb, fimm)); - } -}; - -// Call site: -vmcase(OP_ADDI) { - op_arithI(l_addi, luai_numadd, i); // Clean, type-safe - vmbreak; -} -``` - -### Cost-Benefit Analysis (Final) - -**Costs:** -- ❌ ~6.9% performance regression (avg 4.49s vs 4.20s baseline) -- ❌ ~10.8% vs Phase 1 improvement (4.49s vs 4.05s) -- ❌ Increased function size (145 lines of lambda definitions) - -**Benefits:** -- ✅ **Type safety**: Templates catch errors at compile time -- ✅ **Debuggability**: Can step into lambdas, set breakpoints -- ✅ **Modern C++**: No preprocessor text substitution -- ✅ **Maintainability**: Clear parameter types, explicit captures -- ✅ **IDE support**: Better code completion, refactoring -- ✅ **Zero warnings**: Compiles cleanly with -Werror - -**User Decision:** Benefits outweigh costs for this codebase modernization - ---- - -## Conclusion - -**Phase 2 is now COMPLETE with full lambda conversion.** - -This represents a significant modernization achievement: -- ✅ All VM operation macros converted to type-safe lambdas -- ✅ 33+ call sites updated -- ✅ Zero build warnings -- ✅ All tests passing -- ⚠️ Performance regression explicitly accepted by user - -**Phase 1 achievements (maintained):** -- ✅ 6 static functions → lua_State methods -- ✅ 7 simple macros → inline constexpr -- ✅ Better encapsulation, type safety - -**Phase 2 achievements (completed 2025-11-17):** -- ✅ 11 VM operation macros → lambdas -- ✅ Type-safe operation dispatch -- ✅ Improved debuggability - -**Total modernization impact:** -- **Code quality:** Significantly improved (type safety, debuggability) -- **Performance:** 4.49s avg (6.9% regression, user accepted) -- **Maintainability:** Much better (modern C++23, no macro magic) - ---- - -**Experiment conducted by:** Claude (AI Assistant) -**Initial Experiment:** 2025-11-16 (reverted due to regression) -**Final Implementation:** 2025-11-17 (user explicitly allowed regression) -**Branch:** claude/analyze-lv-018LEz1SVgM57AT2HW11UTsi -**Commits:** Pending (ready to commit) diff --git a/docs/phase2_macro_analysis.md b/docs/phase2_macro_analysis.md deleted file mode 100644 index 6598b53b..00000000 --- a/docs/phase2_macro_analysis.md +++ /dev/null @@ -1,384 +0,0 @@ -# Phase 2: VM Operation Macro Analysis - -**Date:** 2025-11-16 -**Status:** Analysis complete - Decision pending - ---- - -## Macro Inventory - -### Current Operation Macros (lvm.cpp lines 991-1117) - -**Arithmetic Operations:** -- `op_arithI(L,iop,fop)` - Arithmetic with immediate operand (lines 991-1003) -- `op_arithf_aux(L,v1,v2,fop)` - Float arithmetic auxiliary (lines 1010-1015) -- `op_arithf(L,fop)` - Float arithmetic with register operands (lines 1021-1024) -- `op_arithfK(L,fop)` - Float arithmetic with constant operand (lines 1030-1033) -- `op_arith_aux(L,v1,v2,iop,fop)` - Integer/float arithmetic auxiliary (lines 1039-1045) -- `op_arith(L,iop,fop)` - Full arithmetic with register operands (lines 1051-1054) -- `op_arithK(L,iop,fop)` - Full arithmetic with constant operand (lines 1060-1063) - -**Bitwise Operations:** -- `op_bitwiseK(L,op)` - Bitwise with constant operand (lines 1066-1074) -- `op_bitwise(L,op)` - Bitwise with register operands (lines 1077-1084) - -**Comparison Operations:** -- `op_order(L,op,other)` - Order comparison with operators (lines 1086-1094) -- `op_orderI(L,opi,opf,inv,tm)` - Order comparison with immediate (lines 1101-1115) - -**Total:** 11 operation macros used in 33+ locations in the VM loop - ---- - -## Analysis: Why These Macros Are Challenging - -### 1. Local Variable Access - -All macros access local variables from `luaV_execute` scope: - -```cpp -void luaV_execute(lua_State *L, CallInfo *ci) { - LClosure *cl; - TValue *k; // ← Accessed by KC(i) macro - StkId base; // ← Accessed by RA(i), vRB(i), vRC(i) macros - const Instruction *pc; // ← Modified directly: pc++ - int trap; - Instruction i; // ← Current instruction, used by ALL macros - // ... - - vmcase(OP_ADD) { - op_arith(L, l_addi, luai_numadd); // Uses i, pc, base implicitly - vmbreak; - } -} -``` - -### 2. Program Counter Mutation - -Most arithmetic macros modify `pc` directly: - -```cpp -#define op_arith_aux(L,v1,v2,iop,fop) { \ - if (ttisinteger(v1) && ttisinteger(v2)) { \ - StkId ra = RA(i); \ - lua_Integer i1 = ivalue(v1); lua_Integer i2 = ivalue(v2); \ - pc++; setivalue(s2v(ra), iop(L, i1, i2)); /* <-- pc modified */ \ - } \ - else op_arithf_aux(L, v1, v2, fop); } -``` - -Requires passing `pc` by reference if converted to function. - -### 3. Nested Macro Dependencies - -Macros call other macros that access local variables: - -```cpp -#define op_arith(L,iop,fop) { \ - TValue *v1 = vRB(i); /* vRB uses i, base */ \ - TValue *v2 = vRC(i); /* vRC uses i, base */ \ - op_arith_aux(L, v1, v2, iop, fop); } /* calls another macro */ -``` - -Chain: `op_arith` → `vRB/vRC` → `base, i` access - -### 4. Template Parameters Are Operators - -Some macros take operators as parameters: - -```cpp -#define op_order(L,op,other) { \ - // ... - cond = (*ra op *rb); /* 'op' is <, <=, etc. - not a value! */ - // ... -} - -// Used as: -vmcase(OP_LT) { - op_order(L, <, lessthanothers); /* < is an operator token */ - vmbreak; -} -``` - -Cannot pass operators to template functions - only function objects. - ---- - -## Conversion Strategies Considered - -### Strategy A: Pass All Context (VERBOSE) - -```cpp -// Convert macro to method -class lua_State { -public: - template - inline void doArithmetic(Instruction i, StkId base, const TValue *k, - const Instruction*& pc, IntOp&& iop, FloatOp&& fop) { - TValue *v1 = s2v(base + InstructionView(i).b()); - TValue *v2 = s2v(base + InstructionView(i).c()); - if (ttisinteger(v1) && ttisinteger(v2)) { - StkId ra = base + InstructionView(i).a(); - lua_Integer i1 = ivalue(v1); - lua_Integer i2 = ivalue(v2); - pc++; - setivalue(s2v(ra), iop(this, i1, i2)); - } - else { - lua_Number n1, n2; - if (tonumberns(v1, n1) && tonumberns(v2, n2)) { - StkId ra = base + InstructionView(i).a(); - pc++; - setfltvalue(s2v(ra), fop(this, n1, n2)); - } - } - } -}; - -// Call site becomes: -vmcase(OP_ADD) { - L->doArithmetic(i, base, k, pc, l_addi, luai_numadd); // VERBOSE! - vmbreak; -} -``` - -**Pros:** -- Type-safe -- Debuggable -- Can step through code - -**Cons:** -- ❌ Very verbose call sites (7 parameters!) -- ❌ Must pass `pc` by reference (mutation) -- ❌ Repeated parameter passing 33+ times -- ⚠️ Potential register pressure (compiler must pass many values) -- ⚠️ May not inline as well as macros - -### Strategy B: Lambda Closures (EXPERIMENTAL) - -```cpp -void luaV_execute(lua_State *L, CallInfo *ci) { - // ... setup base, k, pc, i - - // Define lambda that captures locals - auto op_arith = [&](auto&& iop, auto&& fop) { - TValue *v1 = vRB(i); - TValue *v2 = vRC(i); - if (ttisinteger(v1) && ttisinteger(v2)) { - StkId ra = RA(i); - lua_Integer i1 = ivalue(v1); - lua_Integer i2 = ivalue(v2); - pc++; - setivalue(s2v(ra), iop(L, i1, i2)); - } - // ... rest of logic - }; - - // Use: - vmcase(OP_ADD) { - op_arith(l_addi, luai_numadd); // Clean call site! - vmbreak; - } -} -``` - -**Pros:** -- ✅ Clean call sites (captures locals automatically) -- ✅ Type-safe -- ✅ Same inline potential as macros - -**Cons:** -- ⚠️ Unconventional (lambdas in 2000+ line function) -- ⚠️ Each lambda increases function size -- ⚠️ Unknown compiler optimization behavior -- ⚠️ Harder to understand for maintainers - -### Strategy C: Keep as Macros (CURRENT APPROACH) - -```cpp -// No changes - keep existing macros -#define op_arith(L,iop,fop) { \ - TValue *v1 = vRB(i); \ - TValue *v2 = vRC(i); \ - op_arith_aux(L, v1, v2, iop, fop); } - -vmcase(OP_ADD) { - op_arith(L, l_addi, luai_numadd); // CLEAN! - vmbreak; -} -``` - -**Pros:** -- ✅ Clean, concise call sites -- ✅ Zero performance risk (proven approach) -- ✅ Compiler can fully inline and optimize -- ✅ Established pattern (5+ years in production) -- ✅ Easy to understand (conventional C/C++ idiom) - -**Cons:** -- ❌ Not type-safe (macro expansion errors) -- ❌ Harder to debug (can't step into macros) -- ❌ IDE tools don't understand them well - ---- - -## Performance Risk Assessment - -### Critical Factors - -1. **Execution Frequency:** Billions of operations per second -2. **Instruction Cache:** VM loop must stay hot in L1 cache -3. **Register Pressure:** Local variables (base, pc, k) must stay in registers -4. **Inlining:** All fast paths must inline completely - -### Risk Analysis by Strategy - -| Strategy | Performance Risk | Likelihood of Regression | -|----------|------------------|-------------------------| -| **Strategy A** (Pass context) | 🔴 HIGH | 60-80% | -| **Strategy B** (Lambdas) | 🟡 MEDIUM | 30-50% | -| **Strategy C** (Keep macros) | 🟢 NONE | 0% | - -**Why Strategy A is high risk:** -- Passing 6-7 parameters per call -- Potential register spilling -- May prevent inlining -- Unknown optimization behavior with `pc&` reference parameter - -**Why Strategy B is medium risk:** -- Lambdas usually inline well -- Captures should be optimized away -- But: unconventional, unclear if GCC/Clang optimize this pattern well in 2000+ line function - ---- - -## Recommendation - -### Option 1: Conservative Approach ✅ (RECOMMENDED) - -**Keep all VM operation macros as-is** - -**Rationale:** -1. These macros are **appropriate macro usage** - they're code generation, not simple expressions -2. Performance risk is too high for minimal benefit -3. The macros are well-documented and maintainable -4. We've already converted the **appropriate** macros in Phase 1 (simple expressions like l_addi) -5. Project has strict ≤1% regression tolerance - not worth the risk - -**Benefits:** -- ✅ Zero performance risk -- ✅ Proven stable code -- ✅ Focus effort on higher-value improvements (Phase 3) - -**Time saved:** 10-15 hours that can be used for Phase 3 (code organization) - -### Option 2: Experimental Approach ⚠️ (HIGH RISK) - -**Try Strategy B (lambdas) for ONE macro as proof-of-concept** - -**Approach:** -1. Convert `op_arithI` to lambda (simplest macro) -2. Benchmark extensively (10+ runs) -3. If ≤4.24s: Continue with more macros -4. If >4.24s: **REVERT IMMEDIATELY** and keep all macros - -**Estimated time:** -- Success case: 10-12 hours -- Failure case: 2-3 hours (revert and document) - -**Probability of success:** ~40-50% - ---- - -## Decision Matrix - -| Factor | Keep Macros | Convert to Lambdas | -|--------|-------------|-------------------| -| **Performance risk** | 🟢 None | 🟡 Medium | -| **Type safety improvement** | ❌ None | ✅ Yes | -| **Debuggability improvement** | ❌ None | ✅ Yes | -| **Time required** | ⏱️ 0 hours | ⏱️ 10-15 hours | -| **Code readability** | 🟢 Good (familiar) | 🟡 Mixed (unconventional) | -| **Maintenance burden** | 🟢 Low (established) | 🟡 Medium (new pattern) | -| **Aligns with project goals** | ⚠️ Partial | ✅ Full (C++23) | - ---- - -## Conclusion - -After detailed analysis, **I recommend Option 1: Keep macros as-is**. - -These VM operation macros are: -- ✅ Appropriate macro usage (code generation, not simple expressions) -- ✅ Performance-critical (billions of executions) -- ✅ Well-documented and maintainable -- ✅ Proven stable over years - -The type safety and debuggability benefits **do not justify** the performance risk and time investment for this particular use case. - -**Alternative path forward:** -- ✅ Move to Phase 3 (code organization) - 8-12 hours, low risk, high value -- ✅ Split lvm.cpp into focused files -- ✅ Add comprehensive documentation -- ✅ Faster compilation, better organization - -This achieves project goals (modernization, maintainability) **without risking performance regression**. - ---- - -## If We Proceed with Conversion Anyway - -If the decision is made to attempt conversion despite risks: - -1. **Start with op_arithI** (simplest macro, immediate operands) -2. **Use Strategy B** (lambda with captures) -3. **Benchmark after each macro family** (not each individual macro) -4. **Strict revert policy:** Any regression >4.24s → immediate revert -5. **Document results** regardless of outcome - -**First test conversion:** - -```cpp -void luaV_execute(lua_State *L, CallInfo *ci) { - // ... setup - - // Define op_arithI as lambda - auto op_arithI_impl = [&](auto&& iop, auto&& fop) { - TValue *ra = vRA(i); - TValue *v1 = vRB(i); - int imm = InstructionView(i).sc(); - if (ttisinteger(v1)) { - lua_Integer iv1 = ivalue(v1); - pc++; - setivalue(ra, iop(L, iv1, imm)); - } - else if (ttisfloat(v1)) { - lua_Number nb = fltvalue(v1); - lua_Number fimm = cast_num(imm); - pc++; - setfltvalue(ra, fop(L, nb, fimm)); - } - }; - - // Main loop - for (;;) { - // ... - vmcase(OP_ADDI) { - op_arithI_impl(l_addi, luai_numadd); - vmbreak; - } - } -} -``` - -**Test plan:** -1. Convert only OP_ADDI initially -2. Build and verify zero warnings -3. Run test suite - must pass -4. Benchmark 10 times - average must be ≤4.24s -5. If successful, continue with OP_SUBI, OP_MULI, etc. -6. If any failure, revert immediately - ---- - -**Next Action:** Awaiting decision - Option 1 (keep macros) or Option 2 (attempt conversion)?