diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index c6392a751a..dc35d8823c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -35,7 +35,7 @@ jobs: strategy: fail-fast: false matrix: - platform: [X64, RISCV, RK3588, ARM64, ANDROID, TERMUX, LARCH64, ANDROID_GLIBC, WOW64, ARM64-GCC-8] + platform: [X64, RISCV, RK3588, ARM64, ANDROID, TERMUX, LARCH64, PPC64LE, ANDROID_GLIBC, WOW64, ARM64-GCC-8] type: [Release, Trace, StaticBuild, Box32] exclude: - platform: ANDROID @@ -64,6 +64,8 @@ jobs: type: Trace - platform: LARCH64 type: Trace + - platform: PPC64LE + type: Trace include: - platform: X64 os: ubuntu-latest @@ -86,6 +88,9 @@ jobs: - platform: LARCH64 os: ubuntu-latest + - platform: PPC64LE + os: ubuntu-latest + - platform: ANDROID_GLIBC os: ubuntu-22.04-arm @@ -122,7 +127,7 @@ jobs: if: steps.changed-files-dir-names.outputs.all_modified_files == 'src/dynarec/arm64' continue-on-error: true run: | - if [[ ${{ matrix.platform }} != 'RISCV' && ${{ matrix.platform }} != 'LARCH64' && ${{ matrix.platform }} != 'X64' ]]; then + if [[ ${{ matrix.platform }} != 'RISCV' && ${{ matrix.platform }} != 'LARCH64' && ${{ matrix.platform }} != 'PPC64LE' && ${{ matrix.platform }} != 'X64' ]]; then exit 1 fi @@ -148,18 +153,29 @@ jobs: exit 1 fi + - name: Early exit other jobs if this is a PPC64LE-only change + id: early-exit-ppc64le + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + if: steps.changed-files-dir-names.outputs.all_modified_files == 'src/dynarec/ppc64le' + continue-on-error: true + run: | + if [[ ${{ matrix.platform }} == 'PPC64LE' ]]; then + exit 1 + fi + - name: Do not early exit otherwise id: early-exit-never env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} DIRS: ${{ steps.changed-files-dir-names.outputs.all_modified_files }} - if: env.DIRS != 'src/dynarec/arm64' && env.DIRS != 'src/dynarec/la64' && env.DIRS != 'src/dynarec/rv64' + if: env.DIRS != 'src/dynarec/arm64' && env.DIRS != 'src/dynarec/la64' && env.DIRS != 'src/dynarec/rv64' && env.DIRS != 'src/dynarec/ppc64le' continue-on-error: true run: exit 1 - name: Merge early exit conditions id: early-exit - if: steps.early-exit-arm64.outcome == 'failure' || steps.early-exit-la64.outcome == 'failure' || steps.early-exit-rv64.outcome == 'failure' || steps.early-exit-never.outcome == 'failure' + if: steps.early-exit-arm64.outcome == 'failure' || steps.early-exit-la64.outcome == 'failure' || steps.early-exit-rv64.outcome == 'failure' || steps.early-exit-ppc64le.outcome == 'failure' || steps.early-exit-never.outcome == 'failure' continue-on-error: true run: exit 1 @@ -175,7 +191,7 @@ jobs: sudo apt-get update zydis_package=libzydis-dev fi - if [[ ${{ matrix.platform }} != 'X64' && ${{ matrix.platform }} != 'RISCV' && ${{ matrix.platform }} != 'LARCH64' ]]; then + if [[ ${{ matrix.platform }} != 'X64' && ${{ matrix.platform }} != 'RISCV' && ${{ matrix.platform }} != 'LARCH64' && ${{ matrix.platform }} != 'PPC64LE' ]]; then sudo apt-get -y install git cmake make python3 patchelf $zydis_package if [[ ${{ matrix.platform }} == 'ANDROID' || ${{ matrix.platform }} == 'TERMUX' ]]; then sudo apt-get -y install p7zip @@ -253,6 +269,10 @@ jobs: echo BOX64_PLATFORM_MARCRO="-DLARCH64=ON" >> $GITHUB_ENV echo "BOX64_COMPILER=loongarch64-linux-gnu-gcc" >> $GITHUB_ENV sudo apt-get -y install git cmake make python3 libzydis-dev + elif [[ ${{ matrix.platform }} == 'PPC64LE' ]]; then + echo BOX64_PLATFORM_MARCRO="-DPPC64LE=ON" >> $GITHUB_ENV + echo "BOX64_COMPILER=powerpc64le-linux-gnu-gcc" >> $GITHUB_ENV + sudo 
apt-get -y install git gcc-powerpc64le-linux-gnu cmake make python3 libzydis-dev else echo BOX64_PLATFORM_MARCRO="-DARM_DYNAREC=ON" >> $GITHUB_ENV echo "BOX64_COMPILER=aarch64-linux-gnu-gcc" >> $GITHUB_ENV @@ -391,6 +411,14 @@ jobs: BOX64_DYNAREC_TEST=1 ctest $CTEST_OPTIONS -E nocosim BOX64_DYNAREC_TEST=1 BOX64_DYNAREC_NOHOSTEXT=1 ctest $CTEST_OPTIONS -E nocosim + elif [[ ${{ matrix.platform }} == 'PPC64LE' ]]; then + export INTERPRETER=qemu-ppc64le-static + export QEMU_LD_PREFIX=/usr/powerpc64le-linux-gnu/ + + ctest $CTEST_OPTIONS -E nocosim + BOX64_DYNAREC=0 ctest $CTEST_OPTIONS -E nocosim + + BOX64_DYNAREC_TEST=1 ctest $CTEST_OPTIONS -E nocosim elif [[ ${{ matrix.platform }} == 'ANDROID' ]]; then export INTERPRETER=qemu-aarch64-static export QEMU_LD_PREFIX=/system/lib64 diff --git a/CMakeLists.txt b/CMakeLists.txt index ed39e4278b..1881ee55c1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -69,6 +69,7 @@ if(LARCH64) set(ARM_DYNAREC OFF CACHE BOOL "") set(RV64_DYNAREC OFF CACHE BOOL "") set(LARCH64_DYNAREC ON CACHE BOOL "") + set(PPC64LE_DYNAREC OFF CACHE BOOL "") set(BAD_PKILL ON CACHE BOOL "") if(LARCH64_ABI_1) message(STATUS "Build for Loongarch64 ABI 1.0 system") @@ -81,6 +82,7 @@ if(RV64) set(ARM_DYNAREC OFF CACHE BOOL "") set(RV64_DYNAREC ON CACHE BOOL "") set(LARCH64_DYNAREC OFF CACHE BOOL "") + set(PPC64LE_DYNAREC OFF CACHE BOOL "") endif() if(PPC64LE) set(LD80BITS OFF CACHE BOOL "") @@ -88,6 +90,7 @@ if(PPC64LE) set(ARM_DYNAREC OFF CACHE BOOL "") set(RV64_DYNAREC OFF CACHE BOOL "") set(LARCH64_DYNAREC OFF CACHE BOOL "") + set(PPC64LE_DYNAREC ON CACHE BOOL "") endif() if(RK3399 OR RK3588 OR ODROIDN2 OR RPI3ARM64 OR RPI4ARM64 OR RPI5ARM64 OR RK3326 OR TEGRAX1 OR TEGRA_T194 OR TEGRA_T234 OR NVIDIA_GB10 OR PHYTIUM OR SD845 OR SD865 OR SD888 OR SD8G2 OR SDORYON1 OR SD8EG5 OR LX2160A OR M1 OR ARM64 OR ADLINK) set(LD80BITS OFF CACHE BOOL "") @@ -95,6 +98,7 @@ if(RK3399 OR RK3588 OR ODROIDN2 OR RPI3ARM64 OR RPI4ARM64 OR RPI5ARM64 OR RK3326 set(ARM_DYNAREC ON CACHE BOOL "") set(RV64_DYNAREC OFF CACHE BOOL "") set(LARCH64_DYNAREC OFF CACHE BOOL "") + set(PPC64LE_DYNAREC OFF CACHE BOOL "") endif() if(RK3399 OR ODROIDN2 OR RPI3ARM64 OR RPI4ARM64 OR RPI5ARM64) set(SAVE_MEM ON CACHE BOOL "") @@ -122,6 +126,7 @@ option(NOALIGN "Set to ON if host device doesn't need re-align (i.e. 
i386)" ${NO option(ARM_DYNAREC "Set to ON to use ARM Dynamic Recompilation" ${ARM_DYNAREC}) option(RV64_DYNAREC "Set to ON to use RISC-V Dynamic Recompilation" ${RV64_DYNAREC}) option(LARCH64_DYNAREC "Set to ON to use LOONGARCH64 Dynamic Recompilation" ${LARCH64_DYNAREC}) +option(PPC64LE_DYNAREC "Set to ON to use PPC64LE Dynamic Recompilation" ${PPC64LE_DYNAREC}) option(STATICBUILD "Set to ON to have a static build (Warning, not working)" ${STATICBUILD}) option(NO_LIB_INSTALL "Set ON to not install a few x86_64 libs that are used by many program" ${NO_LIB_INSTALL}) option(NO_CONF_INSTALL "Set ON to not install config files" ${NO_CONF_INSTALL}) @@ -251,6 +256,8 @@ elseif(RV64) set(ASMFLAGS -pipe -march=rv64gc) elseif(PPC64LE) add_definitions(-DPPC64LE) + set(CFLAGS -pipe -mcpu=power9) + set(ASMFLAGS -pipe -mcpu=power9) elseif(LX2160A) add_definitions(-DLX2160A) set(CFLAGS -pipe -march=armv8-a+crypto+crc -mcpu=cortex-a72+crypto) @@ -292,6 +299,9 @@ if(STATICBUILD) set(BUILD_SHARED_LIBS OFF) set(CMAKE_EXE_LINKER_FLAGS -static) set(HAVE_TRACE OFF) + if(PPC64LE) + add_definitions(-fno-stack-protector) + endif() endif() if(BOX32) add_definitions(-DBOX32) @@ -348,6 +358,12 @@ elseif(LARCH64_DYNAREC) enable_language(ASM) include_directories("${BOX64_ROOT}/src/dynarec/la64") set(DYNAREC ON) +elseif(PPC64LE_DYNAREC) + add_definitions(-DDYNAREC) + add_definitions(-DPPC64LE) + enable_language(ASM) + include_directories("${BOX64_ROOT}/src/dynarec/ppc64le") + set(DYNAREC ON) else() set(DYNAREC OFF) endif() @@ -848,7 +864,7 @@ if(NOT CI) OUTPUT "${BOX64_ROOT}/src/wrapped/generated/functions_list.txt" COMMAND "${PYTHON_EXECUTABLE}" "${BOX64_ROOT}/rebuild_wrappers.py" "${BOX64_ROOT}" - "PANDORA" "HAVE_LD80BITS" "NOALIGN" "HAVE_TRACE" "ANDROID" "TERMUX" "STATICBUILD" "LA64" "--" + "PANDORA" "HAVE_LD80BITS" "NOALIGN" "HAVE_TRACE" "ANDROID" "TERMUX" "STATICBUILD" "LA64" "PPC64LE" "--" ${WRAPPEDS_HEAD} MAIN_DEPENDENCY "${BOX64_ROOT}/rebuild_wrappers.py" DEPENDS ${WRAPPEDS} ${WRAPPEDS_HEAD} @@ -1150,6 +1166,58 @@ if(LARCH64_DYNAREC) ) endif() +if(PPC64LE_DYNAREC) + set(DYNAREC_SRC + ${DYNAREC_SRC} + + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_functions.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_arch.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/ppc64le_printer.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_jmpnext.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_consts.c" + ) + set(DYNAREC_ASM + ${DYNAREC_ASM} + "${BOX64_ROOT}/src/dynarec/ppc64le/ppc64le_prolog.S" + "${BOX64_ROOT}/src/dynarec/ppc64le/ppc64le_epilog.S" + "${BOX64_ROOT}/src/dynarec/ppc64le/ppc64le_next.S" + "${BOX64_ROOT}/src/dynarec/ppc64le/ppc64le_lock.S" + ) + + set(DYNAREC_PASS + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_helper.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_00.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_0f.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_f0.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_f20f.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_f30f.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_66.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_660f.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_66f0.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_66f20f.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_66f30f.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_d8.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_d9.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_da.c" + 
"${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_db.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_dc.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_dd.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_de.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_df.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_avx.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_avx_0f.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_avx_0f38.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_avx_66_0f.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_avx_66_0f38.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_avx_66_0f3a.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_avx_f2_0f.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_avx_f2_0f38.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_avx_f2_0f3a.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_avx_f3_0f.c" + "${BOX64_ROOT}/src/dynarec/ppc64le/dynarec_ppc64le_avx_f3_0f38.c" + ) +endif() + if(GDBJIT) set(GDBJITREADER "${BOX64_ROOT}/gdbjit/reader.c") @@ -1291,8 +1359,12 @@ string(COMPARE EQUAL "${CMAKE_SYSTEM_PROCESSOR}" "i686" _x86) string(COMPARE EQUAL "${CMAKE_SYSTEM_PROCESSOR}" "x86_64" _x86_64) string(COMPARE EQUAL "${CMAKE_SYSTEM_PROCESSOR}" "aarch64" _aarch64) string(COMPARE EQUAL "${CMAKE_SYSTEM_PROCESSOR}" "riscv64" _riscv64) +string(COMPARE EQUAL "${CMAKE_SYSTEM_PROCESSOR}" "ppc64le" _ppc64le) +if(NOT _ppc64le) + string(COMPARE EQUAL "${CMAKE_SYSTEM_PROCESSOR}" "powerpc64le" _ppc64le) +endif() -if(_x86_64 OR _aarch64) +if(_x86_64 OR _aarch64 OR _ppc64le) add_definitions(-DCONFIG_64BIT) endif() @@ -1461,6 +1533,8 @@ if(_aarch64) endif() elseif(_riscv64) set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE "riscv64") +elseif(_ppc64le) + set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE "ppc64el") elseif(_x86_64) set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE "x86_64") else() diff --git a/src/build_info.h b/src/build_info.h index 251d9e46e4..0d1802a7bb 100644 --- a/src/build_info.h +++ b/src/build_info.h @@ -9,13 +9,15 @@ #define __BUILD_INFO_H__ -#if defined(DYNAREC) && (defined(ARM64) || defined(RV64) || defined(LA64)) +#if defined(DYNAREC) && (defined(ARM64) || defined(RV64) || defined(LA64) || defined(PPC64LE)) #if defined(ARM64) #define ARCH_STR " arm64" #elif defined(RV64) #define ARCH_STR " riscv64" #elif defined(LA64) #define ARCH_STR " loongarch64" +#elif defined(PPC64LE) +#define ARCH_STR " ppc64le" #endif #else #define ARCH_STR "" diff --git a/src/dynarec/dynacache_reloc.h b/src/dynarec/dynacache_reloc.h index 63b8b772a3..3d35b07caa 100644 --- a/src/dynarec/dynacache_reloc.h +++ b/src/dynarec/dynacache_reloc.h @@ -10,6 +10,9 @@ #elif defined(LA64) #include "dynarec/la64/dynarec_la64_consts.h" #define native_consts_t la64_consts_t +#elif defined(PPC64LE) +#include "dynarec/ppc64le/dynarec_ppc64le_consts.h" +#define native_consts_t ppc64le_consts_t #else #error Unsupported architecture #endif diff --git a/src/dynarec/dynarec.c b/src/dynarec/dynarec.c index dcf386ae90..9e20507272 100644 --- a/src/dynarec/dynarec.c +++ b/src/dynarec/dynarec.c @@ -164,14 +164,14 @@ void EmuRun(x64emu_t* emu, int use_dynarec) JUMPBUFF jmpbuf[1] = {0}; int skip = 0; JUMPBUFF *old_jmpbuf = emu->jmpbuf; - #ifdef RV64 + #if defined(RV64) || defined(PPC64LE) uintptr_t old_savesp = emu->xSPSave; #endif int is32bits = (emu->segs[_CS]==0x23); while(!(emu->quit)) { if(!emu->jmpbuf || (emu->flags.need_jmpbuf && emu->jmpbuf!=jmpbuf)) { emu->jmpbuf = jmpbuf; - #ifdef RV64 + #if defined(RV64) || defined(PPC64LE) 
emu->old_savedsp = emu->xSPSave; #endif emu->flags.jmpbuf_ready = 1; @@ -265,7 +265,7 @@ void EmuRun(x64emu_t* emu, int use_dynarec) } // clear the setjmp emu->jmpbuf = old_jmpbuf; - #ifdef RV64 + #if defined(RV64) || defined(PPC64LE) emu->xSPSave = old_savesp; #endif } diff --git a/src/dynarec/dynarec_arch.h b/src/dynarec/dynarec_arch.h index 6aa5eadd27..c6c4912b66 100644 --- a/src/dynarec/dynarec_arch.h +++ b/src/dynarec/dynarec_arch.h @@ -104,6 +104,45 @@ extern uint32_t la64_crc(void* p, uint32_t len); #define STOP_NATIVE_FLAGS(A, B) {} #define ARCH_UNALIGNED(A, B) arch_unaligned(A, B) #define JMPNEXT_SIZE (4*sizeof(void*)) + +#elif defined(PPC64LE) + +#define instruction_native_t instruction_ppc64le_t +#define dynarec_native_t dynarec_ppc64le_t +#define extcache_native_t vmxcache_t + +#define ADDITIONNAL_DEFINITION() \ + int fpuCacheNeedsTransform(dynarec_native_t* dyn, int ninst); + +#define OTHER_CACHE() \ + if (fpuCacheNeedsTransform(dyn, ninst)) ret |= 2; + +#include "ppc64le/ppc64le_printer.h" +#include "ppc64le/dynarec_ppc64le_private.h" +#include "ppc64le/dynarec_ppc64le_functions.h" +#include "ppc64le/dynarec_ppc64le_arch.h" +// Limit here is unconditional branch (I-form), signed 26bits (24-bit field << 2), so ±32MB +#define MAXBLOCK_SIZE ((1 << 24) - 200) + +#define RAZ_SPECIFIC(A, N) +#define UPDATE_SPECIFICS(A) propagateFpuBarrier(A) +#define PREUPDATE_SPECIFICS(A) updateNativeFlags(A) +#define POSTUPDATE_SPECIFICS(A) + +#define ARCH_SIZE(A) get_size_arch(A) +#define ARCH_FILL(A, B, C) populate_arch(A, B, C) +#define ARCH_ADJUST(A, B, C, D) adjust_arch(A, B, C, D) +#define STOP_NATIVE_FLAGS(A, B) {} +#define ARCH_UNALIGNED(A, B) arch_unaligned(A, B) +extern uint32_t ppc64le_fast_hash(void* p, uint32_t len); +#define ARCH_CRC(A, B) return ppc64le_fast_hash(A, B) + +#define ARCH_NOP 0x60000000 /* ori 0,0,0 */ +#define ARCH_UDF 0x00000000 /* illegal instruction (all zeros) */ +// PPC64LE CreateJmpNext needs 5 instructions (20 bytes) for PC-relative load + branch, +// so the jmpnext area needs 5 void* slots (40 bytes) instead of the default 4 (32 bytes). 
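// A hedged sketch of what those 5 instructions could look like (hypothetical
// encoding, for review only; the actual emitter is dynarec_ppc64le_jmpnext.c):
// without POWER10 prefixed pc-relative loads, a classic bcl/mflr pair
// materializes the PC, then the 64-bit target is fetched from the trailing
// data slot and branched to:
//      bcl   20,31,0f          # LR <- address of the next instruction
//   0: mflr  r11               # r11 = PC of label 0
//      ld    r12, disp(r11)    # load target pointer from the trailing slot
//      mtctr r12
//      bctr                    # 5 insns = 20 bytes; the extra slot holds the pointer
// Note MAXBLOCK_SIZE above: ((1 << 24) - 200) keeps every intra-block branch
// well inside the I-form's signed 26-bit (+/-32MB) reach, with margin to spare.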
+#define JMPNEXT_SIZE (5*sizeof(void*))
+
 #else
 #error Unsupported platform
 #endif
diff --git a/src/dynarec/dynarec_helper.h b/src/dynarec/dynarec_helper.h
index d7883d23f0..85007b2d93 100644
--- a/src/dynarec/dynarec_helper.h
+++ b/src/dynarec/dynarec_helper.h
@@ -221,6 +221,8 @@ int is_addr_autosmc(uintptr_t addr);
 #include "la64/dynarec_la64_helper.h"
 #elif defined(RV64)
 #include "rv64/dynarec_rv64_helper.h"
+#elif defined(PPC64LE)
+#include "ppc64le/dynarec_ppc64le_helper.h"
 #else
 #error Unsupported architecture
 #endif
diff --git a/src/dynarec/dynarec_native_functions.c b/src/dynarec/dynarec_native_functions.c
index e8afc769a4..ac7ac8e2fd 100644
--- a/src/dynarec/dynarec_native_functions.c
+++ b/src/dynarec/dynarec_native_functions.c
@@ -755,7 +755,7 @@ static int flagsCacheNeedsTransform(dynarec_native_t* dyn, int ninst) {
     int jmp = dyn->insts[ninst].x64.jmp_insts;
     if(jmp<0)
         return 0;
-    #if defined(ARM64) || defined(LA64)
+    #if defined(ARM64) || defined(LA64) || defined(PPC64LE)
     // df_none is now a defered information
     if(dyn->insts[ninst].f_exit==dyn->insts[jmp].f_entry)
         return 0;
diff --git a/src/dynarec/dynarec_native_pass.c b/src/dynarec/dynarec_native_pass.c
index 14b1b0963c..2e2d35bdf1 100644
--- a/src/dynarec/dynarec_native_pass.c
+++ b/src/dynarec/dynarec_native_pass.c
@@ -44,7 +44,7 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int
     int rep = 0;    // 0 none, 1=F2 prefix, 2=F3 prefix
     int need_epilog = 1;
     // Clean up (because there are multiple passes)
-    #if defined(ARM64) || defined(LA64)
+    #if defined(ARM64) || defined(LA64) || defined(PPC64LE)
     dyn->f = status_unk;
     #else
     dyn->f.pending = 0;
@@ -111,7 +111,7 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int
             dyn->last_ip = 0;
             if(reset_n==-2) {
                 MESSAGE(LOG_DEBUG, "Reset Caches to zero\n");
-                #if defined(ARM64) || defined(LA64)
+                #if defined(ARM64) || defined(LA64) || defined(PPC64LE)
                 dyn->f = status_unk;
                 #else
                 dyn->f.dfnone = 0;
@@ -128,7 +128,7 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int
             }
             if(dyn->insts[ninst].x64.barrier&BARRIER_FLAGS) {
                 MESSAGE(LOG_DEBUG, "Apply Barrier Flags\n");
-                #if defined(ARM64) || defined(LA64)
+                #if defined(ARM64) || defined(LA64) || defined(PPC64LE)
                 dyn->f = status_unk;
                 #else
                 dyn->f.dfnone = 0;
@@ -252,7 +252,7 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int
             #if STEP > 0
             if(dyn->insts[ninst].x64.has_next && dyn->insts[next].x64.barrier) {
                 if(dyn->insts[next].x64.barrier&BARRIER_FLOAT) {
-                    #if defined (RV64) || defined(LA64)
+                    #if defined (RV64) || defined(LA64) || defined(PPC64LE)
                     uint8_t tmp1, tmp2, tmp3;
                     if(dyn->insts[next].nat_flags_fusion) get_free_scratch(dyn, next, &tmp1, &tmp2, &tmp3, x1, x2, x3, x4, x5);
                     else { tmp1=x1; tmp2=x2; tmp3=x3; }
@@ -262,7 +262,7 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int
                     #endif
                 }
                 if(dyn->insts[next].x64.barrier&BARRIER_FLAGS) {
-                    #if defined(ARM64) || defined(LA64)
+                    #if defined(ARM64) || defined(LA64) || defined(PPC64LE)
                     dyn->f = status_unk;
                     #else
                     dyn->f.pending = 0;
@@ -288,7 +288,7 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int
                 // we use the 1st predecessor here
                 if((ninst+1)<dyn->size && !dyn->insts[ninst+1].x64.alive) {
                     // reset fpu value...
- #if defined(ARM64) || defined(LA64) + #if defined(ARM64) || defined(LA64) || defined(PPC64LE) dyn->f = status_unk; #else dyn->f.dfnone = 0; @@ -442,7 +442,7 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int #endif ++ninst; NOTEST(x3); - #if defined (RV64) || defined(LA64) + #if defined (RV64) || defined(LA64) || defined(PPC64LE) fpu_purgecache(dyn, ninst, 0, x1, x2, x3); #else fpu_purgecache(dyn, ninst, 0, x1, x2, x3, 0); @@ -453,7 +453,7 @@ uintptr_t native_pass(dynarec_native_t* dyn, uintptr_t addr, int alternate, int } if(need_epilog) { NOTEST(x3); - #if defined (RV64) || defined(LA64) + #if defined (RV64) || defined(LA64) || defined(PPC64LE) fpu_purgecache(dyn, ninst, 0, x1, x2, x3); #else fpu_purgecache(dyn, ninst, 0, x1, x2, x3, 0); diff --git a/src/dynarec/dynarec_next.h b/src/dynarec/dynarec_next.h index 301ec4a506..fa072fee80 100644 --- a/src/dynarec/dynarec_next.h +++ b/src/dynarec/dynarec_next.h @@ -26,6 +26,13 @@ void rv64_epilog_fast(void) EXPORTDYN; #define native_prolog rv64_prolog #define native_epilog rv64_epilog #define native_epilog_fast rv64_epilog_fast +#elif defined(PPC64LE) +void ppc64le_next(void) EXPORTDYN; +void ppc64le_prolog(x64emu_t* emu, void* addr) EXPORTDYN; +void ppc64le_epilog(void) EXPORTDYN; +#define native_next ppc64le_next +#define native_prolog ppc64le_prolog +#define native_epilog ppc64le_epilog #else #error Unsupported architecture #endif diff --git a/src/dynarec/native_lock.h b/src/dynarec/native_lock.h index ba4cb13180..c3ec1c17ed 100644 --- a/src/dynarec/native_lock.h +++ b/src/dynarec/native_lock.h @@ -120,6 +120,44 @@ #define native_lock_get_d(A) la64_lock_get_d(A) #define native_lock_get_dd(A) la64_lock_get_dd(A) +#elif defined(PPC64LE) +#include "ppc64le/ppc64le_lock.h" + +// PPC64LE has LL/SC at all widths (lbarx/stbcx., lharx/sthcx., lwarx/stwcx., ldarx/stdcx.) +// so it follows the ARM64 separate read/write pattern, NOT the CAS pattern + +#define native_lock_read_b(A) ppc64le_lock_read_b(A) +#define native_lock_write_b(A, B) ppc64le_lock_write_b(A, B) +#define native_lock_read_h(A) ppc64le_lock_read_h(A) +#define native_lock_write_h(A, B) ppc64le_lock_write_h(A, B) +#define native_lock_read_d(A) ppc64le_lock_read_d(A) +#define native_lock_write_d(A, B) ppc64le_lock_write_d(A, B) +#define native_lock_read_dd(A) ppc64le_lock_read_dd(A) +#define native_lock_write_dd(A, B) ppc64le_lock_write_dd(A, B) +// PPC64LE (pre-POWER10) lacks 128-bit LL/SC. Use plain reads for read_dq; +// callers that need true atomicity should use the spinlock (mutex_16b) path. 
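// Illustrative only, not part of the patch: the split read/write pattern above
// maps onto PowerPC reservations roughly as sketched below. The real helpers
// live in ppc64le_lock.S; these C equivalents (names included) are assumptions
// written out for review.
//
//   static inline uint32_t sketch_lock_read_d(uint32_t* p)
//   {
//       uint32_t v;
//       __asm__ __volatile__("lwarx %0,0,%1" : "=r"(v) : "r"(p) : "memory"); // load + reserve
//       return v;
//   }
//
//   static inline int sketch_lock_write_d(uint32_t* p, uint32_t v)
//   {
//       int fail = 1;
//       __asm__ __volatile__(
//           "stwcx. %2,0,%1\n\t"   // store-conditional against the reservation
//           "bne-   1f\n\t"        // CR0.EQ clear: reservation lost, report failure
//           "li     %0,0\n\t"
//           "1:"
//           : "+r"(fail) : "r"(p), "r"(v) : "cr0", "memory");
//       return fail;               // 0 when the store landed atomically
//   }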
+#define native_lock_read_dq(A, B, C) do { *(A) = ((uint64_t*)(C))[0]; *(B) = ((uint64_t*)(C))[1]; } while(0) +#define native_lock_write_dq(A, B, C) 0 // not truly atomic; callers must use mutex_16b +#define native_lock_xchg_dd(A, B) ppc64le_lock_xchg_dd(A, B) +#define native_lock_xchg_d(A, B) ppc64le_lock_xchg_d(A, B) +#define native_lock_xchg_h(A, B) ppc64le_lock_xchg_h(A, B) +#define native_lock_xchg_b(A, B) ppc64le_lock_xchg_b(A, B) +#define native_lock_storeifref(A, B, C) ppc64le_lock_storeifref(A, B, C) +#define native_lock_storeifref2(A, B, C) ppc64le_lock_storeifref2(A, B, C) +#define native_lock_storeifref_d(A, B, C) ppc64le_lock_storeifref_d(A, B, C) +#define native_lock_storeifref2_d(A, B, C) ppc64le_lock_storeifref2_d(A, B, C) +#define native_lock_storeifnull(A, B) ppc64le_lock_storeifnull(A, B) +#define native_lock_storeifnull_d(A, B) ppc64le_lock_storeifnull_d(A, B) +// #define native_lock_decifnot0b(A) ppc64le_lock_decifnot0b(A) +#define native_lock_storeb(A, B) ppc64le_lock_storeb(A, B) +#define native_lock_incif0(A) ppc64le_lock_incif0(A) +#define native_lock_decifnot0(A) ppc64le_lock_decifnot0(A) +#define native_lock_store(A, B) ppc64le_lock_store(A, B) +#define native_lock_store_dd(A, B) ppc64le_lock_store_dd(A, B) +#define native_lock_get_b(A) ppc64le_lock_get_b(A) +#define native_lock_get_d(A) ppc64le_lock_get_d(A) +#define native_lock_get_dd(A) ppc64le_lock_get_dd(A) + #else #error Unsupported architecture #endif diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_00.c b/src/dynarec/ppc64le/dynarec_ppc64le_00.c new file mode 100644 index 0000000000..54863d3533 --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_00.c @@ -0,0 +1,59 @@ +#include +#include +#include +#include + +#include "ppc64le_mapping.h" +#include "x64_signals.h" +#include "os.h" +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "ppc64le_emitter.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "bridge.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "dynarec_native.h" +#include "custommem.h" +#include "alternate.h" + +#include "ppc64le_printer.h" +#include "dynarec_ppc64le_private.h" +#include "dynarec_ppc64le_functions.h" +#include "../dynarec_helper.h" + +int isSimpleWrapper(wrapper_t fun); +int isRetX87Wrapper(wrapper_t fun); + +uintptr_t dynarec64_00(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog) +{ + uint8_t nextop, opcode; + uint8_t gd, ed, tmp1, tmp2, tmp3; + int64_t j64; + int v0, v1; + int i32; + int64_t i64; + int32_t tmp; + MAYUSE(tmp1); + MAYUSE(tmp2); + MAYUSE(tmp3); + MAYUSE(j64); + MAYUSE(v0); + MAYUSE(v1); + MAYUSE(i32); + MAYUSE(i64); + MAYUSE(tmp); + + opcode = F8; + + switch (opcode) { + default: + DEFAULT; + } + + return addr; +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_0f.c b/src/dynarec/ppc64le/dynarec_ppc64le_0f.c new file mode 100644 index 0000000000..768f1d5aa7 --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_0f.c @@ -0,0 +1,44 @@ +#include +#include +#include +#include + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "ppc64le_emitter.h" +#include "ppc64le_mapping.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "dynarec_native.h" +#include "my_cpuid.h" +#include "emu/x87emu_private.h" +#include "emu/x64shaext.h" +#include "bitutils.h" 
+#include "freq.h" + +#include "ppc64le_printer.h" +#include "dynarec_ppc64le_private.h" +#include "elfloader.h" +#include "../dynarec_helper.h" +#include "dynarec_ppc64le_functions.h" + + +uintptr_t dynarec64_0F(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t opcode = F8; + + switch (opcode) { + default: + DEFAULT; + } + + return addr; +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_66.c b/src/dynarec/ppc64le/dynarec_ppc64le_66.c new file mode 100644 index 0000000000..ed52aab3a1 --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_66.c @@ -0,0 +1,35 @@ +#include +#include +#include +#include + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "ppc64le_emitter.h" +#include "ppc64le_mapping.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "dynarec_native.h" +#include "custommem.h" + +#include "ppc64le_printer.h" +#include "dynarec_ppc64le_private.h" +#include "../dynarec_helper.h" +#include "dynarec_ppc64le_functions.h" + +uintptr_t dynarec64_66(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog) +{ + uint8_t opcode = F8; + + switch (opcode) { + default: + DEFAULT; + } + + return addr; +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_660f.c b/src/dynarec/ppc64le/dynarec_ppc64le_660f.c new file mode 100644 index 0000000000..f7ed83795f --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_660f.c @@ -0,0 +1,38 @@ +#include +#include +#include +#include + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "ppc64le_emitter.h" +#include "ppc64le_mapping.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "dynarec_native.h" +#include "bitutils.h" + +#include "ppc64le_printer.h" +#include "dynarec_ppc64le_private.h" +#include "../dynarec_helper.h" +#include "dynarec_ppc64le_functions.h" + +uintptr_t dynarec64_660F(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t opcode = F8; + + switch (opcode) { + default: + DEFAULT; + } + + return addr; +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_66f0.c b/src/dynarec/ppc64le/dynarec_ppc64le_66f0.c new file mode 100644 index 0000000000..619e65b936 --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_66f0.c @@ -0,0 +1,37 @@ +#include +#include +#include +#include + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "ppc64le_emitter.h" +#include "ppc64le_mapping.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "dynarec_native.h" + +#include "ppc64le_printer.h" +#include "dynarec_ppc64le_private.h" +#include "../dynarec_helper.h" +#include "dynarec_ppc64le_functions.h" + +uintptr_t dynarec64_66F0(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t opcode = F8; + + switch (opcode) { + default: + DEFAULT; + } + + return addr; +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_66f20f.c b/src/dynarec/ppc64le/dynarec_ppc64le_66f20f.c new file mode 100644 
index 0000000000..3c52fb5838 --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_66f20f.c @@ -0,0 +1,37 @@ +#include +#include +#include +#include + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "ppc64le_emitter.h" +#include "ppc64le_mapping.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "dynarec_native.h" + +#include "ppc64le_printer.h" +#include "dynarec_ppc64le_private.h" +#include "../dynarec_helper.h" +#include "dynarec_ppc64le_functions.h" + +uintptr_t dynarec64_66F20F(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t opcode = F8; + + switch (opcode) { + default: + DEFAULT; + } + + return addr; +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_66f30f.c b/src/dynarec/ppc64le/dynarec_ppc64le_66f30f.c new file mode 100644 index 0000000000..c61ab9fdf4 --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_66f30f.c @@ -0,0 +1,37 @@ +#include +#include +#include +#include + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "ppc64le_emitter.h" +#include "ppc64le_mapping.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "dynarec_native.h" + +#include "ppc64le_printer.h" +#include "dynarec_ppc64le_private.h" +#include "../dynarec_helper.h" +#include "dynarec_ppc64le_functions.h" + +uintptr_t dynarec64_66F30F(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t opcode = F8; + + switch (opcode) { + default: + DEFAULT; + } + + return addr; +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_arch.c b/src/dynarec/ppc64le/dynarec_ppc64le_arch.c new file mode 100644 index 0000000000..14d901ccb7 --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_arch.c @@ -0,0 +1,399 @@ +#include +#include +#include +#include + +#include "debug.h" +#include "dynablock.h" +#include "x64emu.h" +#include "emu/x64emu_private.h" +#include "emu/x64run_private.h" +#include "dynarec/dynablock_private.h" +#include "dynarec_ppc64le_arch.h" +#include "dynarec_ppc64le_functions.h" +#include "dynarec_native.h" + +// PPC64LE arch.c — signal recovery metadata +// Records per-instruction state so that adjust_arch() can reconstruct +// x86 emulator state (deferred flags, SIMD/x87/MMX register cache) when +// a signal interrupts dynarec-generated code. 
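// For orientation, a hedged sketch of the consumer side (hypothetical helper,
// loosely modeled on box64's common signal path; the real caller lives in the
// shared signal handler, not in this file):
//
//   static void sketch_recover_from_signal(dynablock_t* db, x64emu_t* emu,
//                                          ucontext_t* uc, uintptr_t native_pc)
//   {
//       uintptr_t x64pc = getX64Address(db, native_pc); // native PC -> guest RIP
//       adjust_arch(db, emu, uc, x64pc); // replay cached flags/x87/SSE/YMM into emu
//       R_RIP = x64pc;                   // guest handler then sees consistent state
//   }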
+
+//order might be important, so define SUPER for the right one
+#define SUPER() \
+    GO(flags)   \
+    GO(x87)     \
+    GO(mmx)     \
+    GO(sse)     \
+    GO(ymm)     \
+
+
+typedef struct arch_flags_s
+{
+    uint8_t defered:1;
+} arch_flags_t;
+
+#define X87_ST_D 0
+#define X87_ST_F 1
+#define X87_ST_I64 2
+typedef struct arch_x87_s
+{
+    int8_t delta;       //up to +/-7
+    uint8_t x87;        // 1bit is STx present
+    uint16_t x87_type;  // 2bits per STx type
+    uint32_t x87_pos;   //4bits per STx position
+} arch_x87_t;
+
+typedef struct arch_mmx_s
+{
+    uint8_t mmx;    //1bit for each mmx reg present
+} arch_mmx_t;
+
+typedef struct arch_sse_s
+{
+    uint16_t sse;   //1bit for each sse reg present
+} arch_sse_t;
+
+typedef struct arch_ymm_s
+{
+    uint16_t ymm0;      // 1bit for ymm0
+    uint16_t ymm;       // 1bit for each ymm present
+    uint64_t ymm_pos;   // 4bits for position of each ymm present
+} arch_ymm_t;
+
+typedef struct arch_arch_s
+{
+    #define GO(A) uint16_t A:1;
+    SUPER()
+    #undef GO
+    uint16_t unaligned:1;
+    uint16_t seq:10;    // how many instruction on the same values
+} arch_arch_t;
+
+typedef struct arch_build_s
+{
+    #define GO(A) uint8_t A:1;
+    SUPER()
+    #undef GO
+    uint8_t unaligned;
+    #define GO(A) arch_##A##_t A##_;
+    SUPER()
+    #undef GO
+} arch_build_t;
+
+static arch_build_t static_build[MAX_INSTS+2] = {0};
+
+// VMX cache index constants (must match dynarec_ppc64le_functions.c)
+#define XMM0 0
+#define EMM0 (XMM0 + 16)
+
+static int arch_build(dynarec_ppc64le_t* dyn, int ninst, arch_build_t* arch, int noarch)
+{
+    memset(arch, 0, sizeof(arch_build_t));
+    // flags
+    if(dyn->insts[ninst].f_entry!=status_none && dyn->insts[ninst].f_entry!=status_none_pending) {
+        arch->flags = 1;
+        arch->flags_.defered = 1;
+    }
+    // opcode can handle unaligned
+    arch->unaligned = dyn->insts[ninst].unaligned;
+    if(!noarch) {
+        // go through all vmxcache to gather regs assignments
+        for(int i=0; i<32; ++i)
+            if(dyn->insts[ninst].v.vmxcache[i].v)
+                switch(dyn->insts[ninst].v.vmxcache[i].t) {
+                    case VMX_CACHE_XMMW:
+                        arch->sse = 1;
+                        arch->sse_.sse |= 1<<dyn->insts[ninst].v.vmxcache[i].n;
+                        break;
+                    case VMX_CACHE_MM:
+                        arch->mmx = 1;
+                        arch->mmx_.mmx |= 1<<dyn->insts[ninst].v.vmxcache[i].n;
+                        break;
+                    case VMX_CACHE_YMMW:
+                        arch->ymm = 1;
+                        arch->ymm_.ymm |= 1<<dyn->insts[ninst].v.vmxcache[i].n;
+                        {
+                            int idx = i;
+                            if(idx>=EMM0 && idx<=EMM0+8)
+                                idx-=EMM0;
+                            else
+                                idx-=SCRATCH0-8;
+                            arch->ymm_.ymm_pos |= (uint64_t)idx<<(dyn->insts[ninst].v.vmxcache[i].n*4);
+                        }
+                        break;
+                    case VMX_CACHE_ST_D:
+                        arch->x87 = 1;
+                        arch->x87_.x87 |= 1<<dyn->insts[ninst].v.vmxcache[i].n;
+                        arch->x87_.x87_pos |= (uint32_t)(i-EMM0)<<(dyn->insts[ninst].v.vmxcache[i].n*4);
+                        arch->x87_.x87_type |= (uint16_t)(X87_ST_D)<<(dyn->insts[ninst].v.vmxcache[i].n*2);
+                        break;
+                    case VMX_CACHE_ST_F:
+                        arch->x87 = 1;
+                        arch->x87_.x87 |= 1<<dyn->insts[ninst].v.vmxcache[i].n;
+                        arch->x87_.x87_pos |= (uint32_t)(i-EMM0)<<(dyn->insts[ninst].v.vmxcache[i].n*4);
+                        arch->x87_.x87_type |= (uint16_t)(X87_ST_F)<<(dyn->insts[ninst].v.vmxcache[i].n*2);
+                        break;
+                    case VMX_CACHE_ST_I64:
+                        arch->x87 = 1;
+                        arch->x87_.x87 |= 1<<dyn->insts[ninst].v.vmxcache[i].n;
+                        arch->x87_.x87_pos |= (uint32_t)(i-EMM0)<<(dyn->insts[ninst].v.vmxcache[i].n*4);
+                        arch->x87_.x87_type |= (uint16_t)(X87_ST_I64)<<(dyn->insts[ninst].v.vmxcache[i].n*2);
+                        break;
+                    case VMX_CACHE_XMMR:
+                    case VMX_CACHE_YMMR:
+                    default:
+                        // doing nothing, it's just a value read in memory
+                        break;
+                }
+        // ymm0
+        if(dyn->insts[ninst].ymm0_out) {
+            arch->ymm = 1;
+            arch->ymm_.ymm0 = dyn->insts[ninst].ymm0_out;
+        }
+        // x87 top
+        if(dyn->insts[ninst].v.x87stack) {
+            arch->x87 = 1;
+            arch->x87_.delta = dyn->insts[ninst].v.x87stack;
+        }
+    }
+    return arch->flags + arch->x87 + arch->mmx + arch->sse + arch->ymm + arch->unaligned;
+}
+
+static int sizeof_arch(arch_arch_t* arch)
+{
+    int sz = sizeof(arch_arch_t);
+    #define GO(A) if(arch->A) sz+=sizeof(arch_##A##_t);
+    SUPER()
+    #undef GO
+    return sz;
+}
+
+static int sizeof_arch_build(arch_build_t* build)
+{
+    int sz = sizeof(arch_arch_t);
+    #define GO(A) if(build->A) sz+=sizeof(arch_##A##_t);
+    SUPER()
+    #undef GO
+    return sz;
+}
+
+size_t get_size_arch(dynarec_ppc64le_t* dyn)
+{
+    int noarch = BOX64ENV(dynarec_noarch);
+    if(noarch>1) return 0;
+    arch_build_t* previous = NULL;
+    size_t sz = 0;
+    int seq = 0;
+    int nseq = 0;
+    int last = 0;
+    if(!dyn->size) return 0;
+    for(int i=0; i<dyn->size; ++i) {
+        arch_build_t* build = static_build+i;
+        last = arch_build(dyn, i, build, noarch);
+        if(i && (!memcmp(build, previous, sizeof(arch_build_t))) && (seq<((1<<10)-1))) {
+            // same sequence, increment
+            ++seq;
+        } else {
+            seq = 0;
+            ++nseq;
+            previous = build;
+            sz += sizeof_arch_build(build);
+        }
+    }
+    if(nseq==1 && !last)
+        return 0;   //empty, no flags, no nothing
+    return sz;
+}
+
+static void build_next(arch_arch_t* arch, arch_build_t* build)
+{
+    #define GO(A) arch->A = build->A;
+    SUPER()
+    #undef GO
+    arch->unaligned = build->unaligned;
+    arch->seq = 0;
+    void* p = ((void*)arch)+sizeof(arch_arch_t);
+    #define GO(A)                                           \
+    if(build->A) {                                          \
+        memcpy(p, &(build->A##_), sizeof(arch_##A##_t));    \
+        p+=sizeof(arch_##A##_t);                            \
+    }
+    SUPER()
+    #undef GO
+}
+
+void* populate_arch(dynarec_ppc64le_t* dyn, void* p, size_t tot_sz)
+{
+    arch_build_t* previous = NULL;
+    arch_arch_t* arch = p;
+    arch_arch_t* next = p;
+    int seq = 0;
+    size_t total = 0;
+    if(!tot_sz) return NULL;
+    for(int i=0; i<dyn->size; ++i) {
+        arch_build_t* build = static_build+i;
+        if(i && (!memcmp(build, previous, sizeof(arch_build_t))) && (seq<((1<<10)-1))) {
+            // same sequence, increment
+            arch->seq = ++seq;
+        } else {
+            int sz = sizeof_arch_build(build);
+            if(total+sz>tot_sz) {
+                printf_log(LOG_INFO, "Warning: populate_arch on undersized buffer (%d+%d/%d, inst %d/%d)\n", total, sz, tot_sz, i, dyn->size);
+                return NULL;
+            }
+            arch = next;
+            build_next(arch, build);
+            seq = 0;
+            previous = build;
+            total += sz;
+            next = (arch_arch_t*)((uintptr_t)arch+sz);
+        }
+    }
+    return p;
+}
+
+#ifndef _WIN32
+void adjust_arch(dynablock_t* db, x64emu_t* emu, ucontext_t* p, uintptr_t x64pc)
+{
+    if(!db->arch_size || !db->arch)
+        return;
+    int ninst = getX64AddressInst(db, x64pc);
+    dynarec_log(LOG_INFO, "adjust_arch(...), db=%p, x64pc=%p, ninst=%d", db, (void*)x64pc, ninst);
+    if(ninst<0) {
+        dynarec_log_prefix(0, LOG_INFO, "\n");
+        return;
+    }
+    if(ninst==0) {
+        dynarec_log_prefix(0, LOG_INFO, "\n");
+        CHECK_FLAGS(emu);
+        return;
+    }
+    // look for state at ninst-1
+    arch_arch_t* arch = db->arch;
+    arch_arch_t* next = arch;
+    #define GO(A) arch_##A##_t* A = NULL;
+    SUPER()
+    #undef GO
+    int i = 0;
+    while(i<(ninst)) {  // this will look for ninst-1 state
+        arch = next;
+        dynarec_log_prefix(0, (i<(ninst-1))?LOG_DEBUG:LOG_INFO, "[ seq=%d%s%s%s%s%s ] ", arch->seq, arch->flags?" Flags":"", arch->x87?" x87":"", arch->mmx?" MMX":"", arch->sse?" SSE":"", arch->ymm?" YMM":"");
+        i += 1+arch->seq;
+        next = (arch_arch_t*)((uintptr_t)next + sizeof_arch(arch));
+    }
+    int sz = sizeof(arch_arch_t);
+    #define GO(A)                                   \
+    if(arch->A) {                                   \
+        A = (arch_##A##_t*)((uintptr_t)arch + sz);  \
+        sz+=sizeof(arch_##A##_t);                   \
+    }
+    SUPER()
+    #undef GO
+    if(flags) {
+        dynarec_log_prefix(0, LOG_INFO, " flags[%s] ", flags->defered?"defered":"");
+        if(flags->defered) {
+            CHECK_FLAGS(emu);
+        }
+    }
+    // PPC64LE signal context:
+    //   mcontext_t.gp_regs[0..31]           = r0-r31
+    //   mcontext_t.fp_regs[0..31]           = f0-f31 (lower 64 bits of vs0-vs31)
+    //   mcontext_t.v_regs->vrregs[0..31][4] = vr0-vr31 (= vs32-vs63, 128-bit)
+    //
+    // VMX cache mapping:
+    //   XMM cache idx 0-15  → vr0-vr15  → v_regs->vrregs[0..15]
+    //   MMX cache idx 16-23 → vr24-vr31 → v_regs->vrregs[24..31] (64-bit in low dword)
+    //   x87 cache idx 16-23 → f16-f23   → fp_regs[16..23] (scalar 64-bit)
+    //   Scratch idx 24-31   → vr16-vr23 → v_regs->vrregs[16..23]
+    mcontext_t* mc = &p->uc_mcontext;
+    vrregset_t* vmx = mc->v_regs;
+
+    if(sse) {
+        dynarec_log_prefix(0, LOG_INFO, " sse[%x (vmx=%p)] ", sse->sse, vmx);
+        for(int i=0; i<16; ++i)
+            if(vmx && (sse->sse>>i)&1) {
+                // XMM i is cached in a vmx register. Find which physical register.
+                // For XMM, cache index = i, physical = vr_i (vrregs[i])
+                int idx = i;    // cache index = xmm number for XMM
+                // On PPC64LE LE: vrregs[idx][0]=low32, [1]=next32, [2]=next32, [3]=high32
+                // This matches x86 element order directly
+                memcpy(&emu->xmm[i], &vmx->vrregs[idx], 16);
+            }
+    }
+    if(ymm) {
+        dynarec_log_prefix(0, LOG_INFO, " ymm[%x, pos=%x, 0=%x (vmx=%p)] ", ymm->ymm, ymm->ymm_pos, ymm->ymm0, vmx);
+        for(int i=0; i<16; ++i) {
+            if(vmx && (ymm->ymm>>i)&1) {
+                int idx = (ymm->ymm_pos>>(i*4))&0xf;
+                // idx is a cache-relative position:
+                //   0-7  → EMM0+idx → vs(16+idx) → fp_regs, but these are 64-bit only
+                //   8-15 → SCRATCH0+(idx-8) → vr(16+idx-8) → vrregs[16+idx-8]
+                if(idx >= 8) {
+                    int vr = 16 + idx - 8;
+                    memcpy(&emu->ymm[i], &vmx->vrregs[vr], 16);
+                }
+                // If idx < 8 (EMM0 space), those are vs16-vs23 in FPR space,
+                // only 64-bit accessible via fp_regs — not sufficient for 128-bit ymm.
+                // In practice, ymm should always use scratch (VMX) space.
+            }
+            if(ymm->ymm0&(1<<i))
+                memset(&emu->ymm[i], 0, 16);
+        }
+    }
+    if(mmx) {
+        dynarec_log_prefix(0, LOG_INFO, " mmx[%x (vmx=%p)] ", mmx->mmx, vmx);
+        for(int i=0; i<8; ++i)
+            if(vmx && (mmx->mmx>>i)&1) {
+                // MMX i is cached in vr(24+i) = vs(56+i) → v_regs->vrregs[24+i]
+                // Data is in low 64 bits (bytes 0-7 on LE)
+                int vr_idx = 24 + i;
+                emu->mmx[i].q = *(uint64_t*)&vmx->vrregs[vr_idx];
+            }
+    }
+    if(x87) {
+        dynarec_log_prefix(0, LOG_INFO, " x87[%x, pos=%x, type=%x] ", x87->x87, x87->x87_pos, x87->x87_type);
+        emu->top -= x87->delta;
+        for(int i=0; i<8; ++i) {
+            if(x87->x87&(1<<i)) {
+                int pos = (x87->x87_pos>>(i*4))&0x0f;
+                int fpr_idx = 16 + pos; // EMM0 + pos → vs(16+pos) → fp_regs[16+pos]
+                int t = (x87->x87_type>>(i*2))&0x3;
+                switch (t) {
+                    case X87_ST_F:
+                        emu->x87[(emu->top+i)&7].d = *(float*)&mc->fp_regs[fpr_idx];
+                        break;
+                    case X87_ST_I64:
+                        emu->x87[(emu->top+i)&7].d = *(int64_t*)&mc->fp_regs[fpr_idx];
+                        break;
+                    case X87_ST_D:
+                        emu->x87[(emu->top+i)&7].d = *(double*)&mc->fp_regs[fpr_idx];
+                        break;
+                }
+            }
+        }
+    }
+    dynarec_log_prefix(0, LOG_INFO, "\n");
+}
+#endif
+
+int arch_unaligned(dynablock_t* db, uintptr_t x64pc)
+{
+    if(!db)
+        return 0;
+    if(!db->arch_size || !db->arch)
+        return 0;
+    int ninst = getX64AddressInst(db, x64pc);
+    if(ninst<0) {
+        return 0;
+    }
+    // look for state at ninst
+    arch_arch_t* arch = db->arch;
+    arch_arch_t* next = arch;
+    int i = -1;
+    while(i<ninst) {
+        arch = next;
+        i += 1+arch->seq;
+        next = (arch_arch_t*)((uintptr_t)next + sizeof_arch(arch));
+    }
+    return arch->unaligned;
+}
diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_arch.h b/src/dynarec/ppc64le/dynarec_ppc64le_arch.h
new file mode 100644
index 0000000000..7c9b01a017
--- /dev/null
+++ b/src/dynarec/ppc64le/dynarec_ppc64le_arch.h
@@ -0,0 +1,24 @@
+#ifndef __DYNAREC_PPC64LE_ARCH_H__
+#define __DYNAREC_PPC64LE_ARCH_H__
+
+#include <stdint.h>
+
+#include "x64emu.h"
+#include "box64context.h"
+#include "box64cpu.h"
+#include "dynarec_ppc64le_private.h"
+
+// get size of arch specific info (can be 0)
+size_t get_size_arch(dynarec_ppc64le_t* dyn);
+//populate the array
+void* populate_arch(dynarec_ppc64le_t* dyn, void* p, size_t sz);
+#ifndef _WIN32
+#include <ucontext.h>
+//adjust flags and more
+void adjust_arch(dynablock_t* db, x64emu_t* emu, ucontext_t* p, uintptr_t x64pc);
+#else
+#define adjust_arch(db, emu, p, x64pc)
+#endif
+// get if instruction can be regenerated for unaligned access
+int arch_unaligned(dynablock_t* db, uintptr_t x64pc);
+#endif // __DYNAREC_PPC64LE_ARCH_H__
diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_avx.c b/src/dynarec/ppc64le/dynarec_ppc64le_avx.c
new file mode 100644
index 0000000000..bad0c01560
--- /dev/null
+++ b/src/dynarec/ppc64le/dynarec_ppc64le_avx.c
@@ -0,0 +1,33 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+
+#include "debug.h"
+#include "box64context.h"
+#include "box64cpu.h"
+#include "emu/x64emu_private.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "dynarec_native.h"
+
+#include "ppc64le_printer.h"
+#include "dynarec_ppc64le_private.h"
+#include "dynarec_ppc64le_functions.h"
+#include "../dynarec_helper.h"
+#include "dynarec_ppc64le_helper.h"
+
+uintptr_t dynarec64_AVX(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog)
+{
+    (void)ip;
+    (void)need_epilog;
+
+    rex_t rex = vex.rex;
+
+    DEFAULT;
+
+    return addr;
+}
diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_avx_0f.c b/src/dynarec/ppc64le/dynarec_ppc64le_avx_0f.c
new file mode 100644
index 0000000000..9ce815de41
--- /dev/null
+++
b/src/dynarec/ppc64le/dynarec_ppc64le_avx_0f.c @@ -0,0 +1,37 @@ +#include +#include +#include +#include + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "dynarec_native.h" + +#include "ppc64le_printer.h" +#include "dynarec_ppc64le_private.h" +#include "dynarec_ppc64le_functions.h" +#include "../dynarec_helper.h" +#include "dynarec_ppc64le_helper.h" + +uintptr_t dynarec64_AVX_0F(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t opcode = F8; + rex_t rex = vex.rex; + + switch (opcode) { + default: + DEFAULT; + } + + return addr; +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_avx_0f38.c b/src/dynarec/ppc64le/dynarec_ppc64le_avx_0f38.c new file mode 100644 index 0000000000..60ebaa0ea2 --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_avx_0f38.c @@ -0,0 +1,37 @@ +#include +#include +#include +#include + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "dynarec_native.h" + +#include "ppc64le_printer.h" +#include "dynarec_ppc64le_private.h" +#include "dynarec_ppc64le_functions.h" +#include "../dynarec_helper.h" +#include "dynarec_ppc64le_helper.h" + +uintptr_t dynarec64_AVX_0F38(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t opcode = F8; + rex_t rex = vex.rex; + + switch (opcode) { + default: + DEFAULT; + } + + return addr; +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_avx_66_0f.c b/src/dynarec/ppc64le/dynarec_ppc64le_avx_66_0f.c new file mode 100644 index 0000000000..ce98eb0f03 --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_avx_66_0f.c @@ -0,0 +1,37 @@ +#include +#include +#include +#include + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "dynarec_native.h" + +#include "ppc64le_printer.h" +#include "dynarec_ppc64le_private.h" +#include "dynarec_ppc64le_functions.h" +#include "../dynarec_helper.h" +#include "dynarec_ppc64le_helper.h" + +uintptr_t dynarec64_AVX_66_0F(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t opcode = F8; + rex_t rex = vex.rex; + + switch (opcode) { + default: + DEFAULT; + } + + return addr; +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_avx_66_0f38.c b/src/dynarec/ppc64le/dynarec_ppc64le_avx_66_0f38.c new file mode 100644 index 0000000000..26680d48fb --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_avx_66_0f38.c @@ -0,0 +1,37 @@ +#include +#include +#include +#include + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "dynarec_native.h" + +#include "ppc64le_printer.h" +#include "dynarec_ppc64le_private.h" +#include "dynarec_ppc64le_functions.h" +#include "../dynarec_helper.h" +#include 
"dynarec_ppc64le_helper.h" + +uintptr_t dynarec64_AVX_66_0F38(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t opcode = F8; + rex_t rex = vex.rex; + + switch (opcode) { + default: + DEFAULT; + } + + return addr; +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_avx_66_0f3a.c b/src/dynarec/ppc64le/dynarec_ppc64le_avx_66_0f3a.c new file mode 100644 index 0000000000..d42803229b --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_avx_66_0f3a.c @@ -0,0 +1,37 @@ +#include +#include +#include +#include + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "dynarec_native.h" + +#include "ppc64le_printer.h" +#include "dynarec_ppc64le_private.h" +#include "dynarec_ppc64le_functions.h" +#include "../dynarec_helper.h" +#include "dynarec_ppc64le_helper.h" + +uintptr_t dynarec64_AVX_66_0F3A(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t opcode = F8; + rex_t rex = vex.rex; + + switch (opcode) { + default: + DEFAULT; + } + + return addr; +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_avx_f2_0f.c b/src/dynarec/ppc64le/dynarec_ppc64le_avx_f2_0f.c new file mode 100644 index 0000000000..5b646dfcc6 --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_avx_f2_0f.c @@ -0,0 +1,37 @@ +#include +#include +#include +#include + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "dynarec_native.h" + +#include "ppc64le_printer.h" +#include "dynarec_ppc64le_private.h" +#include "dynarec_ppc64le_functions.h" +#include "../dynarec_helper.h" +#include "dynarec_ppc64le_helper.h" + +uintptr_t dynarec64_AVX_F2_0F(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t opcode = F8; + rex_t rex = vex.rex; + + switch (opcode) { + default: + DEFAULT; + } + + return addr; +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_avx_f2_0f38.c b/src/dynarec/ppc64le/dynarec_ppc64le_avx_f2_0f38.c new file mode 100644 index 0000000000..c971a30b2e --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_avx_f2_0f38.c @@ -0,0 +1,37 @@ +#include +#include +#include +#include + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "dynarec_native.h" + +#include "ppc64le_printer.h" +#include "dynarec_ppc64le_private.h" +#include "dynarec_ppc64le_functions.h" +#include "../dynarec_helper.h" +#include "dynarec_ppc64le_helper.h" + +uintptr_t dynarec64_AVX_F2_0F38(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t opcode = F8; + rex_t rex = vex.rex; + + switch (opcode) { + default: + DEFAULT; + } + + return addr; +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_avx_f2_0f3a.c b/src/dynarec/ppc64le/dynarec_ppc64le_avx_f2_0f3a.c new file mode 100644 index 0000000000..8ea08fdf4e --- /dev/null +++ 
b/src/dynarec/ppc64le/dynarec_ppc64le_avx_f2_0f3a.c @@ -0,0 +1,37 @@ +#include +#include +#include +#include + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "dynarec_native.h" + +#include "ppc64le_printer.h" +#include "dynarec_ppc64le_private.h" +#include "dynarec_ppc64le_functions.h" +#include "../dynarec_helper.h" +#include "dynarec_ppc64le_helper.h" + +uintptr_t dynarec64_AVX_F2_0F3A(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t opcode = F8; + rex_t rex = vex.rex; + + switch (opcode) { + default: + DEFAULT; + } + + return addr; +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_avx_f3_0f.c b/src/dynarec/ppc64le/dynarec_ppc64le_avx_f3_0f.c new file mode 100644 index 0000000000..f6d5689114 --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_avx_f3_0f.c @@ -0,0 +1,37 @@ +#include +#include +#include +#include + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "dynarec_native.h" + +#include "ppc64le_printer.h" +#include "dynarec_ppc64le_private.h" +#include "dynarec_ppc64le_functions.h" +#include "../dynarec_helper.h" +#include "dynarec_ppc64le_helper.h" + +uintptr_t dynarec64_AVX_F3_0F(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t opcode = F8; + rex_t rex = vex.rex; + + switch (opcode) { + default: + DEFAULT; + } + + return addr; +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_avx_f3_0f38.c b/src/dynarec/ppc64le/dynarec_ppc64le_avx_f3_0f38.c new file mode 100644 index 0000000000..da4339a97a --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_avx_f3_0f38.c @@ -0,0 +1,37 @@ +#include +#include +#include +#include + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "dynarec_native.h" + +#include "ppc64le_printer.h" +#include "dynarec_ppc64le_private.h" +#include "dynarec_ppc64le_functions.h" +#include "../dynarec_helper.h" +#include "dynarec_ppc64le_helper.h" + +uintptr_t dynarec64_AVX_F3_0F38(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t opcode = F8; + rex_t rex = vex.rex; + + switch (opcode) { + default: + DEFAULT; + } + + return addr; +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_consts.c b/src/dynarec/ppc64le/dynarec_ppc64le_consts.c new file mode 100644 index 0000000000..eab9c68afd --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_consts.c @@ -0,0 +1,148 @@ +#include +#include + +#include "dynarec_ppc64le_consts.h" +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "emu/x87emu_private.h" +#include "emu/x64primop.h" +#include "my_cpuid.h" +#include "freq.h" +#include "debug.h" +#include "custommem.h" +#include 
"dynarec_ppc64le_functions.h" +#include "emu/x64shaext.h" +#include "emu/x87emu_private.h" +#include "emu/x64compstrings.h" +#include "x64test.h" +#include "dynarec/dynarec_next.h" +#include "random.h" + +static const int8_t mask_shift8[] = { -7, -6, -5, -4, -3, -2, -1, 0 }; +static const int8_t mask_string8[] = { 7, 6, 5, 4, 3, 2, 1, 0 }; +static const int8_t mask_string16[] = { 15, 14, 13, 12, 11, 10, 9, 8 }; +static const float addsubps[4] = {-1.f, 1.f, -1.f, 1.f}; +static const double addsubpd[2] = {-1., 1.}; +static const float subaddps[4] = {1.f, -1.f, 1.f, -1.f}; +static const double subaddpd[2] = {1., -1.}; + +#ifndef HAVE_TRACE +void PrintTrace() {} +#endif + +uintptr_t getConst(ppc64le_consts_t which) +{ + switch(which) { + case const_none: dynarec_log(LOG_NONE, "Warning, const none used\n"); + return 0; + case const_daa8: return (uintptr_t)daa8; + case const_das8: return (uintptr_t)das8; + case const_aaa16: return (uintptr_t)aaa16; + case const_aas16: return (uintptr_t)aas16; + case const_aam16: return (uintptr_t)aam16; + case const_aad16: return (uintptr_t)aad16; + case const_native_br: return (uintptr_t)native_br; + case const_native_ud: return (uintptr_t)native_ud; + case const_native_priv: return (uintptr_t)native_priv; + case const_native_gpf: return (uintptr_t)native_gpf; + case const_native_int3: return (uintptr_t)native_int3; + case const_native_int: return (uintptr_t)native_int; + case const_native_div0: return (uintptr_t)native_div0; + case const_native_frstor16: return (uintptr_t)native_frstor16; + case const_native_fsave16: return (uintptr_t)native_fsave16; + case const_native_fsave: return (uintptr_t)native_fsave; + case const_native_aesimc: return (uintptr_t)native_aesimc; + case const_native_aesd: return (uintptr_t)native_aesd; + case const_native_aesd_y: return (uintptr_t)native_aesd_y; + case const_native_aesdlast: return (uintptr_t)native_aesdlast; + case const_native_aesdlast_y: return (uintptr_t)native_aesdlast_y; + case const_native_aese: return (uintptr_t)native_aese; + case const_native_aese_y: return (uintptr_t)native_aese_y; + case const_native_aeselast: return (uintptr_t)native_aeselast; + case const_native_aeselast_y: return (uintptr_t)native_aeselast_y; + case const_native_aeskeygenassist: return (uintptr_t)native_aeskeygenassist; + case const_native_pclmul: return (uintptr_t)native_pclmul; + case const_native_pclmul_x: return (uintptr_t)native_pclmul_x; + case const_native_pclmul_y: return (uintptr_t)native_pclmul_y; + case const_direct_f2xm1: return (uintptr_t)direct_f2xm1; + case const_direct_fyl2x: return (uintptr_t)direct_fyl2x; + case const_direct_fyl2xp1: return (uintptr_t)direct_fyl2xp1; + case const_native_fxtract: return (uintptr_t)native_fxtract; + case const_direct_ftan: return (uintptr_t)direct_ftan; + case const_direct_fpatan: return (uintptr_t)direct_fpatan; + case const_direct_fcos: return (uintptr_t)direct_fcos; + case const_direct_fsin: return (uintptr_t)direct_fsin; + case const_native_fsincos: return (uintptr_t)native_fsincos; + case const_direct_fscale: return (uintptr_t)direct_fscale; + case const_native_fprem: return (uintptr_t)native_fprem; + case const_native_fprem1: return (uintptr_t)native_fprem1; + case const_native_fld: return (uintptr_t)native_fld; + case const_native_fstp: return (uintptr_t)native_fstp; + case const_native_frstor: return (uintptr_t)native_frstor; + case const_native_next: return (uintptr_t)native_next; + case const_int3: return (uintptr_t)EmuInt3; + case const_x86syscall: return (uintptr_t)EmuX86Syscall; 
+ case const_x64syscall: return (uintptr_t)EmuX64Syscall;
+ case const_x64syscall_linux: return (uintptr_t)EmuX64Syscall_linux;
+ case const_rcl16: return (uintptr_t)rcl16;
+ case const_rcl32: return (uintptr_t)rcl32;
+ case const_rcl64: return (uintptr_t)rcl64;
+ case const_rcr16: return (uintptr_t)rcr16;
+ case const_rcr32: return (uintptr_t)rcr32;
+ case const_rcr64: return (uintptr_t)rcr64;
+ case const_div64: return (uintptr_t)div64;
+ case const_idiv64: return (uintptr_t)idiv64;
+ case const_random32: return (uintptr_t)get_random32;
+ case const_random64: return (uintptr_t)get_random64;
+ case const_readtsc: return (uintptr_t)ReadTSC;
+ case const_helper_getcpu: return (uintptr_t)helper_getcpu;
+ case const_cpuid: return (uintptr_t)my_cpuid;
+ case const_getsegmentbase: return (uintptr_t)GetSegmentBaseEmu;
+ case const_reset_fpu: return (uintptr_t)reset_fpu;
+ case const_sha1nexte: return (uintptr_t)sha1nexte;
+ case const_sha1msg1: return (uintptr_t)sha1msg1;
+ case const_sha1msg2: return (uintptr_t)sha1msg2;
+ case const_sha1rnds4: return (uintptr_t)sha1rnds4;
+ case const_sha256msg1: return (uintptr_t)sha256msg1;
+ case const_sha256msg2: return (uintptr_t)sha256msg2;
+ case const_sha256rnds2: return (uintptr_t)sha256rnds2;
+ case const_fpu_loadenv: return (uintptr_t)fpu_loadenv;
+ case const_fpu_savenv: return (uintptr_t)fpu_savenv;
+ case const_fpu_fxsave32: return (uintptr_t)fpu_fxsave32;
+ case const_fpu_fxsave64: return (uintptr_t)fpu_fxsave64;
+ case const_fpu_fxrstor32: return (uintptr_t)fpu_fxrstor32;
+ case const_fpu_fxrstor64: return (uintptr_t)fpu_fxrstor64;
+ case const_fpu_xsave: return (uintptr_t)fpu_xsave;
+ case const_fpu_xrstor: return (uintptr_t)fpu_xrstor;
+ case const_fpu_fbld: return (uintptr_t)fpu_fbld;
+ case const_fpu_fbst: return (uintptr_t)fpu_fbst;
+ case const_updateflags: return (uintptr_t)UpdateFlags;
+ case const_sse42_compare_string_explicit_len: return (uintptr_t)sse42_compare_string_explicit_len;
+ case const_sse42_compare_string_implicit_len: return (uintptr_t)sse42_compare_string_implicit_len;
+ case const_x64test_step: return (uintptr_t)x64test_step;
+ case const_printtrace: return (uintptr_t)PrintTrace;
+ case const_epilog: return (uintptr_t)native_epilog;
+ case const_jmptbl32: return getJumpTable32();
+ case const_jmptbl48: return getJumpTable48();
+ case const_jmptbl64: return getJumpTable64();
+ case const_context: return (uintptr_t)my_context;
+ case const_8b_m7_m6_m5_m4_m3_m2_m1_0: return (uintptr_t)&mask_shift8;
+ case const_8b_7_6_5_4_3_2_1_0: return (uintptr_t)&mask_string8;
+ case const_8b_15_14_13_12_11_10_9_8: return (uintptr_t)&mask_string16;
+ case const_4f_m1_1_m1_1: return (uintptr_t)&addsubps;
+ case const_4f_1_m1_1_m1: return (uintptr_t)&subaddps;
+ case const_2d_m1_1: return (uintptr_t)&addsubpd;
+ case const_2d_1_m1: return (uintptr_t)&subaddpd;
+
+ case const_last: dynarec_log(LOG_NONE, "Warning, const last used\n");
+ return 0;
+ }
+ dynarec_log(LOG_NONE, "Warning, Unknown const %d used\n", which);
+ return 0;
+}
diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_consts.h b/src/dynarec/ppc64le/dynarec_ppc64le_consts.h
new file mode 100644
index 0000000000..a449abb9a0
--- /dev/null
+++ b/src/dynarec/ppc64le/dynarec_ppc64le_consts.h
@@ -0,0 +1,114 @@
+#ifndef __DYNAREC_PPC64LE_CONSTS__
+#define __DYNAREC_PPC64LE_CONSTS__
+#include <stdint.h>
+
+typedef enum ppc64le_consts_s {
+ const_none,
+ const_daa8,
+ const_das8,
+ const_aaa16,
+ const_aas16,
+ const_aam16,
+ const_aad16,
+ const_native_br,
+ const_native_ud,
+ const_native_priv,
+
const_native_gpf, + const_native_int3, + const_native_int, + const_native_div0, + const_native_frstor16, + const_native_fsave16, + const_native_fsave, + const_native_aesimc, + const_native_aesd, + const_native_aesd_y, + const_native_aesdlast, + const_native_aesdlast_y, + const_native_aese, + const_native_aese_y, + const_native_aeselast, + const_native_aeselast_y, + const_native_aeskeygenassist, + const_native_pclmul, + const_native_pclmul_x, + const_native_pclmul_y, + const_direct_f2xm1, + const_direct_fyl2x, + const_direct_fyl2xp1, + const_native_fxtract, + const_direct_ftan, + const_direct_fpatan, + const_direct_fcos, + const_direct_fsin, + const_native_fsincos, + const_direct_fscale, + const_native_fprem, + const_native_fprem1, + const_native_fld, + const_native_fstp, + const_native_frstor, + const_native_next, + const_int3, + const_x86syscall, + const_x64syscall, + const_x64syscall_linux, + const_rcl16, + const_rcl32, + const_rcl64, + const_rcr16, + const_rcr32, + const_rcr64, + const_div64, + const_idiv64, + const_random32, + const_random64, + const_readtsc, + const_helper_getcpu, + const_cpuid, + const_getsegmentbase, + const_reset_fpu, + const_sha1nexte, + const_sha1msg1, + const_sha1msg2, + const_sha1rnds4, + const_sha256msg1, + const_sha256msg2, + const_sha256rnds2, + const_fpu_loadenv, + const_fpu_savenv, + const_fpu_fxsave32, + const_fpu_fxsave64, + const_fpu_fxrstor32, + const_fpu_fxrstor64, + const_fpu_xsave, + const_fpu_xrstor, + const_fpu_fbld, + const_fpu_fbst, + const_updateflags, + const_sse42_compare_string_explicit_len, + const_sse42_compare_string_implicit_len, + const_x64test_step, + const_printtrace, + const_epilog, + const_jmptbl32, + const_jmptbl48, + const_jmptbl64, + const_context, + const_8b_m7_m6_m5_m4_m3_m2_m1_0, + const_8b_7_6_5_4_3_2_1_0, + const_8b_15_14_13_12_11_10_9_8, + const_4f_m1_1_m1_1, + const_2d_m1_1, + const_4f_1_m1_1_m1, + const_2d_1_m1, + + const_last +} ppc64le_consts_t; + +uintptr_t getConst(ppc64le_consts_t which); + +// temporary define... 
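The consts table exists so generated code can reference host helpers and lookup tables through a stable ID resolved by getConst() at emit time, instead of baking raw pointers into the instruction stream. A minimal sketch of the intended call pattern follows; MOV64x/MTCTR/BCTRL here are illustrative stand-ins, not macros defined by this patch:

/* Illustration only: resolve a helper address by ID, then emit the usual
   PPC64LE indirect-call sequence through the CTR register. */
uintptr_t fn = getConst(const_native_ud);   // address of the #UD trampoline
MOV64x(x2, fn);                             // materialize the 64-bit address
MTCTR(x2);                                  // move to CTR
BCTRL();                                    // branch-and-link via CTR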
+#define const_PrintTrace const_printtrace + +#endif //__DYNAREC_PPC64LE_CONSTS__ diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_d8.c b/src/dynarec/ppc64le/dynarec_ppc64le_d8.c new file mode 100644 index 0000000000..786344218f --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_d8.c @@ -0,0 +1,36 @@ +#include +#include +#include +#include + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "emu/x87emu_private.h" +#include "dynarec_native.h" + +#include "ppc64le_printer.h" +#include "dynarec_ppc64le_private.h" +#include "../dynarec_helper.h" +#include "dynarec_ppc64le_functions.h" + +uintptr_t dynarec64_D8(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t nextop = F8; + + switch (nextop) { + default: + DEFAULT; + } + + return addr; +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_d9.c b/src/dynarec/ppc64le/dynarec_ppc64le_d9.c new file mode 100644 index 0000000000..9d252bdbf0 --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_d9.c @@ -0,0 +1,36 @@ +#include +#include +#include +#include + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "emu/x87emu_private.h" +#include "dynarec_native.h" + +#include "ppc64le_printer.h" +#include "dynarec_ppc64le_private.h" +#include "../dynarec_helper.h" +#include "dynarec_ppc64le_functions.h" + +uintptr_t dynarec64_D9(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t nextop = F8; + + switch (nextop) { + default: + DEFAULT; + } + + return addr; +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_da.c b/src/dynarec/ppc64le/dynarec_ppc64le_da.c new file mode 100644 index 0000000000..d53e0b95aa --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_da.c @@ -0,0 +1,36 @@ +#include +#include +#include +#include + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "emu/x87emu_private.h" +#include "dynarec_native.h" + +#include "ppc64le_printer.h" +#include "dynarec_ppc64le_private.h" +#include "../dynarec_helper.h" +#include "dynarec_ppc64le_functions.h" + +uintptr_t dynarec64_DA(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t nextop = F8; + + switch (nextop) { + default: + DEFAULT; + } + + return addr; +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_db.c b/src/dynarec/ppc64le/dynarec_ppc64le_db.c new file mode 100644 index 0000000000..16d31e165f --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_db.c @@ -0,0 +1,36 @@ +#include +#include +#include +#include + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "emu/x87emu_private.h" +#include "dynarec_native.h" + +#include "ppc64le_printer.h" +#include 
"dynarec_ppc64le_private.h" +#include "../dynarec_helper.h" +#include "dynarec_ppc64le_functions.h" + +uintptr_t dynarec64_DB(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t nextop = F8; + + switch (nextop) { + default: + DEFAULT; + } + + return addr; +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_dc.c b/src/dynarec/ppc64le/dynarec_ppc64le_dc.c new file mode 100644 index 0000000000..d39e0ea396 --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_dc.c @@ -0,0 +1,36 @@ +#include +#include +#include +#include + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "emu/x87emu_private.h" +#include "dynarec_native.h" + +#include "ppc64le_printer.h" +#include "dynarec_ppc64le_private.h" +#include "../dynarec_helper.h" +#include "dynarec_ppc64le_functions.h" + +uintptr_t dynarec64_DC(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t nextop = F8; + + switch (nextop) { + default: + DEFAULT; + } + + return addr; +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_dd.c b/src/dynarec/ppc64le/dynarec_ppc64le_dd.c new file mode 100644 index 0000000000..c125a65e99 --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_dd.c @@ -0,0 +1,36 @@ +#include +#include +#include +#include + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "emu/x87emu_private.h" +#include "dynarec_native.h" + +#include "ppc64le_printer.h" +#include "dynarec_ppc64le_private.h" +#include "../dynarec_helper.h" +#include "dynarec_ppc64le_functions.h" + +uintptr_t dynarec64_DD(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t nextop = F8; + + switch (nextop) { + default: + DEFAULT; + } + + return addr; +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_de.c b/src/dynarec/ppc64le/dynarec_ppc64le_de.c new file mode 100644 index 0000000000..916b62a7ed --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_de.c @@ -0,0 +1,36 @@ +#include +#include +#include +#include + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "emu/x87emu_private.h" +#include "dynarec_native.h" + +#include "ppc64le_printer.h" +#include "dynarec_ppc64le_private.h" +#include "../dynarec_helper.h" +#include "dynarec_ppc64le_functions.h" + +uintptr_t dynarec64_DE(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t nextop = F8; + + switch (nextop) { + default: + DEFAULT; + } + + return addr; +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_df.c b/src/dynarec/ppc64le/dynarec_ppc64le_df.c new file mode 100644 index 0000000000..3483ef7030 --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_df.c @@ -0,0 +1,36 @@ +#include +#include +#include +#include + +#include "debug.h" +#include "box64context.h" +#include 
"box64cpu.h" +#include "emu/x64emu_private.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "emu/x87emu_private.h" +#include "dynarec_native.h" + +#include "ppc64le_printer.h" +#include "dynarec_ppc64le_private.h" +#include "../dynarec_helper.h" +#include "dynarec_ppc64le_functions.h" + +uintptr_t dynarec64_DF(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t nextop = F8; + + switch (nextop) { + default: + DEFAULT; + } + + return addr; +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_f0.c b/src/dynarec/ppc64le/dynarec_ppc64le_f0.c new file mode 100644 index 0000000000..819338da13 --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_f0.c @@ -0,0 +1,37 @@ +#include +#include +#include +#include + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "ppc64le_emitter.h" +#include "ppc64le_mapping.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "dynarec_native.h" + +#include "ppc64le_printer.h" +#include "dynarec_ppc64le_private.h" +#include "../dynarec_helper.h" +#include "dynarec_ppc64le_functions.h" + +uintptr_t dynarec64_F0(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t opcode = F8; + + switch (opcode) { + default: + DEFAULT; + } + + return addr; +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_f20f.c b/src/dynarec/ppc64le/dynarec_ppc64le_f20f.c new file mode 100644 index 0000000000..f69d5ba7f9 --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_f20f.c @@ -0,0 +1,39 @@ +#include +#include +#include +#include + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "ppc64le_emitter.h" +#include "ppc64le_mapping.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "dynarec_native.h" +#include "my_cpuid.h" +#include "emu/x87emu_private.h" + +#include "ppc64le_printer.h" +#include "dynarec_ppc64le_private.h" +#include "../dynarec_helper.h" +#include "dynarec_ppc64le_functions.h" + +uintptr_t dynarec64_F20F(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t opcode = F8; + + switch (opcode) { + default: + DEFAULT; + } + + return addr; +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_f30f.c b/src/dynarec/ppc64le/dynarec_ppc64le_f30f.c new file mode 100644 index 0000000000..19b3075392 --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_f30f.c @@ -0,0 +1,39 @@ +#include +#include +#include +#include + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "ppc64le_emitter.h" +#include "ppc64le_mapping.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "x64trace.h" +#include "dynarec_native.h" +#include "my_cpuid.h" +#include "emu/x87emu_private.h" + +#include "ppc64le_printer.h" +#include "dynarec_ppc64le_private.h" +#include "../dynarec_helper.h" +#include "dynarec_ppc64le_functions.h" + +uintptr_t dynarec64_F30F(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t 
rex, int* ok, int* need_epilog) +{ + (void)ip; + (void)need_epilog; + + uint8_t opcode = F8; + + switch (opcode) { + default: + DEFAULT; + } + + return addr; +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_functions.c b/src/dynarec/ppc64le/dynarec_ppc64le_functions.c new file mode 100644 index 0000000000..143b4424fa --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_functions.c @@ -0,0 +1,991 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include + +#include "debug.h" +#include "box64context.h" +#include "box64cpu.h" +#include "emu/x64emu_private.h" +#include "x64emu.h" +#include "box64stack.h" +#include "callback.h" +#include "emu/x64run_private.h" +#include "emu/x87emu_private.h" +#include "x64trace.h" +#include "signals.h" +#include "dynarec_native.h" +#include "dynarec_ppc64le_private.h" +#include "dynarec_ppc64le_functions.h" +#include "custommem.h" +#include "bridge.h" +#include "gdbjit.h" +#include "perfmap.h" +#include "elfloader.h" + +#define XMM0 0 +#define X870 XMM0 + 16 +#define EMM0 XMM0 + 16 + +// Get a FPU scratch reg +int fpu_get_scratch(dynarec_ppc64le_t* dyn) +{ + return SCRATCH0 + dyn->v.fpu_scratch++; // return a VMX reg +} +// Reset scratch regs counter +void fpu_reset_scratch(dynarec_ppc64le_t* dyn) +{ + dyn->v.fpu_scratch = 0; + dyn->v.ymm_used = 0; + dyn->v.xmm_used = 0; +} +// Get a x87 double reg +int fpu_get_reg_x87(dynarec_ppc64le_t* dyn, int t, int n) +{ + int i = X870; + while (dyn->v.fpuused[i]) + ++i; + dyn->v.fpuused[i] = 1; + dyn->v.vmxcache[i].n = n; + dyn->v.vmxcache[i].t = t; + dyn->v.news |= (1 << i); + dyn->use_x87 = 1; + return i; +} +// Free a FPU double reg +void fpu_free_reg(dynarec_ppc64le_t* dyn, int reg) +{ + // TODO: check upper limit? + dyn->v.fpuused[reg] = 0; + if (dyn->v.vmxcache[reg].t != VMX_CACHE_ST_F && dyn->v.vmxcache[reg].t != VMX_CACHE_ST_D && dyn->v.vmxcache[reg].t != VMX_CACHE_ST_I64) + dyn->v.vmxcache[reg].v = 0; +} +// Get an MMX double reg +int fpu_get_reg_emm(dynarec_ppc64le_t* dyn, int emm) +{ + int ret = EMM0 + emm; + dyn->v.fpuused[ret] = 1; + dyn->v.vmxcache[ret].t = VMX_CACHE_MM; + dyn->v.vmxcache[ret].n = emm; + dyn->v.news |= (1 << (ret)); + dyn->use_mmx = 1; + return ret; +} +// Get an XMM quad reg +int fpu_get_reg_xmm(dynarec_ppc64le_t* dyn, int t, int xmm) +{ + int i; + i = XMM0 + xmm; + + dyn->v.fpuused[i] = 1; + dyn->v.vmxcache[i].t = t; + dyn->v.vmxcache[i].n = xmm; + dyn->v.news |= (1 << i); + dyn->use_xmm = 1; + return i; +} + +// Get an YMM quad reg +int fpu_get_reg_ymm(dynarec_ppc64le_t* dyn, int t, int ymm) +{ + int i; + i = XMM0 + ymm; + + dyn->v.fpuused[i] = 1; + dyn->v.vmxcache[i].t = t; + dyn->v.vmxcache[i].n = ymm; + dyn->v.news |= (1 << i); + dyn->use_ymm = 1; + return i; +} + +// Reset fpu regs counter +static void fpu_reset_reg_vmxcache(vmxcache_t* v) +{ + for (int i = 0; i < 24; ++i) { + v->fpuused[i] = 0; + v->vmxcache[i].v = 0; + } +} + +void fpu_reset_reg(dynarec_ppc64le_t* dyn) +{ + fpu_reset_reg_vmxcache(&dyn->v); +} + + +int vmxcache_no_i64(dynarec_ppc64le_t* dyn, int ninst, int st, int a) +{ + if (a == VMX_CACHE_ST_I64) { + vmxcache_promote_double(dyn, ninst, st); + return VMX_CACHE_ST_D; + } + return a; +} + +int vmxcache_get_st(dynarec_ppc64le_t* dyn, int ninst, int a) +{ + if (dyn->insts[ninst].v.swapped) { + if (dyn->insts[ninst].v.combined1 == a) + a = dyn->insts[ninst].v.combined2; + else if (dyn->insts[ninst].v.combined2 == a) + a = dyn->insts[ninst].v.combined1; + } + for (int i = 0; i < 24; ++i) + if 
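The allocators above hand out fixed cache indices (XMM at 0-15, x87/MMX at 16-23, scratch from SCRATCH0 = 24), which the VSXREG/VRREG macros in dynarec_ppc64le_functions.h later translate into hardware register numbers. A sketch of the per-instruction usage pattern, mirrored from how other box64 backends drive their allocators (illustrative, not part of the patch):

/* Grab scratch VMX regs for one instruction, then release them all. */
int q0 = fpu_get_scratch(dyn);   // cache index 24 -> vr16 (vs48)
int q1 = fpu_get_scratch(dyn);   // cache index 25 -> vr17 (vs49)
/* ... emit VMX ops on VRREG(q0) / VRREG(q1) ... */
fpu_reset_scratch(dyn);          // scratch counter back to 0 for the next inst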
((dyn->insts[ninst].v.vmxcache[i].t == VMX_CACHE_ST_F + || dyn->insts[ninst].v.vmxcache[i].t == VMX_CACHE_ST_D + || dyn->insts[ninst].v.vmxcache[i].t == VMX_CACHE_ST_I64) + && dyn->insts[ninst].v.vmxcache[i].n == a) + return dyn->insts[ninst].v.vmxcache[i].t; + // not in the cache yet, so will be fetched... + return VMX_CACHE_ST_D; +} + +int vmxcache_get_current_st(dynarec_ppc64le_t* dyn, int ninst, int a) +{ + (void)ninst; + if (!dyn->insts) + return VMX_CACHE_ST_D; + for (int i = 0; i < 24; ++i) + if ((dyn->v.vmxcache[i].t == VMX_CACHE_ST_F + || dyn->v.vmxcache[i].t == VMX_CACHE_ST_D + || dyn->v.vmxcache[i].t == VMX_CACHE_ST_I64) + && dyn->v.vmxcache[i].n == a) + return dyn->v.vmxcache[i].t; + // not in the cache yet, so will be fetched... + return VMX_CACHE_ST_D; +} + +int vmxcache_get_st_f(dynarec_ppc64le_t* dyn, int ninst, int a) +{ + for (int i = 0; i < 24; ++i) + if (dyn->insts[ninst].v.vmxcache[i].t == VMX_CACHE_ST_F + && dyn->insts[ninst].v.vmxcache[i].n == a) + return i; + return -1; +} + +int vmxcache_get_st_f_i64(dynarec_ppc64le_t* dyn, int ninst, int a) +{ + for (int i = 0; i < 24; ++i) + if ((dyn->insts[ninst].v.vmxcache[i].t == VMX_CACHE_ST_I64 || dyn->insts[ninst].v.vmxcache[i].t == VMX_CACHE_ST_F) + && dyn->insts[ninst].v.vmxcache[i].n == a) + return i; + return -1; +} + +static int vmxcache_get_st_f_noback(dynarec_ppc64le_t* dyn, int ninst, int a) +{ + for (int i = 0; i < 24; ++i) + if (dyn->insts[ninst].v.vmxcache[i].t == VMX_CACHE_ST_F + && dyn->insts[ninst].v.vmxcache[i].n == a) + return i; + return -1; +} + +static int vmxcache_get_st_f_i64_noback(dynarec_ppc64le_t* dyn, int ninst, int a) +{ + for (int i = 0; i < 24; ++i) + if ((dyn->insts[ninst].v.vmxcache[i].t == VMX_CACHE_ST_I64 || dyn->insts[ninst].v.vmxcache[i].t == VMX_CACHE_ST_F) + && dyn->insts[ninst].v.vmxcache[i].n == a) + return i; + return -1; +} + +int vmxcache_get_current_st_f(dynarec_ppc64le_t* dyn, int a) +{ + for (int i = 0; i < 24; ++i) + if (dyn->v.vmxcache[i].t == VMX_CACHE_ST_F + && dyn->v.vmxcache[i].n == a) + return i; + return -1; +} + +int vmxcache_get_current_st_f_i64(dynarec_ppc64le_t* dyn, int a) +{ + for (int i = 0; i < 24; ++i) + if ((dyn->v.vmxcache[i].t == VMX_CACHE_ST_I64 || dyn->v.vmxcache[i].t == VMX_CACHE_ST_F) + && dyn->v.vmxcache[i].n == a) + return i; + return -1; +} + +static void vmxcache_promote_double_forward(dynarec_ppc64le_t* dyn, int ninst, int maxinst, int a); +static void vmxcache_promote_double_internal(dynarec_ppc64le_t* dyn, int ninst, int maxinst, int a); +static void vmxcache_promote_double_combined(dynarec_ppc64le_t* dyn, int ninst, int maxinst, int a) +{ + if (a == dyn->insts[ninst].v.combined1 || a == dyn->insts[ninst].v.combined2) { + if (a == dyn->insts[ninst].v.combined1) { + a = dyn->insts[ninst].v.combined2; + } else + a = dyn->insts[ninst].v.combined1; + int i = vmxcache_get_st_f_i64_noback(dyn, ninst, a); + if (i >= 0) { + dyn->insts[ninst].v.vmxcache[i].t = VMX_CACHE_ST_D; + if (dyn->insts[ninst].x87precision) dyn->need_x87check = 2; + if (!dyn->insts[ninst].v.barrier) + vmxcache_promote_double_internal(dyn, ninst - 1, maxinst, a - dyn->insts[ninst].v.stack_push); + // go forward if combined is not pop'd + if (a - dyn->insts[ninst].v.stack_pop >= 0) + if (!dyn->insts[ninst + 1].v.barrier) + vmxcache_promote_double_forward(dyn, ninst + 1, maxinst, a - dyn->insts[ninst].v.stack_pop); + } + } +} +static void vmxcache_promote_double_internal(dynarec_ppc64le_t* dyn, int ninst, int maxinst, int a) +{ + if (dyn->insts[ninst + 1].v.barrier) + return; + while 
(ninst >= 0) { + a += dyn->insts[ninst].v.stack_pop; // adjust Stack depth: add pop'd ST (going backward) + int i = vmxcache_get_st_f_i64(dyn, ninst, a); + if (i < 0) return; + dyn->insts[ninst].v.vmxcache[i].t = VMX_CACHE_ST_D; + if (dyn->insts[ninst].x87precision) dyn->need_x87check = 2; + // check combined propagation too + if (dyn->insts[ninst].v.combined1 || dyn->insts[ninst].v.combined2) { + if (dyn->insts[ninst].v.swapped) { + if (a == dyn->insts[ninst].v.combined1) + a = dyn->insts[ninst].v.combined2; + else if (a == dyn->insts[ninst].v.combined2) + a = dyn->insts[ninst].v.combined1; + } else { + vmxcache_promote_double_combined(dyn, ninst, maxinst, a); + } + } + a -= dyn->insts[ninst].v.stack_push; // adjust Stack depth: remove push'd ST (going backward) + --ninst; + if (ninst < 0 || a < 0 || dyn->insts[ninst].v.barrier) + return; + } +} + +static void vmxcache_promote_double_forward(dynarec_ppc64le_t* dyn, int ninst, int maxinst, int a) +{ + while ((ninst != -1) && (ninst < maxinst) && (a >= 0)) { + a += dyn->insts[ninst].v.stack_push; // adjust Stack depth: add push'd ST (going forward) + if ((dyn->insts[ninst].v.combined1 || dyn->insts[ninst].v.combined2) && dyn->insts[ninst].v.swapped) { + if (a == dyn->insts[ninst].v.combined1) + a = dyn->insts[ninst].v.combined2; + else if (a == dyn->insts[ninst].v.combined2) + a = dyn->insts[ninst].v.combined1; + } + int i = vmxcache_get_st_f_i64_noback(dyn, ninst, a); + if (i < 0) return; + dyn->insts[ninst].v.vmxcache[i].t = VMX_CACHE_ST_D; + if (dyn->insts[ninst].x87precision) dyn->need_x87check = 2; + // check combined propagation too + if ((dyn->insts[ninst].v.combined1 || dyn->insts[ninst].v.combined2) && !dyn->insts[ninst].v.swapped) { + vmxcache_promote_double_combined(dyn, ninst, maxinst, a); + } + a -= dyn->insts[ninst].v.stack_pop; // adjust Stack depth: remove pop'd ST (going forward) + if (dyn->insts[ninst].x64.has_next && !dyn->insts[ninst].v.barrier) + ++ninst; + else + ninst = -1; + } + if (ninst == maxinst) + vmxcache_promote_double(dyn, ninst, a); +} + +void vmxcache_promote_double(dynarec_ppc64le_t* dyn, int ninst, int a) +{ + int i = vmxcache_get_current_st_f_i64(dyn, a); + if (i < 0) return; + dyn->v.vmxcache[i].t = VMX_CACHE_ST_D; + dyn->insts[ninst].v.vmxcache[i].t = VMX_CACHE_ST_D; + if (dyn->insts[ninst].x87precision) dyn->need_x87check = 2; + // check combined propagation too + if (dyn->v.combined1 || dyn->v.combined2) { + if (dyn->v.swapped) { + if (dyn->v.combined1 == a) + a = dyn->v.combined2; + else if (dyn->v.combined2 == a) + a = dyn->v.combined1; + } else { + if (dyn->v.combined1 == a) + vmxcache_promote_double(dyn, ninst, dyn->v.combined2); + else if (dyn->v.combined2 == a) + vmxcache_promote_double(dyn, ninst, dyn->v.combined1); + } + } + a -= dyn->insts[ninst].v.stack_push; // adjust Stack depth: remove push'd ST (going backward) + if (!ninst || a < 0) return; + vmxcache_promote_double_internal(dyn, ninst - 1, ninst, a); +} + +int vmxcache_combine_st(dynarec_ppc64le_t* dyn, int ninst, int a, int b) +{ + dyn->v.combined1 = a; + dyn->v.combined2 = b; + if (vmxcache_get_current_st(dyn, ninst, a) == VMX_CACHE_ST_F + && vmxcache_get_current_st(dyn, ninst, b) == VMX_CACHE_ST_F) + return VMX_CACHE_ST_F; + return VMX_CACHE_ST_D; +} + +static int isCacheEmpty(dynarec_native_t* dyn, int ninst) +{ + if (dyn->insts[ninst].v.stack_next) { + return 0; + } + for (int i = 0; i < 24; ++i) + if (dyn->insts[ninst].v.vmxcache[i].v) { // there is something at ninst for i + if (!( + (dyn->insts[ninst].v.vmxcache[i].t == 
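The promotion walkers above keep rebasing the ST index by each instruction's stack_push/stack_pop as they move through the block. The invariant they rely on, restated as a hypothetical helper (not in the patch):

/* A value that is ST(a) after an instruction was
   ST(a + stack_pop - stack_push) before it; this is exactly the adjustment
   vmxcache_promote_double_internal applies while walking backward, and
   the forward walker applies in reverse. */
static int st_index_before(const vmxcache_t* v, int a)
{
    return a + v->stack_pop - v->stack_push;
}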
VMX_CACHE_ST_F + || dyn->insts[ninst].v.vmxcache[i].t == VMX_CACHE_ST_D + || dyn->insts[ninst].v.vmxcache[i].t == VMX_CACHE_ST_I64) + && dyn->insts[ninst].v.vmxcache[i].n < dyn->insts[ninst].v.stack_pop)) + return 0; + } + return 1; +} + +int fpuCacheNeedsTransform(dynarec_ppc64le_t* dyn, int ninst) +{ + int i2 = dyn->insts[ninst].x64.jmp_insts; + if (i2 < 0) + return 1; + if ((dyn->insts[i2].x64.barrier & BARRIER_FLOAT)) + // if the barrier has already been applied, no transform needed + return ((dyn->insts[ninst].x64.barrier & BARRIER_FLOAT)) ? 0 : (isCacheEmpty(dyn, ninst) ? 0 : 1); + int ret = 0; + if (!i2) { // just purge + if (dyn->insts[ninst].v.stack_next) { + return 1; + } + for (int i = 0; i < 24 && !ret; ++i) + if (dyn->insts[ninst].v.vmxcache[i].v) { // there is something at ninst for i + if (!( + (dyn->insts[ninst].v.vmxcache[i].t == VMX_CACHE_ST_F + || dyn->insts[ninst].v.vmxcache[i].t == VMX_CACHE_ST_D + || dyn->insts[ninst].v.vmxcache[i].t == VMX_CACHE_ST_I64) + && dyn->insts[ninst].v.vmxcache[i].n < dyn->insts[ninst].v.stack_pop)) + ret = 1; + } + return ret; + } + // Check if ninst can be compatible to i2 + if (dyn->insts[ninst].v.stack_next != dyn->insts[i2].v.stack - dyn->insts[i2].v.stack_push) { + return 1; + } + vmxcache_t cache_i2 = dyn->insts[i2].v; + vmxcacheUnwind(&cache_i2); + + for (int i = 0; i < 24; ++i) { + if (dyn->insts[ninst].v.vmxcache[i].v) { // there is something at ninst for i + if (!cache_i2.vmxcache[i].v) { // but there is nothing at i2 for i + ret = 1; + } else if (dyn->insts[ninst].v.vmxcache[i].v != cache_i2.vmxcache[i].v) { // there is something different + if (dyn->insts[ninst].v.vmxcache[i].n != cache_i2.vmxcache[i].n) { // not the same x64 reg + ret = 1; + } else if (dyn->insts[ninst].v.vmxcache[i].t == VMX_CACHE_XMMR && cache_i2.vmxcache[i].t == VMX_CACHE_XMMW) { /* nothing */ + } else if (dyn->insts[ninst].v.vmxcache[i].t == VMX_CACHE_YMMR && cache_i2.vmxcache[i].t == VMX_CACHE_YMMW) { /* nothing */ + } else + ret = 1; + } + } else if (cache_i2.vmxcache[i].v) + ret = 1; + } + return ret; +} + +void vmxcacheUnwind(vmxcache_t* cache) +{ + if (cache->swapped) { + // unswap + int a = -1; + int b = -1; + for (int j = 0; j < 24 && ((a == -1) || (b == -1)); ++j) + if ((cache->vmxcache[j].t == VMX_CACHE_ST_D || cache->vmxcache[j].t == VMX_CACHE_ST_F || cache->vmxcache[j].t == VMX_CACHE_ST_I64)) { + if (cache->vmxcache[j].n == cache->combined1) + a = j; + else if (cache->vmxcache[j].n == cache->combined2) + b = j; + } + if (a != -1 && b != -1) { + int tmp = cache->vmxcache[a].n; + cache->vmxcache[a].n = cache->vmxcache[b].n; + cache->vmxcache[b].n = tmp; + } + cache->swapped = 0; + cache->combined1 = cache->combined2 = 0; + } + if (cache->news) { + // remove the newly created vmxcache + for (int i = 0; i < 24; ++i) + if (cache->news & (1 << i)) + cache->vmxcache[i].v = 0; + cache->news = 0; + } + if (cache->stack_push) { + // unpush + for (int j = 0; j < 24; ++j) { + if ((cache->vmxcache[j].t == VMX_CACHE_ST_D || cache->vmxcache[j].t == VMX_CACHE_ST_F || cache->vmxcache[j].t == VMX_CACHE_ST_I64)) { + if (cache->vmxcache[j].n < cache->stack_push) + cache->vmxcache[j].v = 0; + else + cache->vmxcache[j].n -= cache->stack_push; + } + } + cache->x87stack -= cache->stack_push; + cache->tags >>= (cache->stack_push * 2); + cache->stack -= cache->stack_push; + if (cache->pushed >= cache->stack_push) + cache->pushed -= cache->stack_push; + else + cache->pushed = 0; + cache->stack_push = 0; + } + cache->x87stack += cache->stack_pop; + cache->stack_next = 
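fpuCacheNeedsTransform() above decides whether a branch can reuse the target's existing translation as-is. A worked scenario with illustrative instruction numbers:

/* A loop back-edge at inst 40 jumps to inst 8. If inst 40 currently caches
 * ST0 as VMX_CACHE_ST_F in one slot while inst 8 was translated expecting
 * VMX_CACHE_ST_D in another, the cache layouts are incompatible: the
 * function returns 1 and pass3 must emit fixup transfers on that edge
 * (or fall back to a float barrier) before branching. */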
cache->stack; + if (cache->stack_pop) { + if (cache->poped >= cache->stack_pop) + cache->poped -= cache->stack_pop; + else + cache->poped = 0; + cache->tags <<= (cache->stack_pop * 2); + } + cache->stack_pop = 0; + cache->barrier = 0; + // And now, rebuild the x87cache info with vmxcache + cache->mmxcount = 0; + cache->fpu_scratch = 0; + for (int i = 0; i < 8; ++i) { + cache->x87cache[i] = -1; + cache->mmxcache[i] = -1; + cache->x87reg[i] = 0; + cache->ssecache[i * 2].v = -1; + cache->ssecache[i * 2 + 1].v = -1; + cache->avxcache[i * 2].v = -1; + cache->avxcache[i * 2 + 1].v = -1; + } + int x87reg = 0; + for (int i = 0; i < 24; ++i) { + if (cache->vmxcache[i].v) { + cache->fpuused[i] = 1; + switch (cache->vmxcache[i].t) { + case VMX_CACHE_MM: + cache->mmxcache[cache->vmxcache[i].n] = i; + ++cache->mmxcount; + break; + case VMX_CACHE_XMMR: + case VMX_CACHE_XMMW: + cache->ssecache[cache->vmxcache[i].n].reg = i; + cache->ssecache[cache->vmxcache[i].n].write = (cache->vmxcache[i].t == VMX_CACHE_XMMW) ? 1 : 0; + break; + case VMX_CACHE_YMMR: + case VMX_CACHE_YMMW: + cache->avxcache[cache->vmxcache[i].n].reg = i; + cache->avxcache[cache->vmxcache[i].n].write = (cache->vmxcache[i].t == VMX_CACHE_YMMW) ? 1 : 0; + break; + case VMX_CACHE_ST_F: + case VMX_CACHE_ST_D: + case VMX_CACHE_ST_I64: + cache->x87cache[x87reg] = cache->vmxcache[i].n; + cache->x87reg[x87reg] = i; + ++x87reg; + break; + case VMX_CACHE_SCR: + cache->fpuused[i] = 0; + cache->vmxcache[i].v = 0; + break; + } + } else { + cache->fpuused[i] = 0; + } + } +} + +const char* getCacheName(int t, int n) +{ + static char buff[20]; + switch (t) { + case VMX_CACHE_ST_D: sprintf(buff, "ST%d", n); break; + case VMX_CACHE_ST_F: sprintf(buff, "st%d", n); break; + case VMX_CACHE_ST_I64: sprintf(buff, "STi%d", n); break; + case VMX_CACHE_MM: sprintf(buff, "MM%d", n); break; + case VMX_CACHE_XMMW: sprintf(buff, "XMM%d", n); break; + case VMX_CACHE_XMMR: sprintf(buff, "xmm%d", n); break; + case VMX_CACHE_YMMW: sprintf(buff, "YMM%d", n); break; + case VMX_CACHE_YMMR: sprintf(buff, "ymm%d", n); break; + case VMX_CACHE_SCR: sprintf(buff, "Scratch"); break; + case VMX_CACHE_NONE: buff[0] = '\0'; break; + } + return buff; +} + +// PPC64LE register mapping for debug output +// x86 regs are mapped to PPC64LE callee-saved GPRs r14-r29 +static register_mapping_t register_mappings[] = { + { "rax", "r14" }, + { "eax", "r14" }, + { "ax", "r14" }, + { "ah", "r14" }, + { "al", "r14" }, + { "rcx", "r15" }, + { "ecx", "r15" }, + { "cx", "r15" }, + { "ch", "r15" }, + { "cl", "r15" }, + { "rdx", "r16" }, + { "edx", "r16" }, + { "dx", "r16" }, + { "dh", "r16" }, + { "dl", "r16" }, + { "rbx", "r17" }, + { "ebx", "r17" }, + { "bx", "r17" }, + { "bh", "r17" }, + { "bl", "r17" }, + { "rsi", "r20" }, + { "esi", "r20" }, + { "si", "r20" }, + { "sil", "r20" }, + { "rdi", "r21" }, + { "edi", "r21" }, + { "di", "r21" }, + { "dil", "r21" }, + { "rsp", "r18" }, + { "esp", "r18" }, + { "sp", "r18" }, + { "spl", "r18" }, + { "rbp", "r19" }, + { "ebp", "r19" }, + { "bp", "r19" }, + { "bpl", "r19" }, + { "r8", "r22" }, + { "r8d", "r22" }, + { "r8w", "r22" }, + { "r8b", "r22" }, + { "r9", "r23" }, + { "r9d", "r23" }, + { "r9w", "r23" }, + { "r9b", "r23" }, + { "r10", "r24" }, + { "r10d", "r24" }, + { "r10w", "r24" }, + { "r10b", "r24" }, + { "r11", "r25" }, + { "r11d", "r25" }, + { "r11w", "r25" }, + { "r11b", "r25" }, + { "r12", "r26" }, + { "r12d", "r26" }, + { "r12w", "r26" }, + { "r12b", "r26" }, + { "r13", "r27" }, + { "r13d", "r27" }, + { "r13w", "r27" }, + { "r13b", "r27" }, + 
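vmxcacheUnwind() rewinds one instruction's effects on the cache; in practice it is consumed through the fpu_save_and_unwind()/fpu_unwind_restore() pair defined further down in this file. A sketch of the round-trip (illustrative):

vmxcache_t saved;
fpu_save_and_unwind(dyn, ninst, &saved);  // insts[ninst].v now describes the
                                          // state *before* the instruction
/* ... compare against a jump target's expected cache layout ... */
fpu_unwind_restore(dyn, ninst, &saved);   // restore the post-instruction state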
{ "r14", "r28" }, + { "r14d", "r28" }, + { "r14w", "r28" }, + { "r14b", "r28" }, + { "r15", "r29" }, + { "r15d", "r29" }, + { "r15w", "r29" }, + { "r15b", "r29" }, + { "rip", "r9" }, +}; + +// PPC64LE VMX register names for debug output +// vr0-vr31 (= vs32-vs63) +static const char* Vt[] = { "vr0", "vr1", "vr2", "vr3", "vr4", "vr5", "vr6", "vr7", "vr8", "vr9", "vr10", "vr11", "vr12", "vr13", "vr14", "vr15", "vr16", "vr17", "vr18", "vr19", "vr20", "vr21", "vr22", "vr23", "vr24", "vr25", "vr26", "vr27", "vr28", "vr29", "vr30", "vr31" }; + +static const char* df_status[] = { "unknown", "set", "none_pending", "none" }; + +void printf_x64_instruction(dynarec_native_t* dyn, zydis_dec_t* dec, instruction_x64_t* inst, const char* name); +void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t rex) +{ + if (!dyn->need_dump && !BOX64ENV(dynarec_gdbjit) && !BOX64ENV(dynarec_perf_map)) return; + + static char buf[4096]; + int length = sprintf(buf, "barrier=%d state=%d/%s(%s->%s), set=%X/%X, use=%X, need=%X/%X, fuse=%d/%d, sm=%d(%d/%d)", + dyn->insts[ninst].x64.barrier, + dyn->insts[ninst].x64.state_flags, + df_status[dyn->f], + df_status[dyn->insts[ninst].f_entry], + df_status[dyn->insts[ninst].f_exit], + dyn->insts[ninst].x64.set_flags, + dyn->insts[ninst].x64.gen_flags, + dyn->insts[ninst].x64.use_flags, + dyn->insts[ninst].x64.need_before, + dyn->insts[ninst].x64.need_after, + dyn->insts[ninst].nat_flags_fusion, + dyn->insts[ninst].no_scratch_usage, + dyn->smwrite, dyn->insts[ninst].will_write, dyn->insts[ninst].last_write); + if (dyn->insts[ninst].pred_sz) { + length += sprintf(buf + length, ", pred="); + for (int ii = 0; ii < dyn->insts[ninst].pred_sz; ++ii) + length += sprintf(buf + length, "%s%d", ii ? "/" : "", dyn->insts[ninst].pred[ii]); + } + if (dyn->insts[ninst].x64.jmp && dyn->insts[ninst].x64.jmp_insts >= 0) + length += sprintf(buf + length, ", jmp=%d", dyn->insts[ninst].x64.jmp_insts); + if (dyn->insts[ninst].x64.jmp && dyn->insts[ninst].x64.jmp_insts == -1) + length += sprintf(buf + length, ", jmp=out"); + if (dyn->last_ip) + length += sprintf(buf + length, ", last_ip=%p", (void*)dyn->last_ip); + for (int ii = 0; ii < 24; ++ii) { + switch (dyn->insts[ninst].v.vmxcache[ii].t) { + case VMX_CACHE_ST_D: length += sprintf(buf + length, " V%d:%s", ii, getCacheName(dyn->insts[ninst].v.vmxcache[ii].t, dyn->insts[ninst].v.vmxcache[ii].n)); break; + case VMX_CACHE_ST_F: length += sprintf(buf + length, " V%d:%s", ii, getCacheName(dyn->insts[ninst].v.vmxcache[ii].t, dyn->insts[ninst].v.vmxcache[ii].n)); break; + case VMX_CACHE_ST_I64: length += sprintf(buf + length, " V%d:%s", ii, getCacheName(dyn->insts[ninst].v.vmxcache[ii].t, dyn->insts[ninst].v.vmxcache[ii].n)); break; + case VMX_CACHE_MM: length += sprintf(buf + length, " %s:%s", Vt[ii], getCacheName(dyn->insts[ninst].v.vmxcache[ii].t, dyn->insts[ninst].v.vmxcache[ii].n)); break; + case VMX_CACHE_XMMW: length += sprintf(buf + length, " %s:%s", Vt[ii], getCacheName(dyn->insts[ninst].v.vmxcache[ii].t, dyn->insts[ninst].v.vmxcache[ii].n)); break; + case VMX_CACHE_XMMR: length += sprintf(buf + length, " %s:%s", Vt[ii], getCacheName(dyn->insts[ninst].v.vmxcache[ii].t, dyn->insts[ninst].v.vmxcache[ii].n)); break; + case VMX_CACHE_YMMW: length += sprintf(buf + length, " %s:%s%s", Vt[ii], getCacheName(dyn->insts[ninst].v.vmxcache[ii].t, dyn->insts[ninst].v.vmxcache[ii].n), dyn->insts[ninst].v.avxcache[dyn->insts[ninst].v.vmxcache[ii].n].zero_upper == 1 ? 
"-UZ" : ""); break; + case VMX_CACHE_YMMR: length += sprintf(buf + length, " %s:%s%s", Vt[ii], getCacheName(dyn->insts[ninst].v.vmxcache[ii].t, dyn->insts[ninst].v.vmxcache[ii].n), dyn->insts[ninst].v.avxcache[dyn->insts[ninst].v.vmxcache[ii].n].zero_upper == 1 ? "-UZ" : ""); break; + case VMX_CACHE_SCR: length += sprintf(buf + length, " %s:%s", Vt[ii], getCacheName(dyn->insts[ninst].v.vmxcache[ii].t, dyn->insts[ninst].v.vmxcache[ii].n)); break; + case VMX_CACHE_NONE: + default: break; + } + } + if (dyn->v.stack || dyn->insts[ninst].v.stack_next || dyn->insts[ninst].v.x87stack) + length += sprintf(buf + length, " X87:%d/%d(+%d/-%d)%d", dyn->v.stack, dyn->insts[ninst].v.stack_next, dyn->insts[ninst].v.stack_push, dyn->insts[ninst].v.stack_pop, dyn->insts[ninst].v.x87stack); + if (dyn->insts[ninst].v.combined1 || dyn->insts[ninst].v.combined2) + length += sprintf(buf + length, " %s:%d/%d", dyn->insts[ninst].v.swapped ? "SWP" : "CMB", dyn->insts[ninst].v.combined1, dyn->insts[ninst].v.combined2); + + if (dyn->need_dump) { + printf_x64_instruction(dyn, rex.is32bits ? my_context->dec32 : my_context->dec, &dyn->insts[ninst].x64, name); + dynarec_log(LOG_NONE, "%s%p: %d emitted opcodes, inst=%d, %s%s\n", + (dyn->need_dump > 1) ? "\e[32m" : "", + (void*)(dyn->native_start + dyn->insts[ninst].address), dyn->insts[ninst].size / 4, ninst, buf, (dyn->need_dump > 1) ? "\e[m" : ""); + } + if (BOX64ENV(dynarec_gdbjit)) { + static char buf2[512]; + if (BOX64ENV(dynarec_gdbjit) > 1) { + snprintf(buf2, sizeof(buf2), "; %d: %d opcodes, %s", ninst, dyn->insts[ninst].size / 4, buf); + dyn->gdbjit_block = GdbJITBlockAddLine(dyn->gdbjit_block, (dyn->native_start + dyn->insts[ninst].address), buf2); + } + zydis_dec_t* dec = rex.is32bits ? my_context->dec32 : my_context->dec; + const char* inst_name = name; + if (dec) { + inst_name = DecodeX64Trace(dec, dyn->insts[ninst].x64.addr, 0); + x64disas_add_register_mapping_annotations(buf2, inst_name, register_mappings, sizeof(register_mappings) / sizeof(register_mappings[0])); + inst_name = buf2; + } + dyn->gdbjit_block = GdbJITBlockAddLine(dyn->gdbjit_block, (dyn->native_start + dyn->insts[ninst].address), inst_name); + } + if (BOX64ENV(dynarec_perf_map) && BOX64ENV(dynarec_perf_map_fd) != -1) { + writePerfMap(dyn->insts[ninst].x64.addr, dyn->native_start + dyn->insts[ninst].address, dyn->insts[ninst].size / 4, name); + } + if (length > (int)sizeof(buf)) printf_log(LOG_NONE, "Warning: buf too small in inst_name_pass3 (%d vs %zd)\n", length, sizeof(buf)); +} + +void print_opcode(dynarec_native_t* dyn, int ninst, uint32_t opcode) +{ + dynarec_log_prefix(0, LOG_NONE, "\t%08x\t%s\n", opcode, ppc64le_print(opcode, (uintptr_t)dyn->block)); +} + +static void x87_reset(vmxcache_t* v) +{ + for (int i = 0; i < 8; ++i) + v->x87cache[i] = -1; + v->tags = 0; + v->x87stack = 0; + v->stack = 0; + v->stack_next = 0; + v->stack_pop = 0; + v->stack_push = 0; + v->combined1 = v->combined2 = 0; + v->swapped = 0; + v->barrier = 0; + v->pushed = 0; + v->poped = 0; + + for (int i = 0; i < 24; ++i) + if (v->vmxcache[i].t == VMX_CACHE_ST_F + || v->vmxcache[i].t == VMX_CACHE_ST_D + || v->vmxcache[i].t == VMX_CACHE_ST_I64) + v->vmxcache[i].v = 0; +} + +static void mmx_reset(vmxcache_t* v) +{ + v->mmxcount = 0; + for (int i = 0; i < 8; ++i) + v->mmxcache[i] = -1; +} + +static void sse_reset(vmxcache_t* v) +{ + for (int i = 0; i < 16; ++i) + v->ssecache[i].v = -1; +} +static void avx_reset(vmxcache_t* v) +{ + for (int i = 0; i < 16; ++i) + v->avxcache[i].v = -1; +} + +void 
fpu_reset(dynarec_ppc64le_t* dyn)
+{
+ x87_reset(&dyn->v);
+ mmx_reset(&dyn->v);
+ sse_reset(&dyn->v);
+ avx_reset(&dyn->v);
+ fpu_reset_reg(dyn);
+}
+
+int fpu_is_st_freed(dynarec_ppc64le_t* dyn, int ninst, int st)
+{
+ return (dyn->v.tags & (0b11 << (st * 2))) ? 1 : 0;
+}
+
+
+void fpu_reset_ninst(dynarec_ppc64le_t* dyn, int ninst)
+{
+ // TODO: x87 and mmx
+ sse_reset(&dyn->insts[ninst].v);
+ avx_reset(&dyn->insts[ninst].v);
+ fpu_reset_reg_vmxcache(&dyn->insts[ninst].v);
+}
+
+void fpu_save_and_unwind(dynarec_ppc64le_t* dyn, int ninst, vmxcache_t* cache)
+{
+ memcpy(cache, &dyn->insts[ninst].v, sizeof(vmxcache_t));
+ vmxcacheUnwind(&dyn->insts[ninst].v);
+}
+void fpu_unwind_restore(dynarec_ppc64le_t* dyn, int ninst, vmxcache_t* cache)
+{
+ memcpy(&dyn->insts[ninst].v, cache, sizeof(vmxcache_t));
+}
+
+void updateNativeFlags(dynarec_ppc64le_t* dyn)
+{
+ if (!BOX64ENV(dynarec_nativeflags))
+ return;
+ for (int i = 1; i < dyn->size; ++i)
+ if (dyn->insts[i].nat_flags_fusion) {
+ int j = i - 1;
+ int found = 0;
+ if (dyn->insts[i].pred_sz == 1 && dyn->insts[i].pred[0] == j) {
+ while (j >= 0) {
+ if (dyn->insts[j].x64.set_flags && (dyn->insts[i].x64.use_flags & dyn->insts[j].x64.set_flags) == dyn->insts[i].x64.use_flags) {
+ dyn->insts[j].nat_flags_fusion = 1;
+ if (dyn->insts[i].x64.use_flags & X_SF) {
+ dyn->insts[j].nat_flags_needsign = 1;
+ }
+ dyn->insts[i].x64.use_flags = 0;
+ dyn->insts[j].nat_next_inst = i;
+ found = 1;
+ break;
+ } else if (j && dyn->insts[j].pred_sz == 1 && dyn->insts[j].pred[0] == j - 1
+ && dyn->insts[j].no_scratch_usage && !dyn->insts[j].x64.set_flags && !dyn->insts[j].x64.use_flags) {
+ j -= 1;
+ } else
+ break;
+ }
+ }
+ if (!found) dyn->insts[i].nat_flags_fusion = 0;
+ }
+}
+
+void get_free_scratch(dynarec_ppc64le_t* dyn, int ninst, uint8_t* tmp1, uint8_t* tmp2, uint8_t* tmp3, uint8_t s1, uint8_t s2, uint8_t s3, uint8_t s4, uint8_t s5)
+{
+ uint8_t n1 = dyn->insts[ninst].nat_flags_op1;
+ uint8_t n2 = dyn->insts[ninst].nat_flags_op2;
+ uint8_t tmp[5] = { 0 };
+ int idx = 0;
+#define GO(s) \
+ if ((s != n1) && (s != n2)) tmp[idx++] = s
+ GO(s1);
+ GO(s2);
+ GO(s3);
+ GO(s4);
+ GO(s5);
+#undef GO
+ *tmp1 = tmp[0];
+ *tmp2 = tmp[1];
+ *tmp3 = tmp[2];
+}
+
+void tryEarlyFpuBarrier(dynarec_ppc64le_t* dyn, int last_fpu_used, int ninst)
+{
+ // there is a barrier at ninst
+ // check whether, up to last_fpu_used, there is some suspicious jump that would prevent the barrier from being placed earlier
+ int usefull = 0;
+ for (int i = ninst - 1; i > last_fpu_used; --i) {
+ if (!dyn->insts[i].x64.has_next)
+ return; // chain is broken, don't try to be smart for now
+ if (dyn->insts[i].x64.barrier & BARRIER_FLOAT)
+ return; // already done?
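updateNativeFlags() above fuses an x86 flag producer (e.g. CMP) with a later consumer (e.g. Jcc) when every intervening instruction is flag-neutral and scratch-free. The core eligibility test it applies, extracted as a standalone sketch:

/* The producer must define every flag the consumer reads. */
static int can_fuse(uint32_t producer_set_flags, uint32_t consumer_use_flags)
{
    return producer_set_flags != 0
        && (consumer_use_flags & producer_set_flags) == consumer_use_flags;
}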
+ if (dyn->insts[i].x64.jmp && dyn->insts[i].x64.jmp_insts == -1)
+ usefull = 1;
+ if (dyn->insts[i].x64.jmp && dyn->insts[i].x64.jmp_insts != -1) {
+ int i2 = dyn->insts[i].x64.jmp_insts;
+ if (i2 < last_fpu_used || i2 > ninst) {
+ // check whether some xmm/ymm/x87 stack is used at the landing point
+ if (i2 > ninst) {
+ if (dyn->insts[i2].v.xmm_used || dyn->insts[i2].v.ymm_used || dyn->insts[i2].v.stack)
+ return;
+ }
+ // we stop there, rather than trying to guess too much
+ if ((usefull && (i + 1) != ninst)) {
+ if (BOX64ENV(dynarec_dump) || BOX64ENV(dynarec_log) > 1) dynarec_log(LOG_NONE, "Putting early Float Barrier in %d for %d\n", i + 1, ninst);
+ dyn->insts[i + 1].x64.barrier |= BARRIER_FLOAT;
+ }
+ return;
+ }
+ usefull = 1;
+ }
+ for (int pred = 0; pred < dyn->insts[i].pred_sz; ++pred) {
+ if (dyn->insts[i].pred[pred] <= last_fpu_used) {
+ if (usefull && ((i + 1) != ninst)) {
+ if (BOX64ENV(dynarec_dump) || BOX64ENV(dynarec_log) > 1) dynarec_log(LOG_NONE, "Putting early Float Barrier in %d for %d\n", i + 1, ninst);
+ dyn->insts[i + 1].x64.barrier |= BARRIER_FLOAT;
+ }
+ return;
+ }
+ }
+ if (dyn->insts[i].pred_sz > 1)
+ usefull = 1;
+ }
+ if (usefull) {
+ if (BOX64ENV(dynarec_dump) || BOX64ENV(dynarec_log) > 1) dynarec_log(LOG_NONE, "Putting early Float Barrier in %d for %d\n", last_fpu_used + 1, ninst);
+ dyn->insts[last_fpu_used + 1].x64.barrier |= BARRIER_FLOAT;
+ }
+}
+
+void propagateFpuBarrier(dynarec_ppc64le_t* dyn)
+{
+ if (!dyn->use_x87)
+ return;
+ int last_fpu_used = -1;
+ for (int ninst = 0; ninst < dyn->size; ++ninst) {
+ int fpu_used = dyn->insts[ninst].v.xmm_used || dyn->insts[ninst].v.ymm_used || dyn->insts[ninst].mmx_used || dyn->insts[ninst].x87_used;
+ if (fpu_used) last_fpu_used = ninst;
+ dyn->insts[ninst].fpu_used = fpu_used;
+ if (dyn->insts[ninst].fpupurge && (last_fpu_used != -1) && (last_fpu_used != (ninst - 1))) {
+ tryEarlyFpuBarrier(dyn, last_fpu_used, ninst);
+ last_fpu_used = -1; // reset the last_fpu_used...
+ }
+ }
+}
+
+void updateYmm0s(dynarec_ppc64le_t* dyn, int ninst, int max_ninst_reached)
+{
+ if (!dyn->use_ymm)
+ return;
+ int can_incr = ninst == max_ninst_reached; // Are we the top-level call?
+ int ok = 1;
+ while ((can_incr || ok) && ninst < dyn->size) {
+ uint16_t new_purge_ymm, new_ymm0_in, new_ymm0_out;
+
+ if (dyn->insts[ninst].pred_sz && dyn->insts[ninst].x64.alive) {
+ uint16_t ymm0_union = 0;
+ uint16_t ymm0_inter = (ninst && !(dyn->insts[ninst].x64.barrier & BARRIER_FLOAT)) ? ((uint16_t)-1) : (uint16_t)0;
+ for (int i = 0; i < dyn->insts[ninst].pred_sz; ++i) {
+ int pred = dyn->insts[ninst].pred[i];
+ if (pred >= max_ninst_reached) {
+ continue;
+ }
+
+ int pred_out = dyn->insts[pred].x64.has_callret ? 0 : dyn->insts[pred].ymm0_out;
+ ymm0_union |= pred_out;
+ ymm0_inter &= pred_out;
+ }
+ new_purge_ymm = ymm0_union & ~ymm0_inter;
+ new_ymm0_in = ymm0_inter;
+ new_ymm0_out = (ymm0_inter | dyn->insts[ninst].ymm0_add) & ~dyn->insts[ninst].ymm0_sub;
+
+ if ((dyn->insts[ninst].purge_ymm != new_purge_ymm) || (dyn->insts[ninst].ymm0_in != new_ymm0_in) || (dyn->insts[ninst].ymm0_out != new_ymm0_out)) {
+ dyn->insts[ninst].purge_ymm = new_purge_ymm;
+ dyn->insts[ninst].ymm0_in = new_ymm0_in;
+ dyn->insts[ninst].ymm0_out = new_ymm0_out;
+
+ if (can_incr) {
+ ++max_ninst_reached;
+ } else {
+ ok = (max_ninst_reached - 1 != ninst) && dyn->insts[ninst].x64.has_next && !dyn->insts[ninst].x64.has_callret;
+ }
+
+ int jmp = (dyn->insts[ninst].x64.jmp) ?
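tryEarlyFpuBarrier()/propagateFpuBarrier() above hoist a float barrier from a purge point back toward the last real FPU use, so that a long FPU-free stretch does not keep the FPU cache live. A worked scenario with illustrative instruction numbers:

/* insts 0..9 touch x87, insts 10..49 are integer-only, inst 50 purges FPU
 * state (fpupurge). Without hoisting, the x87 cache would stay resident
 * across 10..49. With it, BARRIER_FLOAT lands at inst 10
 * (last_fpu_used + 1), flushing the cache once, right after its final use.
 * The backward scan bails out if a jump into or out of the window would
 * make the hoist unsafe. */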
dyn->insts[ninst].x64.jmp_insts : -1; + if ((jmp != -1) && (jmp < max_ninst_reached)) { + updateYmm0s(dyn, jmp, max_ninst_reached); + } + } else { + if (can_incr) { + ++max_ninst_reached; + + int jmp = (dyn->insts[ninst].x64.jmp) ? dyn->insts[ninst].x64.jmp_insts : -1; + if ((jmp != -1) && (jmp < max_ninst_reached)) { + updateYmm0s(dyn, jmp, max_ninst_reached); + } + } else { + ok = 0; + } + } + } else if (can_incr) { + ++max_ninst_reached; + } else { + ok = 0; + } + ++ninst; + } +} + +/* + * Fast word-at-a-time hash for block change detection on PPC64LE. + * Replaces the software CRC-32C (slicing-by-4 with 4KB lookup tables) + * which was consuming ~18.5% of CPU due to cache pollution. + * + * This processes 8 bytes per iteration using simple multiply-xorshift + * mixing. Empirical measurements show block sizes are heavily right-skewed: + * median ~24 bytes, mean ~65 bytes, p90 ~105-165 bytes (measured across + * ioquake3, UT99, bash, and unit tests with DYNAREC_HIST). Most blocks + * need only ~3 iterations, with no lookup table cache pressure. + * + * The hash is used purely for equality-based change detection (not for + * hash table distribution), so cryptographic quality is not needed. + */ +uint32_t ppc64le_fast_hash(void* p, uint32_t len) +{ + const uint8_t* data = (const uint8_t*)p; + /* Primes chosen for good bit mixing on multiply */ + uint64_t h = len * UINT64_C(0x9E3779B97F4A7C15); /* golden ratio * 2^64 */ + + /* Process 8 bytes at a time */ + while (len >= 8) { + uint64_t k; + __builtin_memcpy(&k, data, 8); + k *= UINT64_C(0xBF58476D1CE4E5B9); + k ^= k >> 31; + h ^= k; + h *= UINT64_C(0x94D049BB133111EB); + data += 8; + len -= 8; + } + + /* Process remaining bytes */ + if (len) { + uint64_t k = 0; + /* Safe: len is 1..7 here */ + switch(len) { + case 7: k |= (uint64_t)data[6] << 48; /* fall through */ + case 6: k |= (uint64_t)data[5] << 40; /* fall through */ + case 5: k |= (uint64_t)data[4] << 32; /* fall through */ + case 4: k |= (uint64_t)data[3] << 24; /* fall through */ + case 3: k |= (uint64_t)data[2] << 16; /* fall through */ + case 2: k |= (uint64_t)data[1] << 8; /* fall through */ + case 1: k |= (uint64_t)data[0]; + } + k *= UINT64_C(0xBF58476D1CE4E5B9); + k ^= k >> 31; + h ^= k; + h *= UINT64_C(0x94D049BB133111EB); + } + + /* Final mix — fold 64 bits down to 32 */ + h ^= h >> 33; + h *= UINT64_C(0xFF51AFD7ED558CCD); + h ^= h >> 33; + + return (uint32_t)h; +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_functions.h b/src/dynarec/ppc64le/dynarec_ppc64le_functions.h new file mode 100644 index 0000000000..bc5f8868b5 --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_functions.h @@ -0,0 +1,109 @@ +#ifndef __DYNAREC_PPC64LE_FUNCTIONS_H__ +#define __DYNAREC_PPC64LE_FUNCTIONS_H__ + +#include "../dynarec_native_functions.h" + +// First VMX scratch register index +#define SCRATCH0 24 + +// Map cache index to VSX hardware register number (vs0-vs63). +// XMM indices 0-15 → vs32-vs47 (VMX vr0-vr15), enabling VMX integer SIMD ops. +// MMX indices 16-23 → vs56-vs63 (VMX vr24-vr31), enabling VMX integer SIMD ops. +// Scratch indices 24-31 → vs48-vs55 (VMX vr16-vr23), enabling VMX ops on scratch. +// +// The scratch mapping is critical: VMX (VX/VA-form) instructions encode a 5-bit +// VR number (vr0-vr31 = vs32-vs63). Placing scratch in vs48-vs55 (vr16-vr23) +// allows VRREG() to produce the correct VR number for both XMM and scratch regs. +// Note: vr20-vr23 (vs52-vs55) are callee-saved per ELFv2 ABI. 
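As the comment block above ppc64le_fast_hash() spells out, the hash is used only for equality-based change detection on already-translated x86 code. A usage sketch with illustrative variable names:

/* Remember the hash of the source bytes at translation time... */
uint32_t hash0 = ppc64le_fast_hash(block_start, block_size);
/* ...and later, before trusting the translation, detect modified code: */
if (ppc64le_fast_hash(block_start, block_size) != hash0) {
    /* the x86 bytes changed (e.g. self-modifying code): retranslate */
}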
We save vr20 in +// the prolog since the maximum simultaneous scratch count is 5 (indices 24-28). +// +// x87 and MMX are mutually exclusive (EMMS separates them), but on PPC64LE their +// hardware register requirements differ: x87 needs FPR (scalar FP) while MMX needs +// VR (VMX integer SIMD). Since FPR and VR are NOT aliased on POWER (unlike LA64), +// VSXREG routes MMX indices 16-23 to VR space (vs56-vs63), while VSXREG_X87 routes +// them to FPR space (vs16-vs23). x87 code must use VSXREG_X87 instead of VSXREG. +// vr24-vr31 (vs56-vs63) are callee-saved; the prolog/epilog saves/restores them. +#define VSXREG(idx) ((idx) < 16 ? (idx) + 32 : (idx) < 24 ? (idx) + 40 : (idx) + 24) +#define VSXREG_X87(idx) (idx) /* x87 idx 16-23 → vs16-vs23 (FPR f16-f23) */ + +// Extract VR register number for VMX integer instructions (5-bit, vr0-vr31). +// XMM cache indices 0-15 → vr0-vr15 (vs32-vs47). +// MMX cache indices 16-23 → vr24-vr31 (vs56-vs63). +// Scratch cache indices 24-31 → vr16-vr23 (vs48-vs55). +#define VRREG(idx) ((idx) < 16 ? (idx) : (idx) < 24 ? (idx) + 8 : (idx) - 8) + +typedef struct x64emu_s x64emu_t; +typedef struct dynarec_ppc64le_s dynarec_ppc64le_t; + +// Get an FPU scratch reg +int fpu_get_scratch(dynarec_ppc64le_t* dyn); +// Reset scratch regs counter +void fpu_reset_scratch(dynarec_ppc64le_t* dyn); +// Get an x87 double reg +int fpu_get_reg_x87(dynarec_ppc64le_t* dyn, int t, int n); +// Get an XMM quad reg +int fpu_get_reg_xmm(dynarec_ppc64le_t* dyn, int t, int xmm); +// Get an YMM quad reg +int fpu_get_reg_ymm(dynarec_ppc64le_t* dyn, int t, int ymm); +// Free a FPU/MMX/XMM reg +void fpu_free_reg(dynarec_ppc64le_t* dyn, int reg); +// Reset fpu regs counter +void fpu_reset_reg(dynarec_ppc64le_t* dyn); +// Get an MMX double reg +int fpu_get_reg_emm(dynarec_ppc64le_t* dyn, int emm); + +// ---- VMX cache functions +// Get type for STx +int vmxcache_get_st(dynarec_ppc64le_t* dyn, int ninst, int a); +// Get if STx is FLOAT or DOUBLE +int vmxcache_get_st_f(dynarec_ppc64le_t* dyn, int ninst, int a); +// Get if STx is FLOAT or I64 +int vmxcache_get_st_f_i64(dynarec_ppc64le_t* dyn, int ninst, int a); +// Get actual type for STx +int vmxcache_get_current_st(dynarec_ppc64le_t* dyn, int ninst, int a); +// Get actual STx is FLOAT or DOUBLE +int vmxcache_get_current_st_f(dynarec_ppc64le_t* dyn, int a); +// Get actual STx is FLOAT or I64 +int vmxcache_get_current_st_f_i64(dynarec_ppc64le_t* dyn, int a); +// Back-propagate a change float->double +void vmxcache_promote_double(dynarec_ppc64le_t* dyn, int ninst, int a); +// Combine and propagate if needed (pass 1 only) +int vmxcache_combine_st(dynarec_ppc64le_t* dyn, int ninst, int a, int b); // with stack current dyn->v_stack* +// Do not allow i64 type +int vmxcache_no_i64(dynarec_ppc64le_t* dyn, int ninst, int st, int a); + +// FPU Cache transformation (for loops) // Specific, need to be written by backend +int fpuCacheNeedsTransform(dynarec_ppc64le_t* dyn, int ninst); + +// Undo the changes of a vmxcache to get the status before the instruction +void vmxcacheUnwind(vmxcache_t* cache); +void fpu_save_and_unwind(dynarec_ppc64le_t* dyn, int ninst, vmxcache_t* cache); +void fpu_unwind_restore(dynarec_ppc64le_t* dyn, int ninst, vmxcache_t* cache); + +const char* getCacheName(int t, int n); + +void inst_name_pass3(dynarec_native_t* dyn, int ninst, const char* name, rex_t rex); +void print_opcode(dynarec_native_t* dyn, int ninst, uint32_t opcode); + +// reset the cache +void fpu_reset(dynarec_native_t* dyn); +void fpu_reset_ninst(dynarec_native_t* 
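Since the index arithmetic in VSXREG/VRREG is easy to get wrong, the three ranges can be restated as compile-time checks (illustrative; each value follows directly from the macro definitions above):

_Static_assert(VSXREG(0) == 32, "XMM0 -> vs32 (vr0)");
_Static_assert(VSXREG(15) == 47, "XMM15 -> vs47 (vr15)");
_Static_assert(VSXREG(16) == 56, "MMX0 -> vs56 (vr24, callee-saved)");
_Static_assert(VSXREG(24) == 48, "scratch0 -> vs48 (vr16)");
_Static_assert(VSXREG_X87(16) == 16, "x87 slot 0 -> vs16 (FPR f16)");
_Static_assert(VRREG(16) == 24, "MMX0 encodes as vr24 in VX/VA forms");
_Static_assert(VRREG(24) == 16, "scratch0 encodes as vr16 in VX/VA forms");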
dyn, int ninst);
+// is st freed
+int fpu_is_st_freed(dynarec_native_t* dyn, int ninst, int st);
+
+// propagate FPU_BARRIER to trigger it as soon as possible (avoiding fetching an FPU reg if it's unused)
+void propagateFpuBarrier(dynarec_ppc64le_t* dyn);
+
+// propagate the unneeded flags on XMM/YMM regs (done between step 0 and step 1)
+void updateYmm0s(dynarec_ppc64le_t* dyn, int ninst, int max_ninst_reached);
+
+// Update native flags fusion info
+void updateNativeFlags(dynarec_ppc64le_t* dyn);
+
+// Get free scratch registers avoiding native flag operands
+void get_free_scratch(dynarec_ppc64le_t* dyn, int ninst, uint8_t* tmp1, uint8_t* tmp2, uint8_t* tmp3, uint8_t s1, uint8_t s2, uint8_t s3, uint8_t s4, uint8_t s5);
+
+// Try to put FPU barrier earlier
+void tryEarlyFpuBarrier(dynarec_ppc64le_t* dyn, int last_fpu_used, int ninst);
+
+#endif //__DYNAREC_PPC64LE_FUNCTIONS_H__
diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_helper.c b/src/dynarec/ppc64le/dynarec_ppc64le_helper.c
new file mode 100644
index 0000000000..543672378a
--- /dev/null
+++ b/src/dynarec/ppc64le/dynarec_ppc64le_helper.c
@@ -0,0 +1,2495 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "debug.h"
+#include "box64context.h"
+#include "box64cpu.h"
+#include "emu/x64emu_private.h"
+#include "ppc64le_emitter.h"
+#include "ppc64le_mapping.h"
+#include "x64emu.h"
+#include "box64stack.h"
+#include "callback.h"
+#include "emu/x64run_private.h"
+#include "x64trace.h"
+#include "dynarec_native.h"
+#include "../dynablock_private.h"
+#include "custommem.h"
+
+#include "ppc64le_printer.h"
+#include "dynarec_ppc64le_private.h"
+#include "dynarec_ppc64le_functions.h"
+#include "../dynarec_helper.h"
+
+#define SCRATCH 31
+
+/* setup x2 to address pointed by ED, also fixaddress is an optional delta in the range [-absmax, +absmax], with delta&mask==0 to be added to ed for LD/STD
+ *
+ * ALIGNMENT CONTRACT (PPC64LE-specific):
+ *
+ * The i12 parameter controls displacement alignment validation:
+ * - i12 = 0: No inline displacement; always materialize address in register.
+ * - i12 = 1: Allow inline displacement if within range and 4-byte aligned (D-form: LD/STD).
+ * - i12 = DQ_ALIGN|1: Allow inline displacement if within range and 16-byte aligned (DQ-form: LXV/STXV).
+ *
+ * When DQ_ALIGN is set, align_mask becomes 15 (vs 3 for D-form). Any displacement
+ * that fails the alignment check is materialized in a register, and fixedaddress
+ * is set to 0. This prevents silent displacement truncation in DQ-form instructions
+ * — see ppc64le_emitter.h for the full explanation of the truncation hazard.
+ *
+ * IMPORTANT: If the caller will use *fixaddress with LXV/STXV, DQ_ALIGN MUST be
+ * set in i12. Forgetting this causes silent wrong-address loads (not crashes),
+ * which are extremely difficult to diagnose.
+ */
+uintptr_t geted(dynarec_ppc64le_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, uint8_t scratch, int64_t* fixaddress, rex_t rex, int* l, int i12, int delta)
+{
+ MAYUSE(dyn);
+ MAYUSE(ninst);
+ MAYUSE(delta);
+
+ if (l == LOCK_LOCK) {
+ dyn->insts[ninst].lock = 1;
+ }
+
+ int lock = l ? ((l == LOCK_LOCK) ?
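The alignment contract above is easiest to see from the call site. A hedged sketch of the two caller patterns (x2/x3 as hint/scratch; the LXV/LD comments name the POWER mnemonics the displacement must suit, not emitter macros this patch defines at this point):

int64_t fixedaddress;
uint8_t ed;
/* 16-byte vector access: pass DQ_ALIGN so any kept displacement is DQ-form safe */
addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, DQ_ALIGN | 1, 0);
/* LXV vsx, fixedaddress(ed)  -- displacement here is 0 or 16-byte aligned */

/* plain 64-bit integer load: D-form, 4-byte alignment is enough */
addr = geted(dyn, addr, ninst, nextop, &ed, x2, x3, &fixedaddress, rex, NULL, 1, 0);
/* LD x4, fixedaddress(ed) */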
1 : 2) : 0; + if (lock == 2) *l = 0; + + if (rex.is32bits && rex.is67) + return geted16(dyn, addr, ninst, nextop, ed, hint, scratch, fixaddress, rex, i12); + + uint8_t ret = x2; + *fixaddress = 0; + if (hint > 0) ret = hint; + // DQ_ALIGN flag: require 16-byte alignment for fixedaddress (DQ-form LXV/STXV) + // Without it, require 4-byte alignment (D-form LD/STD) + int align_mask = (i12 & DQ_ALIGN) ? 15 : 3; + i12 &= ~DQ_ALIGN; // strip the flag, keep the i12 value (0, 1, or >1) + int maxval = 32767; // PPC64LE ADDI has 16-bit signed immediate + if (i12 > 1) + maxval -= i12; + int seg_done = 0; + MAYUSE(scratch); + if (!(nextop & 0xC0)) { + if ((nextop & 7) == 4) { + uint8_t sib = F8; + int sib_reg = ((sib >> 3) & 7) + (rex.x << 3); + int sib_reg2 = (sib & 0x7) + (rex.b << 3); + if ((sib & 0x7) == 5) { + int64_t tmp = F32S; + if (sib_reg != 4) { + if (tmp && ((tmp < -32768) || (tmp > maxval) || !i12 || (i12 && (tmp & align_mask)))) { + MOV64y(scratch, tmp); + ALSLy(ret, TO_NAT(sib_reg), scratch, sib >> 6); + SCRATCH_USAGE(1); + } else { + if (sib >> 6) { + SLLIy(ret, TO_NAT(sib_reg), (sib >> 6)); + if (!IS_GPR(ret)) SCRATCH_USAGE(1); + } else { + ret = TO_NAT(sib_reg); + } + *fixaddress = tmp; + } + } else { + if (rex.seg && !(tmp && ((tmp < -32768) || (tmp > maxval) || !i12 || (i12 && (tmp & align_mask))))) { + grab_segdata(dyn, addr, ninst, ret, rex.seg); + seg_done = 1; + *fixaddress = tmp; + } else if (rex.seg && tmp >= -32768 && tmp < 32768) { + grab_segdata(dyn, addr, ninst, ret, rex.seg); + if (tmp) ADDI(ret, ret, tmp); + seg_done = 1; + } else + MOV64y(ret, tmp); + + switch (lock) { + case 1: addLockAddress(tmp); break; + case 2: + if (isLockAddress(tmp)) *l = 1; + break; + } + if (!IS_GPR(ret)) SCRATCH_USAGE(1); + } + } else { + if (sib_reg != 4) { + ALSLy(ret, TO_NAT(sib_reg), TO_NAT(sib_reg2), sib >> 6); + if (!IS_GPR(ret)) SCRATCH_USAGE(1); + } else { + ret = TO_NAT(sib_reg2); + } + } + } else if ((nextop & 7) == 5) { + if (rex.is32bits) { + int tmp = F32S; + if (rex.seg && tmp >= -32768 && tmp < 32768) { + grab_segdata(dyn, addr, ninst, ret, rex.seg); + if (tmp) ADDI(ret, ret, tmp); + seg_done = 1; + } else + MOV32w(ret, tmp); + if (!IS_GPR(ret)) SCRATCH_USAGE(1); + if (!rex.seg) + switch (lock) { + case 1: + addLockAddress(tmp); + if (fixaddress) *fixaddress = tmp; + break; + case 2: + if (isLockAddress(tmp)) *l = 1; + break; + } + } else { + int64_t tmp = F32S64; + int64_t adj = dyn->last_ip ? 
((addr + delta) - dyn->last_ip) : 0; + if (i12 && adj && (tmp + adj >= -32768) && (tmp + adj <= maxval) && !((tmp + adj) & align_mask)) { + ret = xRIP; + *fixaddress = tmp + adj; + } else if (i12 && (tmp >= -32768) && (tmp <= maxval) && !(tmp & align_mask)) { + GETIP(addr + delta, scratch); + ret = xRIP; + *fixaddress = tmp; + SCRATCH_USAGE(1); + } else if (adj && (tmp + adj >= -32768) && (tmp + adj <= maxval)) { + ADDI(ret, xRIP, tmp + adj); + } else if ((tmp >= -32768) && (tmp <= maxval)) { + GETIP(addr + delta, scratch); + ADDIy(ret, xRIP, tmp); + SCRATCH_USAGE(1); + } else if (tmp + addr + delta < 0x80000000LL && !dyn->need_reloc) { + MOV64y(ret, tmp + addr + delta); + } else { + if (adj) { + MOV64y(ret, tmp + adj); + } else { + MOV64y(ret, tmp); + GETIP(addr + delta, scratch); + SCRATCH_USAGE(1); + } + ADDy(ret, ret, xRIP); + } + if (!rex.seg) + switch (lock) { + case 1: + addLockAddress(addr + delta + tmp); + if (fixaddress) *fixaddress = addr + delta + tmp; + break; + case 2: + if (isLockAddress(addr + delta + tmp)) *l = 1; + break; + } + if (!IS_GPR(ret)) SCRATCH_USAGE(1); + } + } else { + ret = TO_NAT((nextop & 7) + (rex.b << 3)); + } + } else { + int64_t i64; + uint8_t sib = 0; + int sib_reg = 0; + if ((nextop & 7) == 4) { + sib = F8; + sib_reg = ((sib >> 3) & 7) + (rex.x << 3); + } + int sib_reg2 = (sib & 0x07) + (rex.b << 3); + if (nextop & 0x80) + i64 = F32S; + else + i64 = F8S; + if (i64 == 0 || ((i64 >= -32768) && (i64 <= maxval) && i12 && !(i64 & align_mask))) { + *fixaddress = i64; + if ((nextop & 7) == 4) { + if (sib_reg != 4) { + ALSLy(ret, TO_NAT(sib_reg), TO_NAT(sib_reg2), sib >> 6); + if (!IS_GPR(ret)) SCRATCH_USAGE(1); + } else { + ret = TO_NAT(sib_reg2); + } + } else { + ret = TO_NAT((nextop & 0x07) + (rex.b << 3)); + } + } else { + if (i64 >= -32768 && i64 <= 32767) { + if ((nextop & 7) == 4) { + if (sib_reg != 4) { + ALSLy(scratch, TO_NAT(sib_reg), TO_NAT(sib_reg2), sib >> 6); + SCRATCH_USAGE(1); + } else { + scratch = TO_NAT(sib_reg2); + } + } else { + scratch = TO_NAT((nextop & 0x07) + (rex.b << 3)); + } + ADDIy(ret, scratch, i64); + if (!IS_GPR(ret)) SCRATCH_USAGE(1); + } else { + MOV64y(scratch, i64); + SCRATCH_USAGE(1); + if ((nextop & 7) == 4) { + if (sib_reg != 4) { + ADDy(scratch, scratch, TO_NAT(sib_reg2)); + ALSLy(ret, TO_NAT(sib_reg), scratch, sib >> 6); + } else { + PASS3(int tmp = TO_NAT(sib_reg2)); + ADDy(ret, tmp, scratch); + } + } else { + PASS3(int tmp = TO_NAT((nextop & 0x07) + (rex.b << 3))); + ADDy(ret, tmp, scratch); + } + } + } + } + if (rex.is67 && IS_GPR(ret) && !rex.seg) { + ZEROUP2(hint, ret); // truncate for is67 case only (is32bits case regs are already 32bits only) + ret = hint; + } + if (rex.seg && !seg_done) { + if (scratch == ret) + scratch = ret + 1; + SCRATCH_USAGE(1); + grab_segdata(dyn, addr, ninst, scratch, rex.seg); + // seg offset is 64bits, so no truncation here + ADDxREGy(hint, scratch, ret, hint); + ret = hint; + } + *ed = ret; + return addr; +} + +uintptr_t geted16(dynarec_ppc64le_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, uint8_t scratch, int64_t* fixaddress, rex_t rex, int i12) +{ + MAYUSE(dyn); + MAYUSE(ninst); + + uint8_t ret = x2; + *fixaddress = 0; + if (hint > 0) ret = hint; + int align_mask16 = (i12 & DQ_ALIGN) ? 
15 : 3; + i12 &= ~DQ_ALIGN; + MAYUSE(scratch); + uint32_t m = nextop & 0xC7; + uint32_t n = (m >> 6) & 3; + int64_t offset = 0; + if (!n && (m & 7) == 6) { + offset = F16S; + MOV32w(ret, offset); + } else { + switch (n) { + case 0: offset = 0; break; + case 1: offset = F8S; break; + case 2: offset = F16S; break; + } + if (i12 && offset && offset >= -32768 && offset <= 32767 && !(offset & align_mask16)) { + *fixaddress = offset; + offset = 0; + } + int reg; + switch (m & 7) { + case 0: // R_BX + R_SI + ADD(ret, xRBX, xRSI); + reg = ret; + if (!IS_GPR(ret)) SCRATCH_USAGE(1); + break; + case 1: // R_BX + R_DI + ADD(ret, xRBX, xRDI); + reg = ret; + if (!IS_GPR(ret)) SCRATCH_USAGE(1); + break; + case 2: // R_BP + R_SI + ADD(ret, xRBP, xRSI); + reg = ret; + if (!IS_GPR(ret)) SCRATCH_USAGE(1); + break; + case 3: // R_BP + R_DI + ADD(ret, xRBP, xRDI); + reg = ret; + if (!IS_GPR(ret)) SCRATCH_USAGE(1); + break; + case 4: // R_SI + reg = xRSI; + break; + case 5: // R_DI + reg = xRDI; + break; + case 6: // R_BP + reg = xRBP; + break; + case 7: // R_BX + reg = xRBX; + break; + } + BSTRPICK_D(ret, reg, 15, 0); + if (!IS_GPR(ret)) SCRATCH_USAGE(1); + if (offset) { + if (offset >= -32768 && offset < 32768) { + ADDI(ret, ret, offset); + } else { + MOV64x(scratch, offset); + ADD(ret, ret, scratch); + SCRATCH_USAGE(1); + } + BSTRPICK_D(ret, ret, 15, 0); + } + } + + if (rex.seg) { + if (scratch == ret) + scratch = ret + 1; + SCRATCH_USAGE(1); + grab_segdata(dyn, addr, ninst, scratch, rex.seg); + // seg offset is 64bits, so no truncation here + if (IS_GPR(ret)) { + ADD(hint, ret, scratch); + ret = hint; + } else + ADD(ret, ret, scratch); + } + *ed = ret; + return addr; +} + +// Load a 32-bit value into a register +// Optimizes for small values that fit in fewer instructions +// If zeroup is set, clears upper 32 bits when val is negative +void ppc64le_move32(dynarec_ppc64le_t* dyn, int ninst, int reg, int32_t val, int zeroup) +{ + uint32_t uval = (uint32_t)val; + uint16_t lo = uval & 0xFFFF; + uint16_t hi = (uval >> 16) & 0xFFFF; + + if (val == (int32_t)(int16_t)val) { + // Value fits in signed 16-bit: single LI + LI(reg, val & 0xFFFF); + } else if (lo == 0) { + // Lower 16 bits are zero: single LIS + LIS(reg, hi); + } else { + // General case: LIS + ORI + LIS(reg, hi); + ORI(reg, reg, lo); + } + if (zeroup && val < 0) ZEROUP(reg); +} + +// Load a 64-bit value into a register +// Optimizes for values that fit in fewer instructions +void ppc64le_move64(dynarec_ppc64le_t* dyn, int ninst, int reg, int64_t val) +{ + // If value fits in signed 32 bits, use move32 (LI sign-extends to 64-bit) + if (val == (int64_t)(int32_t)val) { + ppc64le_move32(dyn, ninst, reg, (int32_t)val, 0); + return; + } + + // Full 64-bit constant: build top-down + // Load upper 32 bits, shift into place, OR in lower 32 bits + uint64_t uval = (uint64_t)val; + uint16_t w0 = (uval >> 0) & 0xFFFF; // bits [ 0:15] + uint16_t w1 = (uval >> 16) & 0xFFFF; // bits [16:31] + + // Load upper 32 bits (bits [32:63]) as a sign-extended 32-bit value + int32_t upper32 = (int32_t)(uval >> 32); + ppc64le_move32(dyn, ninst, reg, upper32, 0); + + // Shift left by 32 to place into upper half + SLDI(reg, reg, 32); + + // OR in bits [16:31] + if (w1) { + ORIS(reg, reg, w1); + } + + // OR in bits [0:15] + if (w0) { + ORI(reg, reg, w0); + } +} + +// ======================================================================== +// Jump / Control Flow — Batch H3 +// ======================================================================== + +void 
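A quick way to check the sequence selection in ppc64le_move32/ppc64le_move64 is to count the instructions each constant needs. A runnable model (illustration only, mirroring the LI / LIS / LIS+ORI / SLDI+ORIS+ORI ladder above):

```c
#include <assert.h>
#include <stdint.h>

static int insns32(int32_t v)
{
    if (v == (int32_t)(int16_t)v) return 1;    // single LI (sign-extends)
    if (((uint32_t)v & 0xFFFF) == 0) return 1; // single LIS
    return 2;                                  // LIS + ORI
}

static int insns64(int64_t v)
{
    if (v == (int64_t)(int32_t)v) return insns32((int32_t)v); // 32-bit path
    int n = insns32((int32_t)((uint64_t)v >> 32)) + 1;        // upper half, then SLDI 32
    if (((uint64_t)v >> 16) & 0xFFFF) n++;                    // ORIS for bits 16..31
    if ((uint64_t)v & 0xFFFF) n++;                            // ORI for bits 0..15
    return n;
}

int main(void)
{
    assert(insns32(-1) == 1);                              // LI -1: sign-extension does the rest
    assert(insns32(0x7FFF0000) == 1);                      // single LIS
    assert(insns32(0x12345678) == 2);                      // LIS + ORI
    assert(insns64((int64_t)0xFFFFFFFF00000001ull) == 3);  // LI -1, SLDI 32, ORI 1
    assert(insns64(0x123456789ABCDEF0LL) == 5);            // LIS, ORI, SLDI, ORIS, ORI
    return 0;
}
```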
jump_to_epilog(dynarec_ppc64le_t* dyn, uintptr_t ip, int reg, int ninst) +{ + MAYUSE(dyn); + MAYUSE(ip); + MAYUSE(ninst); + MESSAGE(LOG_DUMP, "Jump to epilog\n"); + + if (reg) { + if (reg != xRIP) { + MV(xRIP, reg); + } + } else { + GETIP_(ip, x2); + } + TABLE64C(x2, const_epilog); + SMEND(); + CHECK_DFNONE(0); + MTCTR(x2); + BCTR(); +} + +void jump_to_epilog_fast(dynarec_ppc64le_t* dyn, uintptr_t ip, int reg, int ninst) +{ + MAYUSE(dyn); + MAYUSE(ip); + MAYUSE(ninst); + MESSAGE(LOG_DUMP, "Jump to epilog\n"); + + if (reg) { + if (reg != xRIP) { + MV(xRIP, reg); + } + } else { + GETIP_(ip, x2); + } + // PPC64LE has no const_epilog_fast, use const_epilog as fallback + TABLE64C(x2, const_epilog); + SMEND(); + CHECK_DFNONE(0); + MTCTR(x2); + BCTR(); +} + +// indirect_lookup: resolve x86-64 address through the jump table hierarchy +// Returns the register containing the resolved native address +static int indirect_lookup(dynarec_ppc64le_t* dyn, int ninst, int is32bits, int s1, int s2) +{ + MAYUSE(dyn); + if (!is32bits) { + // Check if address is above 48-bit range + SRDI(s1, xRIP, 48); + BNEZ_safe(s1, (intptr_t)dyn->jmp_next - (intptr_t)dyn->block); + // Load level-2 table base + if (dyn->need_reloc) { + TABLE64C(s2, const_jmptbl48); + } else { + MOV64x(s2, getConst(const_jmptbl48)); + } + // Extract bits for level-2 index + BSTRPICK_D(s1, xRIP, JMPTABL_START2 + JMPTABL_SHIFT2 - 1, JMPTABL_START2); + SLDI(s1, s1, 3); + ADD(s2, s1, s2); + LD(s2, 0, s2); + } else { + TABLE64C(s2, const_jmptbl32); + } + // Level-1 lookup + BSTRPICK_D(s1, xRIP, JMPTABL_START1 + JMPTABL_SHIFT1 - 1, JMPTABL_START1); + SLDI(s1, s1, 3); + ADD(s2, s1, s2); + LD(s2, 0, s2); + // Level-0 lookup + BSTRPICK_D(s1, xRIP, JMPTABL_START0 + JMPTABL_SHIFT0 - 1, JMPTABL_START0); + SLDI(s1, s1, 3); + ADD(s2, s1, s2); + LD(s1, 0, s2); + return s1; +} + +void jump_to_next(dynarec_ppc64le_t* dyn, uintptr_t ip, int reg, int ninst, int is32bits) +{ + MAYUSE(dyn); + MAYUSE(ninst); + MESSAGE(LOG_DUMP, "Jump to next\n"); + + if (is32bits) + ip &= 0xffffffffLL; + + // Move target address into xRIP BEFORE CHECK_DFNONE, because + // FORCE_DFNONE uses LI(x1, 0) which clobbers x1 — and ed is often x1. + int dest; + if (reg) { + if (reg != xRIP) { + MV(xRIP, reg); + } + CHECK_DFNONE(0); + NOTEST(x2); + dest = indirect_lookup(dyn, ninst, is32bits, x2, x3); + } else { + CHECK_DFNONE(0); + NOTEST(x2); + uintptr_t p = getJumpTableAddress64(ip); + MAYUSE(p); + GETIP_(ip, x3); + if (dyn->need_reloc) AddRelocTable64JmpTbl(dyn, ninst, ip, STEP); + TABLE64_(x3, p); + LD(x2, 0, x3); + dest = x2; + } + if (reg != x1) { + MV(x1, xRIP); + } + CLEARIP(); + SMEND(); + // Branch to resolved target: use BCTRL for call-ret, BCTR otherwise +#ifdef HAVE_TRACE + MTCTR(dest); + BCTRL(); +#else + MTCTR(dest); + if (dyn->insts[ninst].x64.has_callret) { + BCTRL(); + } else { + BCTR(); + } +#endif +} + +void ret_to_next(dynarec_ppc64le_t* dyn, uintptr_t ip, int ninst, rex_t rex) +{ + MAYUSE(dyn); + MAYUSE(ninst); + MESSAGE(LOG_DUMP, "Ret to next\n"); + CHECK_DFNONE(0); + MVz(x1, xRIP); + SMEND(); + if (BOX64DRENV(dynarec_callret)) { + // pop the actual return address from PPC64LE stack + LD(x4, 0, xSP); // native addr + LD(x6, 8, xSP); // x86 addr + ADDI(xSP, xSP, 16); // pop + // is it the right address? 
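The three extract-index-load rounds in indirect_lookup() read more clearly as plain C. A runnable, demo-sized model (4-bit levels instead of box64's real JMPTABL_* split, and omitting the above-48-bit bail-out to jmp_next, purely to keep the tables tiny):

```c
#include <assert.h>
#include <stdint.h>

#define SHIFT2 4
#define SHIFT1 4
#define SHIFT0 4
#define START2 8
#define START1 4
#define START0 0
#define MASK(s) ((1u << (s)) - 1)

static uintptr_t   l0[1 << SHIFT0];
static uintptr_t*  l1[1 << SHIFT1];
static uintptr_t** root[1 << SHIFT2]; // stand-in for the table behind const_jmptbl48

// Same three rounds the emitted BSTRPICK_D / SLDI / ADD / LD perform.
static uintptr_t resolve(uintptr_t rip)
{
    uintptr_t** t1 = root[(rip >> START2) & MASK(SHIFT2)];
    uintptr_t*  t0 = t1[(rip >> START1) & MASK(SHIFT1)];
    return t0[(rip >> START0) & MASK(SHIFT0)];
}

int main(void)
{
    uintptr_t rip = 0x123;                      // indices 1 / 2 / 3
    root[1] = l1; l1[2] = l0; l0[3] = 0xC0DE;
    assert(resolve(rip) == 0xC0DE);             // loaded value is the native entry point
    return 0;
}
```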
+ CMPD(x6, xRIP); + BNE(3 * 4); + // yes — branch to cached native return address + MTCTR(x4); + BCTR(); + // not the correct return address, regular jump, but purge the stack first + // Load the saved frame SP from emu->xSPSave (r12 is no longer xSavedSP) + LD(xSP, offsetof(x64emu_t, xSPSave), xEmu); + } + NOTEST(x2); + int dest = indirect_lookup(dyn, ninst, rex.is32bits, x2, x3); + MTCTR(dest); + BCTR(); + CLEARIP(); +} + +void iret_to_next(dynarec_ppc64le_t* dyn, uintptr_t ip, int ninst, int is32bits, int is64bits) +{ + MAYUSE(ninst); + MESSAGE(LOG_DUMP, "IRet to next\n"); + if (is64bits) { + POP1(xRIP); + POP1(x2); + POP1(xFlags); + } else { + POP1_32(xRIP); + POP1_32(x2); + POP1_32(xFlags); + } + + // Store CS segment + STH(x2, offsetof(x64emu_t, segs[_CS]), xEmu); + // clean EFLAGS + RESTORE_EFLAGS(x1); + MOV32w(x1, 0x3E7FD7); // also mask RF + AND(xFlags, xFlags, x1); + ORI(xFlags, xFlags, 0x2); + SPILL_EFLAGS(); + CHECK_DFNONE(0); + // POP RSP + if (is64bits) { + POP1(x3); // rsp + POP1(x2); // ss + } else { + POP1_32(x3); // rsp + POP1_32(x2); // ss + } + // POP SS + STH(x2, offsetof(x64emu_t, segs[_SS]), xEmu); + // set new RSP + MV(xRSP, x3); + // Ret.... + rex_t dummy = { 0 }; + dummy.is32bits = is32bits; + dummy.w = is64bits; + ret_to_next(dyn, ip, ninst, dummy); + CLEARIP(); +} + +// ======================================================================== +// C/Native Calls & Segments — Batch H4 +// ======================================================================== + +void call_c(dynarec_ppc64le_t* dyn, int ninst, ppc64le_consts_t fnc, int reg, int ret, int saveflags, int savereg, int arg1, int arg2, int arg3, int arg4, int arg5, int arg6) +{ + MAYUSE(fnc); + // Bug 4 fix: Cannot use CHECK_DFNONE(1) here because it expands to + // FORCE_DFNONE() which uses x1 (=r3=A0) as scratch. If the caller + // pre-loaded an arg into x1 before calling call_c, FORCE_DFNONE + // would clobber it. Use `reg` (x6=r8) as scratch instead — it's the + // dedicated function pointer register that gets overwritten by + // TABLE64C(reg, fnc) later anyway. + if (dyn->f == status_none_pending) { + LI(reg, 0); + STW(reg, offsetof(x64emu_t, df), xEmu); + dyn->f = status_none; + } + if (savereg == 0) + savereg = x87pc; + if (saveflags) { + RESTORE_EFLAGS(reg); + STD(xFlags, offsetof(x64emu_t, eflags), xEmu); + } + fpu_pushcache(dyn, ninst, reg, 0); + if (ret != -2) { + ADDI(xSP, xSP, -16); // PPC64LE stack needs to be 16byte aligned + STD(savereg, 0, xSP); + STORE_REG(RDI); + STORE_REG(RSI); + STORE_REG(RDX); + STORE_REG(RCX); + STORE_REG(R8); + STORE_REG(R9); + STORE_REG(RAX); + STORE_REG(RBX); + STORE_REG(RSP); + STORE_REG(RBP); + STD(xRIP, offsetof(x64emu_t, ip), xEmu); + } + TABLE64C(reg, fnc); + // Save function pointer to CTR immediately, before argument shuffling. + // On PPC64LE, scratch registers x1-x6 alias ABI argument registers A0-A5 + // (x1=r3=A0, ..., x6=r8=A5). The function pointer is loaded into reg + // (=x6=r8=A5), so any MV(A5, arg5) during argument shuffling would + // clobber it. Saving to CTR first avoids this, since CTR is unaffected + // by the MV (register-to-register move) instructions. + MTCTR(reg); + // Argument shuffling: reverse order to handle the "shift up by one" + // permutation (x1→A1, x2→A2, ...) without clobbering sources. 
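A tiny simulation (illustrative only) of the clobbering hazard that the reverse order avoids, for the common case the comment describes, where each arg n was pre-loaded into scratch x_n:

```c
#include <assert.h>

int main(void)
{
    // r[3..9] model PPC64LE r3..r9. The scratch regs x1..x6 are r3..r8 and
    // the ELFv2 argument regs A0..A6 are r3..r9, so MV(A_n, x_n) is
    // r(n+3) <- r(n+2): everything shifts up by one register.
    int r[10] = { 0 };
    for (int n = 1; n <= 6; ++n) r[n + 2] = 100 + n; // arg n pre-loaded in x_n
    int xEmu = 42;
    for (int n = 6; n >= 1; --n) r[n + 3] = r[n + 2]; // reverse order: A6<-x6 ... A1<-x1
    r[3] = xEmu;                                      // A0 <- xEmu, last
    assert(r[3] == 42);
    for (int n = 1; n <= 6; ++n) assert(r[n + 3] == 100 + n); // no source clobbered
    // Forward order would fail: r4 (= A1) is overwritten before being read as x2.
    return 0;
}
```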
+ if (arg6) MV(A6, arg6); + if (arg5) MV(A5, arg5); + if (arg4) MV(A4, arg4); + if (arg3) MV(A3, arg3); + if (arg2) MV(A2, arg2); + if (arg1) MV(A1, arg1); + MV(A0, xEmu); + // ELFv2 ABI: r12 must be set to the function entry address + // for global entry point TOC setup. Recover from CTR. + MFCTR(12); + BCTRL(); + if (ret >= 0) { + MV(ret, A0); + } + if (ret != -2) { + LD(savereg, 0, xSP); + ADDI(xSP, xSP, 16); +#define GO(A) \ + if (ret != x##A) { LOAD_REG(A); } + GO(RDI); + GO(RSI); + GO(RDX); + GO(RCX); + GO(R8); + GO(R9); + GO(RAX); + GO(RBX); + GO(RSP); + GO(RBP); + if (ret != xRIP) + LD(xRIP, offsetof(x64emu_t, ip), xEmu); +#undef GO + } + + fpu_popcache(dyn, ninst, reg, 0); + if (saveflags) { + LD(xFlags, offsetof(x64emu_t, eflags), xEmu); + SPILL_EFLAGS(); + } + if (savereg != x87pc && dyn->need_x87check) + NATIVE_RESTORE_X87PC(); + // SET_NODF(); + dyn->last_ip = 0; +} + +void call_n(dynarec_ppc64le_t* dyn, int ninst, void* fnc, int w) +{ + MAYUSE(fnc); + CHECK_DFNONE(1); + fpu_pushcache(dyn, ninst, x3, 1); + // Save x86 regs that the native function might modify via re-entrant emulation. + // On PPC64LE, x86 regs are in callee-saved r14-r29, so the native function preserves + // them. We save SP/BP/BX to the emu struct for consistency with the interpreter. + STD(xRSP, offsetof(x64emu_t, regs[_SP]), xEmu); + STD(xRBP, offsetof(x64emu_t, regs[_BP]), xEmu); + STD(xRBX, offsetof(x64emu_t, regs[_BX]), xEmu); + // prepare regs for native call: copy x86 argument registers to PPC64LE ABI registers. + // On PPC64LE, x86 regs are in callee-saved r14-r29, while ABI args are in r3-r8. + // No overlap, so order doesn't matter. + MV(A0, xRDI); + MV(A1, xRSI); + MV(A2, xRDX); + MV(A3, xRCX); + MV(A4, xR8); + MV(A5, xR9); + // check if additional sextw needed (applied after copy to ABI regs) + int sextw_mask = ((w > 0 ? w : -w) >> 4) & 0b111111; + for (int i = 0; i < 6; i++) { + if (sextw_mask & (1 << i)) { + SEXT_W(A0 + i, A0 + i); + } + } + // native call — load function pointer into x7 (r10) to avoid clobbering argument regs + if (dyn->need_reloc) { + // fnc is indirect, to help with relocation (but PltResolver might be an issue here) + TABLE64(x7, (uintptr_t)fnc); + LD(x7, 0, x7); + } else { + TABLE64_(x7, *(uintptr_t*)fnc); + } + // ELFv2 ABI: r12 must be set to the function entry address + MR(12, x7); + MTCTR(x7); + BCTRL(); + // put return value in x64 regs + if (w > 0) { + MV(xRAX, A0); + MV(xRDX, A1); + } + // all done, restore all regs + LD(xRSP, offsetof(x64emu_t, regs[_SP]), xEmu); + LD(xRBP, offsetof(x64emu_t, regs[_BP]), xEmu); + LD(xRBX, offsetof(x64emu_t, regs[_BX]), xEmu); + fpu_popcache(dyn, ninst, x3, 1); + NATIVE_RESTORE_X87PC(); + // SET_NODF(); +} + +void grab_segdata(dynarec_ppc64le_t* dyn, uintptr_t addr, int ninst, int reg, int segment) +{ + (void)addr; + int64_t j64; + MAYUSE(j64); + MESSAGE(LOG_DUMP, "Get %s Offset\n", (segment == _FS) ? "FS" : "GS"); + LD(reg, offsetof(x64emu_t, segs_offs[segment]), xEmu); + MESSAGE(LOG_DUMP, "----%s Offset\n", (segment == _FS) ? 
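How `w` packs the sign-extension requests is easiest to see with a couple of worked values; this is a reading of the code above, not a documented ABI:

```c
#include <assert.h>

// Bits 4..9 of |w| flag which of the six copied args need SEXT_W;
// w > 0 additionally means the result is copied back to xRAX/xRDX.
static unsigned sextw_mask(int w) { return (((w > 0) ? w : -w) >> 4) & 0x3F; }

int main(void)
{
    // bit i of the mask requests SEXT_W on ABI arg register A_i,
    // i.e. the (i+1)-th x86 argument after the MV copies.
    assert(sextw_mask(1 | (1 << 4) | (1 << 6)) == 0x05); // args 1 and 3 sign-extended
    assert(sextw_mask(-(1 << 5)) == 0x02);               // arg 2; w < 0: no RAX/RDX write-back
    return 0;
}
```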
"FS" : "GS"); +} + +// ======================================================================== +// x87 Stack & Cache Management — Batch H5 +// ======================================================================== + +int x87_stackcount(dynarec_ppc64le_t* dyn, int ninst, int scratch) +{ + MAYUSE(scratch); + if (!dyn->v.x87stack) + return 0; + if (dyn->v.mmxcount) + mmx_purgecache(dyn, ninst, 0, scratch); + MESSAGE(LOG_DUMP, "\tSynch x87 Stackcount (%d)\n", dyn->v.x87stack); + int a = dyn->v.x87stack; + // Add x87stack to emu fpu_stack + LWZ(scratch, offsetof(x64emu_t, fpu_stack), xEmu); + ADDI(scratch, scratch, a); + STW(scratch, offsetof(x64emu_t, fpu_stack), xEmu); + // Sub x87stack to top, with and 7 + LWZ(scratch, offsetof(x64emu_t, top), xEmu); + ADDI(scratch, scratch, -a); + ANDId(scratch, scratch, 7); + STW(scratch, offsetof(x64emu_t, top), xEmu); + // reset x87stack, but not the stack count of extcache + dyn->v.x87stack = 0; + dyn->v.stack_next -= dyn->v.stack; + int ret = dyn->v.stack; + dyn->v.stack = 0; + MESSAGE(LOG_DUMP, "\t------x87 Stackcount\n"); + return ret; +} + +void x87_unstackcount(dynarec_ppc64le_t* dyn, int ninst, int scratch, int count) +{ + MAYUSE(scratch); + if (!count) + return; + if (dyn->v.mmxcount) + mmx_purgecache(dyn, ninst, 0, scratch); + MESSAGE(LOG_DUMP, "\tSynch x87 Unstackcount (%d)\n", count); + int a = -count; + // Add x87stack to emu fpu_stack + LWZ(scratch, offsetof(x64emu_t, fpu_stack), xEmu); + ADDI(scratch, scratch, a); + STW(scratch, offsetof(x64emu_t, fpu_stack), xEmu); + // Sub x87stack to top, with and 7 + LWZ(scratch, offsetof(x64emu_t, top), xEmu); + ADDI(scratch, scratch, -a); + ANDId(scratch, scratch, 7); + STW(scratch, offsetof(x64emu_t, top), xEmu); + // reset x87stack, but not the stack count of extcache + dyn->v.x87stack = count; + dyn->v.stack = count; + dyn->v.stack_next += dyn->v.stack; + MESSAGE(LOG_DUMP, "\t------x87 Unstackcount\n"); +} + +void x87_forget(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int st) +{ + int ret = -1; + for (int i = 0; (i < 8) && (ret == -1); ++i) + if (dyn->v.x87cache[i] == st) + ret = i; + if (ret == -1) // nothing to do + return; + MESSAGE(LOG_DUMP, "\tForget x87 Cache for ST%d\n", st); + const int reg = dyn->v.x87reg[ret]; +#if STEP == 1 + if (dyn->v.vmxcache[dyn->v.x87reg[ret]].t == VMX_CACHE_ST_F + || dyn->v.vmxcache[dyn->v.x87reg[ret]].t == VMX_CACHE_ST_I64) + vmxcache_promote_double(dyn, ninst, st); +#endif + // prepare offset to fpu => s1 + // Get top + LWZ(s2, offsetof(x64emu_t, top), xEmu); + // Update + int a = st - dyn->v.x87stack; + if (a) { + ADDI(s2, s2, a); + ANDId(s2, s2, 7); // (emu->top + i)&7 + } + SLDI(s2, s2, 3); + ADD(s1, xEmu, s2); + if (dyn->v.vmxcache[reg].t == VMX_CACHE_ST_F) { + FMR(SCRATCH0, reg); + STFD(SCRATCH0, offsetof(x64emu_t, x87), s1); + } else if (dyn->v.vmxcache[reg].t == VMX_CACHE_ST_I64) { + FCFID(SCRATCH0, reg); + STFD(SCRATCH0, offsetof(x64emu_t, x87), s1); + } else { + STFD(reg, offsetof(x64emu_t, x87), s1); + } + MESSAGE(LOG_DUMP, "\t--------x87 Cache for ST%d\n", st); + // and forget that cache + fpu_free_reg(dyn, dyn->v.x87reg[ret]); + dyn->v.vmxcache[reg].v = 0; + dyn->v.x87cache[ret] = -1; + dyn->v.x87reg[ret] = -1; +} + +void x87_reget_st(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int st) +{ + dyn->insts[ninst].x87_used = 1; + if (dyn->v.mmxcount) + mmx_purgecache(dyn, ninst, 0, s1); + // search in cache first + for (int i = 0; i < 8; ++i) + if (dyn->v.x87cache[i] == st) { + // refresh the value + MESSAGE(LOG_DUMP, "\tRefresh x87 Cache 
for ST%d\n", st); +#if STEP == 1 + if (dyn->v.vmxcache[dyn->v.x87reg[i]].t == VMX_CACHE_ST_F + || dyn->v.vmxcache[dyn->v.x87reg[i]].t == VMX_CACHE_ST_I64) + vmxcache_promote_double(dyn, ninst, st); +#endif + LWZ(s2, offsetof(x64emu_t, top), xEmu); + int a = st - dyn->v.x87stack; + if (a) { + ADDI(s2, s2, a); + ANDId(s2, s2, 7); + } + SLDI(s2, s2, 3); + ADD(s1, xEmu, s2); + LFD(dyn->v.x87reg[i], offsetof(x64emu_t, x87), s1); + MESSAGE(LOG_DUMP, "\t-------x87 Cache for ST%d\n", st); + // ok + return; + } + // Was not in the cache? creating it.... + MESSAGE(LOG_DUMP, "\tCreate x87 Cache for ST%d\n", st); + // get a free spot + int ret = -1; + for (int i = 0; (i < 8) && (ret == -1); ++i) + if (dyn->v.x87cache[i] == -1) + ret = i; + // found, setup and grab the value + dyn->v.x87cache[ret] = st; + dyn->v.x87reg[ret] = fpu_get_reg_x87(dyn, VMX_CACHE_ST_D, st); + LWZ(s2, offsetof(x64emu_t, top), xEmu); + int a = st - dyn->v.x87stack; + ADDI(s2, s2, a); + ANDId(s2, s2, 7); // (emu->top + i)&7 + SLDI(s2, s2, 3); + ADD(s1, xEmu, s2); + LFD(dyn->v.x87reg[ret], offsetof(x64emu_t, x87), s1); + MESSAGE(LOG_DUMP, "\t-------x87 Cache for ST%d\n", st); +} + +void x87_free(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int st) +{ + dyn->insts[ninst].x87_used = 1; + int ret = -1; + for (int i = 0; (i < 8) && (ret == -1); ++i) + if (dyn->v.x87cache[i] == st) + ret = i; + MESSAGE(LOG_DUMP, "\tFFREE%s x87 Cache for ST%d\n", (ret != -1) ? " (and Forget)" : "", st); + if (ret != -1) { + const int reg = dyn->v.x87reg[ret]; +#if STEP == 1 + if (dyn->v.vmxcache[reg].t == VMX_CACHE_ST_F || dyn->v.vmxcache[reg].t == VMX_CACHE_ST_I64) + vmxcache_promote_double(dyn, ninst, st); +#endif + // Get top + LWZ(s2, offsetof(x64emu_t, top), xEmu); + // Update + int ast = st - dyn->v.x87stack; + if (ast) { + ADDI(s2, s2, ast); + ANDId(s2, s2, 7); // (emu->top + i)&7 + } + SLDI(s2, s2, 3); + ADD(s1, xEmu, s2); + if (dyn->v.vmxcache[reg].t == VMX_CACHE_ST_F) { + FMR(SCRATCH0, reg); + STFD(SCRATCH0, offsetof(x64emu_t, x87), s1); + } else if (dyn->v.vmxcache[reg].t == VMX_CACHE_ST_I64) { + FCFID(SCRATCH0, reg); + STFD(SCRATCH0, offsetof(x64emu_t, x87), s1); + } else { + STFD(reg, offsetof(x64emu_t, x87), s1); + } + // and forget that cache + fpu_free_reg(dyn, reg); + dyn->v.vmxcache[reg].v = 0; + dyn->v.x87cache[ret] = -1; + dyn->v.x87reg[ret] = -1; + } else { + // Get top + LWZ(s2, offsetof(x64emu_t, top), xEmu); + // Update + int ast = st - dyn->v.x87stack; + if (ast) { + ADDI(s2, s2, ast); + ANDId(s2, s2, 7); // (emu->top + i)&7 + } + } + // add mark in the freed array + dyn->v.tags |= 0b11 << (st * 2); + MESSAGE(LOG_DUMP, "\t--------x87 FFREE for ST%d\n", st); +} + +void x87_swapreg(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int a, int b) +{ + int i1, i2, i3; + i1 = x87_get_cache(dyn, ninst, 1, s1, s2, b, X87_ST(b)); + i2 = x87_get_cache(dyn, ninst, 1, s1, s2, a, X87_ST(a)); + i3 = dyn->v.x87cache[i1]; + dyn->v.x87cache[i1] = dyn->v.x87cache[i2]; + dyn->v.x87cache[i2] = i3; + // swap those too + int j1, j2, j3; + j1 = x87_get_vmxcache(dyn, ninst, s1, s2, b); + j2 = x87_get_vmxcache(dyn, ninst, s1, s2, a); + j3 = dyn->v.vmxcache[j1].n; + dyn->v.vmxcache[j1].n = dyn->v.vmxcache[j2].n; + dyn->v.vmxcache[j2].n = j3; + // mark as swapped + dyn->v.swapped = 1; + dyn->v.combined1 = a; + dyn->v.combined2 = b; +} + +// ======================================================================== +// Rounding Mode & Cache Coherency — Batch H6 +// ======================================================================== + +// 
Set rounding according to cw flags, return reg to restore flags +// PPC64LE FPSCR RN field is bits 62:63 in PPC big-endian notation (the 2 LSBs) +// x87 cw bits 10:11 encode rounding: 0=Nearest, 1=Down, 2=Up, 3=Chop +// PPC64LE FPSCR RN: 0=Nearest, 1=TowardZero, 2=TowardsPositive, 3=TowardsNegative +// Mapping: 0->0, 1->3, 2->2, 3->1 => negate & mask with 3 +int x87_setround(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2) +{ + MAYUSE(dyn); + MAYUSE(ninst); + MAYUSE(s1); + MAYUSE(s2); + // Save current FPSCR to s2 (returned for restoreround) + MFFS(SCRATCH0); + STFD(SCRATCH0, -8, xSP); + LD(s2, -8, xSP); + // Read x87 control word rounding bits [11:10] + LWZ(s1, offsetof(x64emu_t, cw), xEmu); + BSTRPICK_D(s1, s1, 11, 10); + // Remap: 0->0, 1->3, 2->2, 3->1 + NEG(s1, s1); + ANDId(s1, s1, 3); + // Build new FPSCR: (old & ~3) | new_rn + // Save new_rn to red zone, use s1 for combining + STD(s1, -16, xSP); + RLDICR(s1, s2, 0, 61); // s1 = old FPSCR with bottom 2 bits cleared + LD(s2, -16, xSP); // s2 = new_rn (temporarily) + OR(s1, s1, s2); // s1 = new FPSCR + // Reload old FPSCR back into s2 (the return value) + LD(s2, -8, xSP); + // Set new FPSCR + STD(s1, -8, xSP); + LFD(SCRATCH0, -8, xSP); + MTFSF(0xFF, SCRATCH0); + return s2; +} + +// Set rounding according to mxcsr flags, return reg to restore flags +int sse_setround(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2) +{ + MAYUSE(dyn); + MAYUSE(ninst); + MAYUSE(s1); + MAYUSE(s2); + // Save current FPSCR to s2 (returned for restoreround) + MFFS(SCRATCH0); + STFD(SCRATCH0, -8, xSP); + LD(s2, -8, xSP); + // Read mxcsr rounding bits [14:13] + LWZ(s1, offsetof(x64emu_t, mxcsr), xEmu); + BSTRPICK_D(s1, s1, 14, 13); + // Remap: 0->0, 1->3, 2->2, 3->1 + NEG(s1, s1); + ANDId(s1, s1, 3); + // Build new FPSCR: (old & ~3) | new_rn + STD(s1, -16, xSP); + RLDICR(s1, s2, 0, 61); // s1 = old FPSCR with bottom 2 bits cleared + LD(s2, -16, xSP); // s2 = new_rn (temporarily) + OR(s1, s1, s2); // s1 = new FPSCR + // Reload old FPSCR back into s2 (the return value) + LD(s2, -8, xSP); + // Set new FPSCR + STD(s1, -8, xSP); + LFD(SCRATCH0, -8, xSP); + MTFSF(0xFF, SCRATCH0); + return s2; +} + +int vmxcache_st_coherency(dynarec_ppc64le_t* dyn, int ninst, int a, int b) +{ + int i1 = vmxcache_get_st(dyn, ninst, a); + int i2 = vmxcache_get_st(dyn, ninst, b); + if (i1 != i2) { + MESSAGE(LOG_DUMP, "Warning, ST cache incoherent between ST%d(%d) and ST%d(%d)\n", a, i1, b, i2); + } + + return i1; +} + +// Restore round flag +void x87_restoreround(dynarec_ppc64le_t* dyn, int ninst, int s1) +{ + MAYUSE(dyn); + MAYUSE(ninst); + MAYUSE(s1); + // s1 contains the old FPSCR value saved by x87_setround/sse_setround + STD(s1, -8, xSP); + LFD(SCRATCH0, -8, xSP); + MTFSF(0xFF, SCRATCH0); +} + +// ======================================================================== +// x87 Push/Pop — Batch H7 +// ======================================================================== + +// On step 1, Float/Double for ST is actually computed and back-propagated +// On step 2-3, the value is just read for inst[...].v.vmxcache[..] 
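The remap is small enough to verify exhaustively; a runnable check of the negate-and-mask trick (illustration only):

```c
#include <assert.h>

int main(void)
{
    // x87 RC:   0=nearest  1=down(-inf)  2=up(+inf)  3=chop(zero)
    // FPSCR.RN: 0=nearest  1=zero        2=+inf      3=-inf
    int want[4] = { 0, 3, 2, 1 };
    for (int rc = 0; rc < 4; ++rc)
        assert((-rc & 3) == want[rc]); // the NEG + ANDId pair in x87_setround/sse_setround
    return 0;
}
```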
+// the reg returned is *2 for FLOAT +int x87_do_push(dynarec_ppc64le_t* dyn, int ninst, int s1, int t) +{ + dyn->insts[ninst].x87_used = 1; + if (dyn->v.mmxcount) + mmx_purgecache(dyn, ninst, 0, s1); + dyn->v.x87stack += 1; + dyn->v.stack += 1; + dyn->v.stack_next += 1; + dyn->v.stack_push += 1; + ++dyn->v.pushed; + if (dyn->v.poped) + --dyn->v.poped; + // move all regs in cache, and find a free one + for (int j = 0; j < 32; ++j) + if ((dyn->v.vmxcache[j].t == VMX_CACHE_ST_D) + || (dyn->v.vmxcache[j].t == VMX_CACHE_ST_F) + || (dyn->v.vmxcache[j].t == VMX_CACHE_ST_I64)) + ++dyn->v.vmxcache[j].n; + int ret = -1; + dyn->v.tags <<= 2; + for (int i = 0; i < 8; ++i) + if (dyn->v.x87cache[i] != -1) + ++dyn->v.x87cache[i]; + else if (ret == -1) { + dyn->v.x87cache[i] = 0; + ret = dyn->v.x87reg[i] = fpu_get_reg_x87(dyn, t, 0); + dyn->v.vmxcache[ret].t = X87_ST0; + } + if (ret == -1) { + MESSAGE(LOG_DUMP, "Incoherent x87 stack cache, aborting\n"); + dyn->abort = 1; + } + return ret; +} + +void x87_do_push_empty(dynarec_ppc64le_t* dyn, int ninst, int s1) +{ + dyn->insts[ninst].x87_used = 1; + if (dyn->v.mmxcount) + mmx_purgecache(dyn, ninst, 0, s1); + dyn->v.x87stack += 1; + dyn->v.stack += 1; + dyn->v.stack_next += 1; + dyn->v.stack_push += 1; + ++dyn->v.pushed; + if (dyn->v.poped) + --dyn->v.poped; + // move all regs in cache + for (int j = 0; j < 32; ++j) + if ((dyn->v.vmxcache[j].t == VMX_CACHE_ST_D) + || (dyn->v.vmxcache[j].t == VMX_CACHE_ST_F) + || (dyn->v.vmxcache[j].t == VMX_CACHE_ST_I64)) + ++dyn->v.vmxcache[j].n; + int ret = -1; + dyn->v.tags <<= 2; + for (int i = 0; i < 8; ++i) + if (dyn->v.x87cache[i] != -1) + ++dyn->v.x87cache[i]; + else if (ret == -1) + ret = i; + if (ret == -1) { + MESSAGE(LOG_DUMP, "Incoherent x87 stack cache, aborting\n"); + dyn->abort = 1; + } +} + +static void internal_x87_dopop(dynarec_ppc64le_t* dyn) +{ + for (int i = 0; i < 8; ++i) + if (dyn->v.x87cache[i] != -1) { + --dyn->v.x87cache[i]; + if (dyn->v.x87cache[i] == -1) { + fpu_free_reg(dyn, dyn->v.x87reg[i]); + dyn->v.x87reg[i] = -1; + } + } +} + +static int internal_x87_dofree(dynarec_ppc64le_t* dyn) +{ + if (dyn->v.tags & 0b11) { + MESSAGE(LOG_DUMP, "\t--------x87 FREED ST0, poping 1 more\n"); + return 1; + } + return 0; +} + +void x87_do_pop(dynarec_ppc64le_t* dyn, int ninst, int s1) +{ + dyn->insts[ninst].x87_used = 1; + if (dyn->v.mmxcount) + mmx_purgecache(dyn, ninst, 0, s1); + do { + dyn->v.x87stack -= 1; + dyn->v.stack_next -= 1; + dyn->v.stack_pop += 1; + dyn->v.tags >>= 2; + ++dyn->v.poped; + if (dyn->v.pushed) + --dyn->v.pushed; + // move all regs in cache, poping ST0 + internal_x87_dopop(dyn); + } while (internal_x87_dofree(dyn)); +} + +void x87_purgecache(dynarec_ppc64le_t* dyn, int ninst, int next, int s1, int s2, int s3) +{ + dyn->insts[ninst].x87_used = 1; + int ret = 0; + for (int i = 0; i < 8 && !ret; ++i) + if (dyn->v.x87cache[i] != -1) + ret = 1; + if (!ret && !dyn->v.x87stack) // nothing to do + return; + MESSAGE(LOG_DUMP, "\tPurge %sx87 Cache and Synch Stackcount (%+d)---\n", next ? 
"locally " : "", dyn->v.x87stack); + int a = dyn->v.x87stack; + if (a != 0) { + // reset x87stack + if (!next) + dyn->v.x87stack = 0; + // Add x87stack to emu fpu_stack + LWZ(s2, offsetof(x64emu_t, fpu_stack), xEmu); + ADDI(s2, s2, a); + STW(s2, offsetof(x64emu_t, fpu_stack), xEmu); + // Sub x87stack to top, with and 7 + LWZ(s2, offsetof(x64emu_t, top), xEmu); + ADDI(s2, s2, -a); + ANDId(s2, s2, 7); + STW(s2, offsetof(x64emu_t, top), xEmu); + // update tags + LHZ(s1, offsetof(x64emu_t, fpu_tags), xEmu); + if (a > 0) { + SLDI(s1, s1, a * 2); + } else { + MOV32w(s3, 0xffff0000); + OR(s1, s1, s3); + SRDI(s1, s1, -a * 2); + } + STH(s1, offsetof(x64emu_t, fpu_tags), xEmu); + } else { + LWZ(s2, offsetof(x64emu_t, top), xEmu); + } + // check if free is used + if (dyn->v.tags) { + LHZ(s1, offsetof(x64emu_t, fpu_tags), xEmu); + MOV32w(s3, dyn->v.tags); + OR(s1, s1, s3); + STH(s1, offsetof(x64emu_t, fpu_tags), xEmu); + } + if (ret != 0) { + // --- set values + // Get top + // loop all cache entries + for (int i = 0; i < 8; ++i) + if (dyn->v.x87cache[i] != -1) { + int st = dyn->v.x87cache[i] + dyn->v.stack_pop; + // don't force promotion here + ADDI(s3, s2, dyn->v.x87cache[i]); // unadjusted count, as it's relative to real top + ANDId(s3, s3, 7); // (emu->top + st)&7 + SLDI(s1, s3, 3); + ADD(s1, xEmu, s1); + switch (vmxcache_get_current_st(dyn, ninst, st)) { + case VMX_CACHE_ST_D: + STFD(dyn->v.x87reg[i], offsetof(x64emu_t, x87), s1); // save the value + break; + case VMX_CACHE_ST_F: + // PPC64LE FPRs always hold doubles; ST_F value is already double + FMR(SCRATCH0, dyn->v.x87reg[i]); + STFD(SCRATCH0, offsetof(x64emu_t, x87), s1); // save the value + break; + case VMX_CACHE_ST_I64: + FCFID(SCRATCH0, dyn->v.x87reg[i]); + STFD(SCRATCH0, offsetof(x64emu_t, x87), s1); // save the value + break; + } + if (!next) { + fpu_free_reg(dyn, dyn->v.x87reg[i]); + dyn->v.x87reg[i] = -1; + dyn->v.x87cache[i] = -1; + // dyn->v.stack_pop+=1; //no pop, but the purge because of barrier will have the v.barrier flags set + } + } + } + if (!next) { + dyn->v.stack_next = 0; + dyn->v.tags = 0; +#if STEP < 2 + // refresh the cached valued, in case it's a purge outside a instruction + dyn->insts[ninst].v.barrier = 1; + dyn->v.pushed = 0; + dyn->v.poped = 0; + +#endif + } + MESSAGE(LOG_DUMP, "\t---Purge x87 Cache and Synch Stackcount\n"); +} + +void x87_reflectcount(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2) +{ + // Synch top and stack count + int a = dyn->v.x87stack; + if (a) { + MESSAGE(LOG_DUMP, "\tSync x87 Count of %d-----\n", a); + // Add x87stack to emu fpu_stack + LWZ(s2, offsetof(x64emu_t, fpu_stack), xEmu); + ADDI(s2, s2, a); + STW(s2, offsetof(x64emu_t, fpu_stack), xEmu); + // Sub x87stack to top, with and 7 + LWZ(s2, offsetof(x64emu_t, top), xEmu); + ADDI(s2, s2, -a); + ANDId(s2, s2, 7); + STW(s2, offsetof(x64emu_t, top), xEmu); + // update tags + LHZ(s1, offsetof(x64emu_t, fpu_tags), xEmu); + if (a > 0) { + SLDI(s1, s1, a * 2); + } else { + MOV32w(s2, 0xffff0000); + OR(s1, s1, s2); + SRDI(s1, s1, -a * 2); + } + STH(s1, offsetof(x64emu_t, fpu_tags), xEmu); + MESSAGE(LOG_DUMP, "\t-----Sync x87 Count\n"); + } +} + +static void x87_reflectcache(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3) +{ + // Sync top and stack count + int a = dyn->v.x87stack; + if (a) { + // Add x87stack to emu fpu_stack + LWZ(s2, offsetof(x64emu_t, fpu_stack), xEmu); + ADDI(s2, s2, a); + STW(s2, offsetof(x64emu_t, fpu_stack), xEmu); + // Sub x87stack to top, with and 7 + LWZ(s2, offsetof(x64emu_t, top), xEmu); + ADDI(s2, s2, 
-a); + ANDId(s2, s2, 7); + STW(s2, offsetof(x64emu_t, top), xEmu); + // update tags + LHZ(s1, offsetof(x64emu_t, fpu_tags), xEmu); + if (a > 0) { + SLDI(s1, s1, a * 2); + } else { + MOV32w(s3, 0xffff0000); + OR(s1, s1, s3); + SRDI(s1, s1, -a * 2); + } + STH(s1, offsetof(x64emu_t, fpu_tags), xEmu); + } + int ret = 0; + for (int i = 0; (i < 8) && (!ret); ++i) + if (dyn->v.x87cache[i] != -1) + ret = 1; + if (!ret) // nothing to do + return; + // prepare offset to fpu => s1 + // Get top + if (!a) { + LWZ(s2, offsetof(x64emu_t, top), xEmu); + } + // loop all cache entries + for (int i = 0; i < 8; ++i) + if (dyn->v.x87cache[i] != -1) { + ADDI(s3, s2, dyn->v.x87cache[i]); + ANDId(s3, s3, 7); // (emu->top + i)&7 + SLDI(s1, s3, 3); + ADD(s1, xEmu, s1); + if (vmxcache_get_current_st_f(dyn, dyn->v.x87cache[i]) >= 0) { + // PPC64LE FPRs always hold doubles; ST_F is already double, just copy + FMR(SCRATCH0, dyn->v.x87reg[i]); + STFD(SCRATCH0, offsetof(x64emu_t, x87), s1); + } else + STFD(dyn->v.x87reg[i], offsetof(x64emu_t, x87), s1); + } +} + +void x87_unreflectcount(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2) +{ + // revert top and stack count + int a = dyn->v.x87stack; + if (a) { + // Sub x87stack to emu fpu_stack + LWZ(s2, offsetof(x64emu_t, fpu_stack), xEmu); + ADDI(s2, s2, -a); + STW(s2, offsetof(x64emu_t, fpu_stack), xEmu); + // Add x87stack to top, with and 7 + LWZ(s2, offsetof(x64emu_t, top), xEmu); + ADDI(s2, s2, a); + ANDId(s2, s2, 7); + STW(s2, offsetof(x64emu_t, top), xEmu); + // update tags + LHZ(s1, offsetof(x64emu_t, fpu_tags), xEmu); + if (a > 0) { + MOV32w(s2, 0xffff0000); + OR(s1, s1, s2); + SRDI(s1, s1, a * 2); + } else { + SLDI(s1, s1, -a * 2); + } + STH(s1, offsetof(x64emu_t, fpu_tags), xEmu); + } +} + +int x87_get_current_cache(dynarec_ppc64le_t* dyn, int ninst, int st, int t) +{ + dyn->insts[ninst].x87_used = 1; + // search in cache first + for (int i = 0; i < 8; ++i) { + if (dyn->v.x87cache[i] == st) { +#if STEP == 1 + if (t == VMX_CACHE_ST_D && (dyn->v.vmxcache[dyn->v.x87reg[i]].t == VMX_CACHE_ST_F || dyn->v.vmxcache[dyn->v.x87reg[i]].t == VMX_CACHE_ST_I64)) + vmxcache_promote_double(dyn, ninst, st); + else if (t == VMX_CACHE_ST_I64 && (dyn->v.vmxcache[dyn->v.x87reg[i]].t == VMX_CACHE_ST_F)) + vmxcache_promote_double(dyn, ninst, st); + else if (t == VMX_CACHE_ST_F && (dyn->v.vmxcache[dyn->v.x87reg[i]].t == VMX_CACHE_ST_I64)) + vmxcache_promote_double(dyn, ninst, st); +#endif + return i; + } + assert(dyn->v.x87cache[i] < 8); + } + return -1; +} + +int x87_get_cache(dynarec_ppc64le_t* dyn, int ninst, int populate, int s1, int s2, int st, int t) +{ + dyn->insts[ninst].x87_used = 1; + if (dyn->v.mmxcount) + mmx_purgecache(dyn, ninst, 0, s1); + int ret = x87_get_current_cache(dyn, ninst, st, t); + if (ret != -1) + return ret; + MESSAGE(LOG_DUMP, "\tCreate %sx87 Cache for ST%d\n", populate ? 
"and populate " : "", st); + // get a free spot + for (int i = 0; (i < 8) && (ret == -1); ++i) + if (dyn->v.x87cache[i] == -1) + ret = i; + // found, setup and grab the value + dyn->v.x87cache[ret] = st; + dyn->v.x87reg[ret] = fpu_get_reg_x87(dyn, VMX_CACHE_ST_D, st); + if (populate) { + LWZ(s2, offsetof(x64emu_t, top), xEmu); + int a = st - dyn->v.x87stack; + if (a) { + ADDI(s2, s2, a); + ANDId(s2, s2, 7); + } + SLDI(s2, s2, 3); + ADD(s1, xEmu, s2); + LFD(dyn->v.x87reg[ret], offsetof(x64emu_t, x87), s1); + } + MESSAGE(LOG_DUMP, "\t-------x87 Cache for ST%d\n", st); + + return ret; +} + +int x87_get_vmxcache(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int st) +{ + dyn->insts[ninst].x87_used = 1; + for (int ii = 0; ii < 32; ++ii) + if ((dyn->v.vmxcache[ii].t == VMX_CACHE_ST_F + || dyn->v.vmxcache[ii].t == VMX_CACHE_ST_D + || dyn->v.vmxcache[ii].t == VMX_CACHE_ST_I64) + && dyn->v.vmxcache[ii].n == st) + return ii; + dynarec_log(LOG_NONE, "Warning: x87_get_vmxcache didn't find cache for ninst=%d\n", ninst); + return -1; +} + +int x87_get_st(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int a, int t) +{ + dyn->insts[ninst].x87_used = 1; + return dyn->v.x87reg[x87_get_cache(dyn, ninst, 1, s1, s2, a, t)]; +} + +int x87_get_st_empty(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int a, int t) +{ + dyn->insts[ninst].x87_used = 1; + return dyn->v.x87reg[x87_get_cache(dyn, ninst, 0, s1, s2, a, t)]; +} + +// MMX helpers +static int isx87Empty(dynarec_ppc64le_t* dyn) +{ + for (int i = 0; i < 8; ++i) + if (dyn->v.x87cache[i] != -1) + return 0; + return 1; +} + +// get vmx register for a MMX reg, create the entry if needed +int mmx_get_reg(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int a) +{ + dyn->insts[ninst].mmx_used = 1; + if (!dyn->v.x87stack && isx87Empty(dyn)) + x87_purgecache(dyn, ninst, 0, s1, s2, s3); + if (dyn->v.mmxcache[a] != -1) + return dyn->v.mmxcache[a]; + ++dyn->v.mmxcount; + int ret = dyn->v.mmxcache[a] = fpu_get_reg_emm(dyn, a); + // MMX lives in VR space (vr24-vr31); load 64-bit value via GPR then move to VR + LD(s1, offsetof(x64emu_t, mmx[a]), xEmu); + MTVSRDD(VSXREG(ret), xZR, s1); // high=0, low=data + return ret; +} + +// get vmx register for a MMX reg, but don't try to synch it if it needed to be created +int mmx_get_reg_empty(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int a) +{ + dyn->insts[ninst].mmx_used = 1; + if (!dyn->v.x87stack && isx87Empty(dyn)) + x87_purgecache(dyn, ninst, 0, s1, s2, s3); + if (dyn->v.mmxcache[a] != -1) + return dyn->v.mmxcache[a]; + ++dyn->v.mmxcount; + int ret = dyn->v.mmxcache[a] = fpu_get_reg_emm(dyn, a); + return ret; +} + +// purge the MMX cache only +void mmx_purgecache(dynarec_ppc64le_t* dyn, int ninst, int next, int s1) +{ + if (!dyn->v.mmxcount) return; + if (!next) dyn->v.mmxcount = 0; + int old = -1; + for (int i = 0; i < 8; ++i) + if (dyn->v.mmxcache[i] != -1) { + if (old == -1) { + MESSAGE(LOG_DUMP, "\tPurge %sMMX Cache ------\n", next ? 
"locally " : ""); + ++old; + } + // MMX lives in VR space; store via GPR + MFVSRLD(s1, VSXREG(dyn->v.mmxcache[i])); + STD(s1, offsetof(x64emu_t, mmx[i]), xEmu); + if (!next) { + fpu_free_reg(dyn, dyn->v.mmxcache[i]); + dyn->v.mmxcache[i] = -1; + } + } + if (old != -1) { + MESSAGE(LOG_DUMP, "\t------ Purge MMX Cache\n"); + } +} + +static void mmx_reflectcache(dynarec_ppc64le_t* dyn, int ninst, int s1) +{ + for (int i = 0; i < 8; ++i) + if (dyn->v.mmxcache[i] != -1) { + // MMX lives in VR space; store via GPR + MFVSRLD(s1, VSXREG(dyn->v.mmxcache[i])); + STD(s1, offsetof(x64emu_t, mmx[i]), xEmu); + } +} + +// SSE / SSE2 helpers +// get vmx register for a SSE reg, create the entry if needed +int sse_get_reg(dynarec_ppc64le_t* dyn, int ninst, int s1, int a, int forwrite) +{ + dyn->v.xmm_used |= 1 << a; + if (dyn->v.ssecache[a].v != -1) { + if (forwrite) { + dyn->v.ssecache[a].write = 1; // update only if forwrite + dyn->v.vmxcache[dyn->v.ssecache[a].reg].t = VMX_CACHE_XMMW; + } + return dyn->v.ssecache[a].reg; + } + int need_vld = 1; + int avx_was_dirty = 0; + // migrate from avx to sse + if (dyn->v.avxcache[a].v != -1) { + avx_was_dirty = dyn->v.avxcache[a].write; + avx_reflect_reg_upper128(dyn, ninst, a, forwrite); + dyn->v.avxcache[a].v = -1; + need_vld = 0; + } + int is_write = forwrite || avx_was_dirty; + dyn->v.ssecache[a].reg = fpu_get_reg_xmm(dyn, is_write ? VMX_CACHE_XMMW : VMX_CACHE_XMMR, a); + int ret = dyn->v.ssecache[a].reg; + dyn->v.ssecache[a].write = is_write; + if(need_vld) LXV(VSXREG(ret), offsetof(x64emu_t, xmm[a]), xEmu); //skip load if migrate from avx + return ret; +} + +// get vmx register for an SSE reg, but don't try to synch it if it needed to be created +int sse_get_reg_empty(dynarec_ppc64le_t* dyn, int ninst, int s1, int a) +{ + dyn->v.xmm_used |= 1 << a; + if (dyn->v.ssecache[a].v != -1) { + dyn->v.ssecache[a].write = 1; + dyn->v.vmxcache[dyn->v.ssecache[a].reg].t = VMX_CACHE_XMMW; + return dyn->v.ssecache[a].reg; + } + // migrate from avx to sse + if (dyn->v.avxcache[a].v != -1) { + avx_reflect_reg_upper128(dyn, ninst, a, 1); + dyn->v.avxcache[a].v = -1; + } + dyn->v.ssecache[a].reg = fpu_get_reg_xmm(dyn, VMX_CACHE_XMMW, a); + dyn->v.ssecache[a].write = 1; // it will be write... 
+ return dyn->v.ssecache[a].reg; +} + +// forget ext register for a SSE reg, does nothing if the reg is not loaded +void sse_forget_reg(dynarec_ppc64le_t* dyn, int ninst, int a) +{ + dyn->v.xmm_used |= 1 << a; + if (dyn->v.ssecache[a].v == -1) + return; + if (dyn->v.vmxcache[dyn->v.ssecache[a].reg].t == VMX_CACHE_XMMW) { + STXV(VSXREG(dyn->v.ssecache[a].reg), offsetof(x64emu_t, xmm[a]), xEmu); + } + fpu_free_reg(dyn, dyn->v.ssecache[a].reg); + dyn->v.ssecache[a].v = -1; + return; +} + +void sse_reflect_reg(dynarec_ppc64le_t* dyn, int ninst, int a) +{ + dyn->v.xmm_used |= 1 << a; + if (dyn->v.ssecache[a].v == -1) + return; + if (dyn->v.vmxcache[dyn->v.ssecache[a].reg].t == VMX_CACHE_XMMW) { + STXV(VSXREG(dyn->v.ssecache[a].reg), offsetof(x64emu_t, xmm[a]), xEmu); + } +} + +// purge the SSE cache for XMM0..XMM7 (to use before function native call) +void sse_purge07cache(dynarec_ppc64le_t* dyn, int ninst, int s1) +{ + int old = -1; + for (int i = 0; i < 8; ++i) + if (dyn->v.ssecache[i].v != -1 || dyn->v.avxcache[i].v != -1) { + if (old == -1) { + MESSAGE(LOG_DUMP, "\tPurge XMM0..7 Cache ------\n"); + ++old; + } + dyn->v.xmm_used |= 1 << i; + if (dyn->v.vmxcache[dyn->v.avxcache[i].reg].t == VMX_CACHE_YMMW) { + STXV(VSXREG(dyn->v.avxcache[i].reg), offsetof(x64emu_t, xmm[i]), xEmu); + if(dyn->v.avxcache[i].zero_upper == 1){ + XXLXOR(SCRATCH, SCRATCH, SCRATCH); + } else { + // upper 128 is already in ymm[i] memory, no need to extract + } + STXV(SCRATCH, offsetof(x64emu_t, ymm[i]), xEmu); + fpu_free_reg(dyn, dyn->v.avxcache[i].reg); + dyn->v.avxcache[i].v = -1; + } else if (dyn->v.vmxcache[dyn->v.ssecache[i].reg].t == VMX_CACHE_XMMW) { + STXV(VSXREG(dyn->v.ssecache[i].reg), offsetof(x64emu_t, xmm[i]), xEmu); + fpu_free_reg(dyn, dyn->v.ssecache[i].reg); + dyn->v.ssecache[i].v = -1; + } + } + if (old != -1) { + MESSAGE(LOG_DUMP, "\t------ Purge XMM0..7 Cache\n"); + } +} + +// purge the SSE cache only +static void sse_purgecache(dynarec_ppc64le_t* dyn, int ninst, int next, int s1) +{ + int old = -1; + for (int i = 0; i < 16; ++i) + if (dyn->v.ssecache[i].v != -1) { + if (next) dyn->v.xmm_used |= (1 << i); + if (dyn->v.ssecache[i].write) { + if (old == -1) { + MESSAGE(LOG_DUMP, "\tPurge %sSSE Cache ------\n", next ? 
"locally " : ""); + ++old; + } + MESSAGE(LOG_DUMP, "\t xmm[%d]: reg=%d, vsxreg=%d, write=%d, offset=0x%lx\n", + i, dyn->v.ssecache[i].reg, VSXREG(dyn->v.ssecache[i].reg), + dyn->v.ssecache[i].write, + (unsigned long)offsetof(x64emu_t, xmm[i])); + STXV(VSXREG(dyn->v.ssecache[i].reg), offsetof(x64emu_t, xmm[i]), xEmu); + } + if (!next) { + fpu_free_reg(dyn, dyn->v.ssecache[i].reg); + dyn->v.ssecache[i].v = -1; + } + } + if (old != -1) { + MESSAGE(LOG_DUMP, "\t------ Purge SSE Cache\n"); + } +} + +static void sse_reflectcache(dynarec_ppc64le_t* dyn, int ninst, int s1) +{ + for (int i = 0; i < 16; ++i) + if (dyn->v.ssecache[i].v != -1) { + dyn->v.xmm_used |= 1 << i; + if (dyn->v.ssecache[i].write) { + STXV(VSXREG(dyn->v.ssecache[i].reg), offsetof(x64emu_t, xmm[i]), xEmu); + } + } +} + +// AVX helpers +// get vmx register for an AVX reg, create the entry if needed +int avx_get_reg(dynarec_ppc64le_t* dyn, int ninst, int s1, int a, int forwrite, int width) +{ + dyn->v.ymm_used |= 1 << a; + if (dyn->v.avxcache[a].v != -1) { + if (forwrite) { + dyn->v.avxcache[a].write = 1; // update only if forwrite + dyn->v.vmxcache[dyn->v.avxcache[a].reg].t = VMX_CACHE_YMMW; + } + if (width == VMX_AVX_WIDTH_128) { + dyn->v.avxcache[a].width = VMX_AVX_WIDTH_128; + if(forwrite) dyn->v.avxcache[a].zero_upper = 1; + } else { + // if width changed to 256, and zero_upper==1, means need zero-fill upper 128bits now + if (dyn->v.avxcache[a].zero_upper == 1) { + dyn->v.avxcache[a].zero_upper = 0; + // store zeros to ymm[a] memory (upper 128 bits) + XXLXOR(SCRATCH, SCRATCH, SCRATCH); + STXV(SCRATCH, offsetof(x64emu_t, ymm[a]), xEmu); + } + dyn->v.avxcache[a].width = VMX_AVX_WIDTH_256; + } + + return dyn->v.avxcache[a].reg; + } + + // migrate from sse to avx + if (dyn->v.ssecache[a].v != -1) { + // release SSE reg cache + fpu_free_reg(dyn, dyn->v.ssecache[a].reg); + dyn->v.ssecache[a].v = -1; + } + + // new reg + dyn->v.avxcache[a].v = 0; + dyn->v.avxcache[a].reg = fpu_get_reg_ymm(dyn, forwrite ? 
VMX_CACHE_YMMW : VMX_CACHE_YMMR, a); + int ret = dyn->v.avxcache[a].reg; + dyn->v.avxcache[a].write = forwrite; + dyn->v.avxcache[a].width = width; + if (width == VMX_AVX_WIDTH_128) { + if(forwrite) dyn->v.avxcache[a].zero_upper = 1; + LXV(VSXREG(ret), offsetof(x64emu_t, xmm[a]), xEmu); + } else { + // load lower 128 from xmm[a] + LXV(VSXREG(ret), offsetof(x64emu_t, xmm[a]), xEmu); + // upper 128 stays in ymm[a] memory — nothing to load into register + dyn->v.avxcache[a].zero_upper = 0; + } + return ret; +} + +int avx_get_reg_empty(dynarec_ppc64le_t* dyn, int ninst, int s1, int a, int width) +{ + dyn->v.ymm_used |= 1 << a; + if (dyn->v.avxcache[a].v != -1) { + dyn->v.avxcache[a].write = 1; + dyn->v.vmxcache[dyn->v.avxcache[a].reg].t = VMX_CACHE_YMMW; + if (width == VMX_AVX_WIDTH_128) { + dyn->v.avxcache[a].width = VMX_AVX_WIDTH_128; + dyn->v.avxcache[a].zero_upper = 1; + } else { + dyn->v.avxcache[a].width = VMX_AVX_WIDTH_256; + dyn->v.avxcache[a].zero_upper = 0; + } + return dyn->v.avxcache[a].reg; + } + // migrate from sse to avx + if (dyn->v.ssecache[a].v != -1) { + // Release SSE reg cache + fpu_free_reg(dyn, dyn->v.ssecache[a].reg); + dyn->v.ssecache[a].v = -1; + } + dyn->v.avxcache[a].v = 0; + dyn->v.avxcache[a].reg = fpu_get_reg_ymm(dyn, VMX_CACHE_YMMW, a); + dyn->v.avxcache[a].write = 1; + dyn->v.avxcache[a].width = width; + if (width == VMX_AVX_WIDTH_128){ + dyn->v.avxcache[a].zero_upper = 1; + } else { + dyn->v.avxcache[a].zero_upper = 0; + } + return dyn->v.avxcache[a].reg; +} + +void avx_reflect_reg_upper128(dynarec_ppc64le_t* dyn, int ninst, int a, int forwrite) +{ + dyn->v.ymm_used |= 1 << a; + if (dyn->v.avxcache[a].v == -1) + return; + // Write back upper 128 bits if the AVX register was dirty and zero_upper is set, + // regardless of forwrite (the SSE caller needs ymm[a] memory consistent) + if (dyn->v.vmxcache[dyn->v.avxcache[a].reg].t == VMX_CACHE_YMMW) { + if (dyn->v.avxcache[a].zero_upper == 1) { + XXLXOR(SCRATCH, SCRATCH, SCRATCH); + STXV(SCRATCH, offsetof(x64emu_t, ymm[a]), xEmu); + } + // else: upper 128 is already in ymm[a] memory, no-op + } + dyn->v.avxcache[a].v = -1; + return; +} + +void avx_forget_reg(dynarec_ppc64le_t* dyn, int ninst, int a) +{ + dyn->v.ymm_used |= 1 << a; + if (dyn->v.avxcache[a].v == -1) + return; + if (dyn->v.vmxcache[dyn->v.avxcache[a].reg].t == VMX_CACHE_YMMW) { + STXV(VSXREG(dyn->v.avxcache[a].reg), offsetof(x64emu_t, xmm[a]), xEmu); + if (dyn->v.avxcache[a].zero_upper == 1) { + XXLXOR(SCRATCH, SCRATCH, SCRATCH); + } + // else: upper 128 is already in ymm[a] memory + if (dyn->v.avxcache[a].zero_upper == 1) { + STXV(SCRATCH, offsetof(x64emu_t, ymm[a]), xEmu); + } + } + fpu_free_reg(dyn, dyn->v.avxcache[a].reg); + dyn->v.avxcache[a].v = -1; + return; +} + +void avx_reflect_reg(dynarec_ppc64le_t* dyn, int ninst, int a) +{ + dyn->v.ymm_used |= 1 << a; + if (dyn->v.avxcache[a].v == -1) + return; + if (dyn->v.vmxcache[dyn->v.avxcache[a].reg].t == VMX_CACHE_YMMW) { + STXV(VSXREG(dyn->v.avxcache[a].reg), offsetof(x64emu_t, xmm[a]), xEmu); + if (dyn->v.avxcache[a].zero_upper == 1) { + XXLXOR(SCRATCH, SCRATCH, SCRATCH); + STXV(SCRATCH, offsetof(x64emu_t, ymm[a]), xEmu); + } + // else: upper 128 is already in ymm[a] memory, no store needed + dyn->v.avxcache[a].zero_upper = 0; + } +} + +// purge the AVX cache only +static void avx_purgecache(dynarec_ppc64le_t* dyn, int ninst, int next, int s1) +{ + int old = -1; + for (int i = 0; i < 16; ++i) + if (dyn->v.avxcache[i].v != -1) { + dyn->v.ymm_used |= 1 << i; + if (dyn->v.avxcache[i].write) { + if (old 
== -1) { + MESSAGE(LOG_DUMP, "\tPurge %sAVX Cache ------\n", next ? "locally " : ""); + ++old; + } + if (dyn->v.vmxcache[dyn->v.avxcache[i].reg].t == VMX_CACHE_YMMW) { + STXV(VSXREG(dyn->v.avxcache[i].reg), offsetof(x64emu_t, xmm[i]), xEmu); + if (dyn->v.avxcache[i].zero_upper == 1) { + XXLXOR(SCRATCH, SCRATCH, SCRATCH); + } + // else: upper 128 is already in ymm[i] memory + if (dyn->v.avxcache[i].zero_upper == 1) { + STXV(SCRATCH, offsetof(x64emu_t, ymm[i]), xEmu); + } + } + } + if (!next) { + fpu_free_reg(dyn, dyn->v.avxcache[i].reg); + dyn->v.avxcache[i].v = -1; + } + } + if (old != -1) { + MESSAGE(LOG_DUMP, "\t------ Purge AVX Cache\n"); + } +} + +static void avx_reflectcache(dynarec_ppc64le_t* dyn, int ninst, int s1) +{ + for (int i = 0; i < 16; ++i) { + if (dyn->v.avxcache[i].v != -1) { + dyn->v.ymm_used |= 1 << i; + if (dyn->v.avxcache[i].write) { + if (dyn->v.vmxcache[dyn->v.avxcache[i].reg].t == VMX_CACHE_YMMW) { + avx_reflect_reg(dyn, ninst, i); + } + } + } + } +} + +void fpu_pushcache(dynarec_ppc64le_t* dyn, int ninst, int s1, int not07) +{ + int start = not07 ? 8 : 0; + int n = 0; + + for (int i = start; i < 16; i++) { + if ((dyn->v.ssecache[i].v != -1) && (dyn->v.ssecache[i].write)) + ++n; + if ((dyn->v.avxcache[i].v != -1) && (dyn->v.avxcache[i].write)) + ++n; + } + + if (n) { + MESSAGE(LOG_DUMP, "\tPush XMM/YMM Cache (%d)------\n", n); + for (int i = start; i < 16; ++i) { + if ((dyn->v.ssecache[i].v != -1) && (dyn->v.ssecache[i].write)) { + STXV(VSXREG(dyn->v.ssecache[i].reg), offsetof(x64emu_t, xmm[i]), xEmu); + } + if ((dyn->v.avxcache[i].v != -1) && (dyn->v.avxcache[i].write)) { + STXV(VSXREG(dyn->v.avxcache[i].reg), offsetof(x64emu_t, xmm[i]), xEmu); + // upper 128: if zero_upper==0, it's already in ymm[] memory; nothing to do + // if zero_upper==1, we don't need to push zeros (will be handled on reflect/purge) + } + } + MESSAGE(LOG_DUMP, "\t------- Pushed XMM/YMM Cache (%d)\n", n); + } +} + +void fpu_popcache(dynarec_ppc64le_t* dyn, int ninst, int s1, int not07) +{ + int start = not07 ? 
8 : 0; + int n = 0; + + for (int i = start; i < 16; i++) { + if (dyn->v.ssecache[i].v != -1 || dyn->v.avxcache[i].v != -1) + ++n; + } + + if (n) { + MESSAGE(LOG_DUMP, "\tPop XMM/YMM Cache (%d)------\n", n); + for (int i = start; i < 16; ++i) { + if (dyn->v.ssecache[i].v != -1) { + LXV(VSXREG(dyn->v.ssecache[i].reg), offsetof(x64emu_t, xmm[i]), xEmu); + } + if (dyn->v.avxcache[i].v != -1) { + LXV(VSXREG(dyn->v.avxcache[i].reg), offsetof(x64emu_t, xmm[i]), xEmu); + // upper 128 stays in ymm[i] memory — nothing to load + } + } + MESSAGE(LOG_DUMP, "\t------- Pop XMM/YMM Cache (%d)\n", n); + } +} + +void fpu_purgecache(dynarec_ppc64le_t* dyn, int ninst, int next, int s1, int s2, int s3) +{ + x87_purgecache(dyn, ninst, next, s1, s2, s3); + mmx_purgecache(dyn, ninst, next, s1); + sse_purgecache(dyn, ninst, next, s1); + avx_purgecache(dyn, ninst, next, s1); + if (!next) { + fpu_reset_reg(dyn); + dyn->insts[ninst].fpupurge = 1; + } +} + +void fpu_reflectcache(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3) +{ + x87_reflectcache(dyn, ninst, s1, s2, s3); + mmx_reflectcache(dyn, ninst, s1); + sse_reflectcache(dyn, ninst, s1); + avx_reflectcache(dyn, ninst, s1); +} + +void fpu_unreflectcache(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3) +{ + // need to undo the top and stack tracking that must not be reflected permanently yet + x87_unreflectcount(dyn, ninst, s1, s2); +} + +void emit_pf(dynarec_ppc64le_t* dyn, int ninst, int s1, int s3, int s4) +{ + MAYUSE(dyn); + MAYUSE(ninst); + + // Compute parity of low byte: fold bits down to bit 0 + SRDI(s3, s1, 4); + XOR(s3, s3, s1); + + SRDI(s4, s3, 2); + XOR(s4, s3, s4); + + SRDI(s3, s4, 1); + XOR(s3, s3, s4); + + XORI(s3, s3, 1); + // Insert bit 0 of s3 into xFlags at F_PF position + ANDId(s4, s3, 1); // isolate bit 0 of s3, result in s4 + SLDI(s4, s4, F_PF); // shift to F_PF position + LI(s3, 1); + SLDI(s3, s3, F_PF); + ANDC(xFlags, xFlags, s3); // xFlags &= ~(1 << F_PF) + OR(xFlags, xFlags, s4); // xFlags |= (pf_bit << F_PF) +} + +void fpu_reset_cache(dynarec_ppc64le_t* dyn, int ninst, int reset_n) +{ + MESSAGE(LOG_DEBUG, "Reset Caches with %d\n", reset_n); + #if STEP > 1 + // for STEP 2 & 3, just need to refresh with current, and undo the changes (push & swap) + dyn->v = dyn->insts[ninst].v; + vmxcacheUnwind(&dyn->v); +#else + dyn->v = dyn->insts[reset_n].v; +#endif +#if STEP == 0 + if(dyn->need_dump && dyn->v.x87stack) dynarec_log(LOG_NONE, "New x87stack=%d at ResetCache in inst %d with %d\n", dyn->v.x87stack, ninst, reset_n); +#endif +#if defined(HAVE_TRACE) && (STEP > 2) + if (dyn->need_dump && 0) // disable for now + if (memcmp(&dyn->v, &dyn->insts[reset_n].v, sizeof(vmxcache_t))) { + MESSAGE(LOG_DEBUG, "Warning, difference in vmxcache: reset="); + for (int i = 0; i < 32; ++i) + if (dyn->insts[reset_n].v.vmxcache[i].v) + MESSAGE(LOG_DEBUG, " %02d:%s", i, getCacheName(dyn->insts[reset_n].v.vmxcache[i].t, dyn->insts[reset_n].v.vmxcache[i].n)); + if (dyn->insts[reset_n].v.combined1 || dyn->insts[reset_n].v.combined2) + MESSAGE(LOG_DEBUG, " %s:%02d/%02d", dyn->insts[reset_n].v.swapped ? 
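emit_pf()'s fold is a standard parity reduction; here is a runnable check that it matches a popcount-based definition of PF (illustration only; __builtin_popcount assumes GCC/Clang, which this codebase builds with):

```c
#include <assert.h>

// Same xor-shift fold emit_pf() emits with SRDI/XOR: after the three folds,
// bit 0 holds the XOR of bits 0..7 of the input. PF is the *even*-parity
// flag, hence the final ^1 (the XORI in the emitted code).
static int pf(unsigned v)
{
    v ^= v >> 4;
    v ^= v >> 2;
    v ^= v >> 1;
    return (v ^ 1) & 1;
}

int main(void)
{
    for (unsigned v = 0; v < 256; ++v)
        assert(pf(v) == !(__builtin_popcount(v) & 1)); // PF=1 for an even bit count
    return 0;
}
```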
"SWP" : "CMB", dyn->insts[reset_n].v.combined1, dyn->insts[reset_n].v.combined2); + if (dyn->insts[reset_n].v.stack_push || dyn->insts[reset_n].v.stack_pop) + MESSAGE(LOG_DEBUG, " (%d:%d)", dyn->insts[reset_n].v.stack_push, -dyn->insts[reset_n].v.stack_pop); + MESSAGE(LOG_DEBUG, " ==> "); + for (int i = 0; i < 32; ++i) + if (dyn->insts[ninst].v.vmxcache[i].v) + MESSAGE(LOG_DEBUG, " %02d:%s", i, getCacheName(dyn->insts[ninst].v.vmxcache[i].t, dyn->insts[ninst].v.vmxcache[i].n)); + if (dyn->insts[ninst].v.combined1 || dyn->insts[ninst].v.combined2) + MESSAGE(LOG_DEBUG, " %s:%02d/%02d", dyn->insts[ninst].v.swapped ? "SWP" : "CMB", dyn->insts[ninst].v.combined1, dyn->insts[ninst].v.combined2); + if (dyn->insts[ninst].v.stack_push || dyn->insts[ninst].v.stack_pop) + MESSAGE(LOG_DEBUG, " (%d:%d)", dyn->insts[ninst].v.stack_push, -dyn->insts[ninst].v.stack_pop); + MESSAGE(LOG_DEBUG, " -> "); + for (int i = 0; i < 32; ++i) + if (dyn->v.vmxcache[i].v) + MESSAGE(LOG_DEBUG, " %02d:%s", i, getCacheName(dyn->v.vmxcache[i].t, dyn->v.vmxcache[i].n)); + if (dyn->v.combined1 || dyn->v.combined2) + MESSAGE(LOG_DEBUG, " %s:%02d/%02d", dyn->v.swapped ? "SWP" : "CMB", dyn->v.combined1, dyn->v.combined2); + if (dyn->v.stack_push || dyn->v.stack_pop) + MESSAGE(LOG_DEBUG, " (%d:%d)", dyn->v.stack_push, -dyn->v.stack_pop); + MESSAGE(LOG_DEBUG, "\n"); + } +#endif // HAVE_TRACE +} + +// propagate ST stack state, especial stack pop that are deferred +void fpu_propagate_stack(dynarec_ppc64le_t* dyn, int ninst) +{ + if (dyn->v.stack_pop) { + for (int j = 0; j < 32; ++j) + if ((dyn->v.vmxcache[j].t == VMX_CACHE_ST_D + || dyn->v.vmxcache[j].t == VMX_CACHE_ST_F + || dyn->v.vmxcache[j].t == VMX_CACHE_ST_I64)) { + if (dyn->v.vmxcache[j].n < dyn->v.stack_pop) + dyn->v.vmxcache[j].v = 0; + else + dyn->v.vmxcache[j].n -= dyn->v.stack_pop; + } + dyn->v.stack_pop = 0; + } + dyn->v.stack = dyn->v.stack_next; + dyn->v.news = 0; + dyn->v.stack_push = 0; + dyn->v.swapped = 0; +} + + +static int findCacheSlot(dynarec_ppc64le_t* dyn, int ninst, int t, int n, vmxcache_t* cache) +{ + vmx_cache_t f; + f.n = n; + f.t = t; + for (int i = 0; i < 32; ++i) { + if (cache->vmxcache[i].v == f.v) + return i; + if (cache->vmxcache[i].n == n) { + switch (cache->vmxcache[i].t) { + case VMX_CACHE_ST_F: + if (t == VMX_CACHE_ST_D) + return i; + if (t == VMX_CACHE_ST_I64) + return i; + break; + case VMX_CACHE_ST_D: + if (t == VMX_CACHE_ST_F) + return i; + if (t == VMX_CACHE_ST_I64) + return i; + break; + case VMX_CACHE_ST_I64: + if (t == VMX_CACHE_ST_F) + return i; + if (t == VMX_CACHE_ST_D) + return i; + break; + case VMX_CACHE_XMMR: + if (t == VMX_CACHE_XMMW) + return i; + break; + case VMX_CACHE_XMMW: + if (t == VMX_CACHE_XMMR) + return i; + break; + case VMX_CACHE_YMMR: + if (t == VMX_CACHE_YMMW) + return i; + break; + case VMX_CACHE_YMMW: + if (t == VMX_CACHE_YMMR) + return i; + break; + } + } + } + return -1; +} + +static void swapCache(dynarec_ppc64le_t* dyn, int ninst, int i, int j, vmxcache_t* cache) +{ + if (i == j) + return; + int quad = 0; + int mmx = 0; + if (cache->vmxcache[i].t == VMX_CACHE_XMMR || cache->vmxcache[i].t == VMX_CACHE_XMMW) + quad = 1; + if (cache->vmxcache[j].t == VMX_CACHE_XMMR || cache->vmxcache[j].t == VMX_CACHE_XMMW) + quad = 1; + if (cache->vmxcache[i].t == VMX_CACHE_YMMR || cache->vmxcache[i].t == VMX_CACHE_YMMW) + quad = 1; // PPC64LE: YMM in register is still only 128-bit (lower half) + if (cache->vmxcache[j].t == VMX_CACHE_YMMR || cache->vmxcache[j].t == VMX_CACHE_YMMW) + quad = 1; // PPC64LE: YMM in register is 
still only 128-bit (lower half) + if (cache->vmxcache[i].t == VMX_CACHE_MM || cache->vmxcache[j].t == VMX_CACHE_MM) + mmx = 1; // MMX lives in VR space, needs XXLOR with VSXREG + + if (!cache->vmxcache[i].v) { + // a mov is enough, no need to swap + MESSAGE(LOG_DUMP, "\t - Moving %d <- %d\n", i, j); + if (quad) { + XXLOR(VSXREG(i), VSXREG(j), VSXREG(j)); // 128-bit copy + } else if (mmx) { + XXLOR(VSXREG(i), VSXREG(j), VSXREG(j)); // MMX in VR space + } else { + FMR(i, j); // 64-bit copy (x87 ST, indices 16-23 in FPR space) + } + cache->vmxcache[i].v = cache->vmxcache[j].v; + cache->vmxcache[j].v = 0; + return; + } + // SWAP + vmx_cache_t tmp; + MESSAGE(LOG_DUMP, "\t - Swapping %d <-> %d\n", i, j); + // Use SCRATCH as temporary + if (quad) { + XXLOR(SCRATCH, VSXREG(i), VSXREG(i)); // 128-bit swap via scratch + XXLOR(VSXREG(i), VSXREG(j), VSXREG(j)); + XXLOR(VSXREG(j), SCRATCH, SCRATCH); + } else if (mmx) { + XXLOR(SCRATCH, VSXREG(i), VSXREG(i)); // MMX swap via scratch (VR space) + XXLOR(VSXREG(i), VSXREG(j), VSXREG(j)); + XXLOR(VSXREG(j), SCRATCH, SCRATCH); + } else { + FMR(SCRATCH, i); // 64-bit swap via scratch (x87 ST, FPR space) + FMR(i, j); + FMR(j, SCRATCH); + } + tmp.v = cache->vmxcache[i].v; + cache->vmxcache[i].v = cache->vmxcache[j].v; + cache->vmxcache[j].v = tmp.v; +} + +static void loadCache(dynarec_ppc64le_t* dyn, int ninst, int stack_cnt, int s1, int s2, int s3, int* s1_val, int* s2_val, int* s3_top, vmxcache_t* cache, int i, int t, int n) +{ + if (cache->vmxcache[i].v) { + int quad = 0; + int mmx = 0; + if (t == VMX_CACHE_XMMR || t == VMX_CACHE_XMMW) + quad = 1; + if (t == VMX_CACHE_YMMR || t == VMX_CACHE_YMMW) + quad = 1; // PPC64LE: only 128-bit in register + if (cache->vmxcache[i].t == VMX_CACHE_XMMR || cache->vmxcache[i].t == VMX_CACHE_XMMW) + quad = 1; + if (cache->vmxcache[i].t == VMX_CACHE_YMMR || cache->vmxcache[i].t == VMX_CACHE_YMMW) + quad = 1; + if (t == VMX_CACHE_MM || cache->vmxcache[i].t == VMX_CACHE_MM) + mmx = 1; + int j = i + 1; + while (cache->vmxcache[j].v) + ++j; + MESSAGE(LOG_DUMP, "\t - Moving away %d\n", i); + if (quad) { + XXLOR(VSXREG(j), VSXREG(i), VSXREG(i)); // 128-bit move + } else if (mmx) { + XXLOR(VSXREG(j), VSXREG(i), VSXREG(i)); // MMX in VR space + } else { + FMR(j, i); // 64-bit move (x87 ST in FPR space) + } + cache->vmxcache[j].v = cache->vmxcache[i].v; + } + switch (t) { + case VMX_CACHE_YMMR: + case VMX_CACHE_YMMW: + MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n)); + // Load lower 128 from xmm[n]; upper 128 stays in ymm[n] memory + LXV(VSXREG(i), offsetof(x64emu_t, xmm[n]), xEmu); + break; + case VMX_CACHE_XMMR: + case VMX_CACHE_XMMW: + MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n)); + LXV(VSXREG(i), offsetof(x64emu_t, xmm[n]), xEmu); + break; + case VMX_CACHE_MM: + MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n)); + LD(s1, offsetof(x64emu_t, mmx[n]), xEmu); + MTVSRDD(VSXREG(i), xZR, s1); + break; + case VMX_CACHE_ST_D: + case VMX_CACHE_ST_F: + case VMX_CACHE_ST_I64: + MESSAGE(LOG_DUMP, "\t - Loading %s\n", getCacheName(t, n)); + if ((*s3_top) == 0xffff) { + LWZ(s3, offsetof(x64emu_t, top), xEmu); + *s3_top = 0; + } + int a = n - (*s3_top) - stack_cnt; + if (a) { + ADDI(s3, s3, a); + ANDId(s3, s3, 7); // (emu->top + i)&7 + } + *s3_top += a; + *s2_val = 0; + SLDI(s2, s3, 3); + ADD(s2, xEmu, s2); + LFD(i, offsetof(x64emu_t, x87), s2); + if (t == VMX_CACHE_ST_F) { + FRSP(i, i); + } + if (t == VMX_CACHE_ST_I64) { + FCTIDZ(i, i); + } + break; + case VMX_CACHE_NONE: + case VMX_CACHE_SCR: + default: /* 
nothing done */ + MESSAGE(LOG_DUMP, "\t - ignoring %s\n", getCacheName(t, n)); + break; + } + cache->vmxcache[i].n = n; + cache->vmxcache[i].t = t; +} + +static void unloadCache(dynarec_ppc64le_t* dyn, int ninst, int stack_cnt, int s1, int s2, int s3, int* s1_val, int* s2_val, int* s3_top, vmxcache_t* cache, int i, int t, int n) +{ + switch (t) { + case VMX_CACHE_XMMR: + case VMX_CACHE_YMMR: + MESSAGE(LOG_DUMP, "\t - ignoring %s\n", getCacheName(t, n)); + break; + case VMX_CACHE_XMMW: + MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n)); + STXV(VSXREG(i), offsetof(x64emu_t, xmm[n]), xEmu); + break; + case VMX_CACHE_YMMW: + MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n)); + // Store lower 128 to xmm[n]; upper 128 is already in ymm[n] memory + STXV(VSXREG(i), offsetof(x64emu_t, xmm[n]), xEmu); + break; + case VMX_CACHE_MM: + MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n)); + MFVSRLD(s1, VSXREG(i)); + STD(s1, offsetof(x64emu_t, mmx[n]), xEmu); + break; + case VMX_CACHE_ST_D: + case VMX_CACHE_ST_F: + case VMX_CACHE_ST_I64: + MESSAGE(LOG_DUMP, "\t - Unloading %s\n", getCacheName(t, n)); + if ((*s3_top) == 0xffff) { + LWZ(s3, offsetof(x64emu_t, top), xEmu); + *s3_top = 0; + } + int a = n - (*s3_top) - stack_cnt; + if (a) { + ADDI(s3, s3, a); + ANDId(s3, s3, 7); + } + *s3_top += a; + SLDI(s2, s3, 3); + ADD(s2, xEmu, s2); + *s2_val = 0; + if (t == VMX_CACHE_ST_F) { + // PPC64LE FPRs always hold doubles; ST_F is already double in register + // No conversion needed — just store as double + } + if (t == VMX_CACHE_ST_I64) { + FCFID(i, i); + } + STFD(i, offsetof(x64emu_t, x87), s2); + break; + case VMX_CACHE_NONE: + case VMX_CACHE_SCR: + default: /* nothing done */ + MESSAGE(LOG_DUMP, "\t - ignoring %s\n", getCacheName(t, n)); + break; + } + cache->vmxcache[i].v = 0; +} + +static void fpuCacheTransform(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3) +{ + int i2 = dyn->insts[ninst].x64.jmp_insts; + if (i2 < 0) + return; + MESSAGE(LOG_DUMP, "\tCache Transform ---- ninst=%d -> %d\n", ninst, i2); + if ((!i2) || (dyn->insts[i2].x64.barrier & BARRIER_FLOAT)) { + if (dyn->v.stack_next) { + fpu_purgecache(dyn, ninst, 1, s1, s2, s3); + MESSAGE(LOG_DUMP, "\t---- Cache Transform\n"); + return; + } + for (int i = 0; i < 32; ++i) + if (dyn->v.vmxcache[i].v) { // there is something at ninst for i + fpu_purgecache(dyn, ninst, 1, s1, s2, s3); + MESSAGE(LOG_DUMP, "\t---- Cache Transform\n"); + return; + } + MESSAGE(LOG_DUMP, "\t---- Cache Transform\n"); + return; + } + vmxcache_t cache_i2 = dyn->insts[i2].v; + vmxcacheUnwind(&cache_i2); + + if (!cache_i2.stack) { + int purge = 1; + for (int i = 0; i < 32 && purge; ++i) + if (cache_i2.vmxcache[i].v) + purge = 0; + if (purge) { + fpu_purgecache(dyn, ninst, 1, s1, s2, s3); + MESSAGE(LOG_DUMP, "\t---- Cache Transform\n"); + return; + } + } + int stack_cnt = dyn->v.stack_next; + int s3_top = 0xffff; + vmxcache_t cache = dyn->v; + int s1_val = 0; + int s2_val = 0; + // unload every unneeded cache + // check SSE first, then MMX, in order, for optimization issue + for (int i = 0; i < 16; ++i) { + int j = findCacheSlot(dyn, ninst, VMX_CACHE_YMMW, i, &cache); + if (j >= 0 && findCacheSlot(dyn, ninst, VMX_CACHE_YMMW, i, &cache_i2) == -1) + unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.vmxcache[j].t, cache.vmxcache[j].n); + } + for (int i = 0; i < 16; ++i) { + int j = findCacheSlot(dyn, ninst, VMX_CACHE_XMMW, i, &cache); + if (j >= 0 && findCacheSlot(dyn, ninst, VMX_CACHE_XMMW, i, 
&cache_i2) == -1) + unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.vmxcache[j].t, cache.vmxcache[j].n); + } + for (int i = 0; i < 8; ++i) { + int j = findCacheSlot(dyn, ninst, VMX_CACHE_MM, i, &cache); + if (j >= 0 && findCacheSlot(dyn, ninst, VMX_CACHE_MM, i, &cache_i2) == -1) + unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, j, cache.vmxcache[j].t, cache.vmxcache[j].n); + } + for (int i = 0; i < 32; ++i) { + if (cache.vmxcache[i].v) + if (findCacheSlot(dyn, ninst, cache.vmxcache[i].t, cache.vmxcache[i].n, &cache_i2) == -1) + unloadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, i, cache.vmxcache[i].t, cache.vmxcache[i].n); + } + // and now load/swap the missing ones + for (int i = 0; i < 32; ++i) { + if (cache_i2.vmxcache[i].v) { + if (cache_i2.vmxcache[i].v != cache.vmxcache[i].v) { + int j; + if ((j = findCacheSlot(dyn, ninst, cache_i2.vmxcache[i].t, cache_i2.vmxcache[i].n, &cache)) == -1) + loadCache(dyn, ninst, stack_cnt, s1, s2, s3, &s1_val, &s2_val, &s3_top, &cache, i, cache_i2.vmxcache[i].t, cache_i2.vmxcache[i].n); + else { + // it's here, let's swap if needed + if (j != i) + swapCache(dyn, ninst, i, j, &cache); + } + } + if (cache.vmxcache[i].t != cache_i2.vmxcache[i].t) { + if (cache.vmxcache[i].t == VMX_CACHE_ST_D && cache_i2.vmxcache[i].t == VMX_CACHE_ST_F) { + MESSAGE(LOG_DUMP, "\t - Convert %s\n", getCacheName(cache.vmxcache[i].t, cache.vmxcache[i].n)); + FRSP(i, i); + cache.vmxcache[i].t = VMX_CACHE_ST_F; + } else if (cache.vmxcache[i].t == VMX_CACHE_ST_F && cache_i2.vmxcache[i].t == VMX_CACHE_ST_D) { + MESSAGE(LOG_DUMP, "\t - Convert %s\n", getCacheName(cache.vmxcache[i].t, cache.vmxcache[i].n)); + // PPC64LE: ST_F is already a double in FPR, no conversion needed + FMR(i, i); // no-op move, just to be explicit + cache.vmxcache[i].t = VMX_CACHE_ST_D; + } else if (cache.vmxcache[i].t == VMX_CACHE_ST_D && cache_i2.vmxcache[i].t == VMX_CACHE_ST_I64) { + MESSAGE(LOG_DUMP, "\t - Convert %s\n", getCacheName(cache.vmxcache[i].t, cache.vmxcache[i].n)); + FCTIDZ(i, i); + cache.vmxcache[i].t = VMX_CACHE_ST_I64; + } else if (cache.vmxcache[i].t == VMX_CACHE_ST_F && cache_i2.vmxcache[i].t == VMX_CACHE_ST_I64) { + MESSAGE(LOG_DUMP, "\t - Convert %s\n", getCacheName(cache.vmxcache[i].t, cache.vmxcache[i].n)); + // ST_F is double in FPR on PPC64LE, so FCTIDZ works directly + FCTIDZ(i, i); + cache.vmxcache[i].t = VMX_CACHE_ST_I64; + } else if (cache.vmxcache[i].t == VMX_CACHE_ST_I64 && cache_i2.vmxcache[i].t == VMX_CACHE_ST_F) { + MESSAGE(LOG_DUMP, "\t - Convert %s\n", getCacheName(cache.vmxcache[i].t, cache.vmxcache[i].n)); + FCFID(i, i); // int64 -> double + FRSP(i, i); // double -> single (round) + cache.vmxcache[i].t = VMX_CACHE_ST_F; + } else if (cache.vmxcache[i].t == VMX_CACHE_ST_I64 && cache_i2.vmxcache[i].t == VMX_CACHE_ST_D) { + MESSAGE(LOG_DUMP, "\t - Convert %s\n", getCacheName(cache.vmxcache[i].t, cache.vmxcache[i].n)); + FCFID(i, i); + cache.vmxcache[i].t = VMX_CACHE_ST_D; + } else if (cache.vmxcache[i].t == VMX_CACHE_XMMR && cache_i2.vmxcache[i].t == VMX_CACHE_XMMW) { + cache.vmxcache[i].t = VMX_CACHE_XMMW; + } else if (cache.vmxcache[i].t == VMX_CACHE_YMMR && cache_i2.vmxcache[i].t == VMX_CACHE_YMMW) { + cache.vmxcache[i].t = VMX_CACHE_YMMW; + } else if (cache.vmxcache[i].t == VMX_CACHE_YMMW && cache_i2.vmxcache[i].t == VMX_CACHE_YMMR) { + // refresh cache... 
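+ // (the jump target expects a read-only YMM here: store the possibly-dirty lower 128 bits back to xmm[n] first, so memory stays coherent across the W -> R downgrade)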
+ MESSAGE(LOG_DUMP, "\t - Refreh %s\n", getCacheName(cache.vmxcache[i].t, cache.vmxcache[i].n)); + // Store lower 128 to xmm[n]; upper 128 is already in ymm[n] memory + STXV(VSXREG(i), offsetof(x64emu_t, xmm[cache.vmxcache[i].n]), xEmu); + cache.vmxcache[i].t = VMX_CACHE_YMMR; + } else if (cache.vmxcache[i].t == VMX_CACHE_XMMW && cache_i2.vmxcache[i].t == VMX_CACHE_XMMR) { + // refresh cache... + MESSAGE(LOG_DUMP, "\t - Refreh %s\n", getCacheName(cache.vmxcache[i].t, cache.vmxcache[i].n)); + STXV(VSXREG(i), offsetof(x64emu_t, xmm[cache.vmxcache[i].n]), xEmu); + cache.vmxcache[i].t = VMX_CACHE_XMMR; + } + } + } + } + if (stack_cnt != cache_i2.stack) { + MESSAGE(LOG_DUMP, "\t - adjust stack count %d -> %d -\n", stack_cnt, cache_i2.stack); + int a = stack_cnt - cache_i2.stack; + // Add x87stack to emu fpu_stack + LWZ(s3, offsetof(x64emu_t, fpu_stack), xEmu); + ADDI(s3, s3, a); + STW(s3, offsetof(x64emu_t, fpu_stack), xEmu); + // Sub x87stack to top, with and 7 + LWZ(s3, offsetof(x64emu_t, top), xEmu); + ADDI(s3, s3, -a); + ANDId(s3, s3, 7); + STW(s3, offsetof(x64emu_t, top), xEmu); + // update tags + LHA(s2, offsetof(x64emu_t, fpu_tags), xEmu); + if (a > 0) { + SLDI(s2, s2, a * 2); + } else { + MOV32w(s3, 0xffff0000); + OR(s2, s2, s3); + SRDI(s2, s2, -a * 2); + } + STH(s2, offsetof(x64emu_t, fpu_tags), xEmu); + s3_top = 0; + stack_cnt = cache_i2.stack; + } + MESSAGE(LOG_DUMP, "\t---- Cache Transform\n"); +} + +static void flagsCacheTransform(dynarec_ppc64le_t* dyn, int ninst, int s1) +{ + int j64; + int jmp = dyn->insts[ninst].x64.jmp_insts; + if (jmp < 0) + return; + if (dyn->insts[jmp].f_exit == dyn->insts[jmp].f_entry) // flags will be fully known, nothing we can do more + return; + MESSAGE(LOG_DUMP, "\tFlags fetch ---- ninst=%d -> %d\n", ninst, jmp); + int go_fetch = 0; + switch (dyn->insts[jmp].f_entry) { + case status_unk: + if (dyn->insts[ninst].f_exit == status_none_pending) { + FORCE_DFNONE(); + } + break; + case status_set: + if (dyn->insts[ninst].f_exit == status_none_pending) { + FORCE_DFNONE(); + } + if (dyn->insts[ninst].f_exit == status_unk) + go_fetch = 1; + break; + case status_none_pending: + if (dyn->insts[ninst].f_exit != status_none) + go_fetch = 1; + break; + case status_none: + if (dyn->insts[ninst].f_exit == status_none_pending) { + FORCE_DFNONE(); + } else + go_fetch = 1; + break; + } + if (go_fetch) { + if (dyn->f == status_unk) { + LWZ(s1, offsetof(x64emu_t, df), xEmu); + CMPDI(s1, 0); + j64 = (GETMARKF2) - (dyn->native_size); + BEQ(j64); + } + CALL_(const_updateflags, -1, 0, 0, 0); + MARKF2; + } + MESSAGE(LOG_DUMP, "\t---- Flags fetch\n"); +} + +void CacheTransform(dynarec_ppc64le_t* dyn, int ninst, int cacheupd, int s1, int s2, int s3) +{ + if (cacheupd & 2) + fpuCacheTransform(dyn, ninst, s1, s2, s3); + if (cacheupd & 1) + flagsCacheTransform(dyn, ninst, s1); +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_helper.h b/src/dynarec/ppc64le/dynarec_ppc64le_helper.h new file mode 100644 index 0000000000..e0ebe8d4a3 --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_helper.h @@ -0,0 +1,2673 @@ +#ifndef __DYNAREC_PPC64LE_HELPER_H__ +#define __DYNAREC_PPC64LE_HELPER_H__ + +// undef to get Close to SSE Float->int conversions +// #define PRECISE_CVT + +#if STEP == 0 +#include "dynarec_ppc64le_pass0.h" +#elif STEP == 1 +#include "dynarec_ppc64le_pass1.h" +#elif STEP == 2 +#include "dynarec_ppc64le_pass2.h" +#elif STEP == 3 +#include "dynarec_ppc64le_pass3.h" +#endif + +#include "debug.h" +#include "ppc64le_emitter.h" +#include "../emu/x64primop.h" +#include 
"dynarec_ppc64le_consts.h" + +// DQ_ALIGN: OR this into the i12 parameter of geted() to require 16-byte +// alignment for fixedaddress (needed for DQ-form instructions like LXV/STXV). +// Without this flag, geted() only requires 4-byte alignment (for D-form LD/STD). +// +// Usage: pass DQ_ALIGN|1 as the i12 argument to geted() whenever the resulting +// fixedaddress will be used as the displacement for LXV or STXV. The flag is +// stripped inside geted() and only affects the align_mask (15 vs 3). +// +// WHY THIS EXISTS: +// PPC64LE DQ-form instructions (LXV/STXV) silently truncate the low 4 bits of +// the displacement — see the detailed explanation in ppc64le_emitter.h at the +// DQ-form section. Without DQ_ALIGN, geted() would accept displacements like +// 0x24 (4-byte aligned) as valid inline immediates, but LXV would encode them +// as 0x20, loading from the wrong address with no error. +// +// WHERE TO USE: +// - GETEX, GETEYx, GETEYy, GETEYx_empty, GETEYy_empty macros (already correct) +// - Direct MOVAPS/MOVAPD/MOVUPS/MOVUPD/MOVDQA/MOVDQU load/store callsites +// - Any new opcode that calls geted() and then uses fixedaddress with LXV/STXV +// +// WHERE NOT NEEDED: +// - GETEXSD, GETEYSD (use LD, which is D-form — 4-byte alignment is fine) +// - GETEXSS, GETEYSS (use LWZ, which is D-form) +// - Any geted() call where fixedaddress is used with LD/STD/LWZ/STW +#define DQ_ALIGN 0x100 + +#define F8 *(uint8_t*)(addr++) +#define F8S *(int8_t*)(addr++) +#define F16 *(uint16_t*)(addr += 2, addr - 2) +#define F16S *(int16_t*)(addr += 2, addr - 2) +#define F32 *(uint32_t*)(addr += 4, addr - 4) +#define F32S *(int32_t*)(addr += 4, addr - 4) +#define F32S64 (uint64_t)(int64_t)F32S +#define F64 *(uint64_t*)(addr += 8, addr - 8) +#define PK(a) *(uint8_t*)(addr + a) +#define PK16(a) *(uint16_t*)(addr + a) +#define PK32(a) *(uint32_t*)(addr + a) +#define PK64(a) *(uint64_t*)(addr + a) +#define PKip(a) *(uint8_t*)(ip + a) + +// LOCK_* define +#define LOCK_LOCK (int*)1 + +// AVX width constants +#define VMX_AVX_WIDTH_128 0 +#define VMX_AVX_WIDTH_256 1 + +// ======================================================================== +// Convenience macros for PPC64LE (equivalent to LA64's LDxw, SDxw, etc.) +// ======================================================================== +// LDxw: load 32 or 64 bits depending on rex.w +// For 64-bit (LD), offset must be 4-aligned (DS-form); geted() guarantees this when i12=1. +#define LDxw(Rt, Ra, offset) \ + if (rex.w) { LD(Rt, offset, Ra); } else { LWZ(Rt, offset, Ra); } +// LDz: load 32 or 64 bits depending on rex.is32bits +#define LDz(Rt, Ra, offset) \ + if (rex.is32bits) { LWZ(Rt, offset, Ra); } else { LD(Rt, offset, Ra); } +// SDxw: store 32 or 64 bits depending on rex.w +// For 64-bit (STD), offset must be 4-aligned (DS-form); geted() guarantees this when i12=1. 
+#define SDxw(Rs, Ra, offset) \ + if (rex.w) { STD(Rs, offset, Ra); } else { STW(Rs, offset, Ra); } +// SDz: store 32 or 64 bits depending on rex.is32bits +#define SDz(Rs, Ra, offset) \ + if (rex.is32bits) { STW(Rs, offset, Ra); } else { STD(Rs, offset, Ra); } + +// SDXxw: indexed store 32 or 64 bits +#define SDXxw(Rs, Ra, Rb) \ + if (rex.w) { STDX(Rs, Ra, Rb); } else { STWX(Rs, Ra, Rb); } + +// ADDxw: add then optionally zero upper 32 bits +#define ADDxw(Rt, Ra, Rb) \ + ADD(Rt, Ra, Rb); if (!rex.w) { RLDICL(Rt, Rt, 0, 32); } +// ADDz: add based on is32bits +#define ADDz(Rt, Ra, Rb) \ + ADD(Rt, Ra, Rb); if (rex.is32bits) { RLDICL(Rt, Rt, 0, 32); } +// SUBxw: sub then optionally zero upper 32 bits +#define SUBxw(Rt, Ra, Rb) \ + SUB(Rt, Ra, Rb); if (!rex.w) { RLDICL(Rt, Rt, 0, 32); } +// ADDIxw: addi then optionally zero upper 32 bits +#define ADDIxw(Rt, Ra, imm) \ + ADDI(Rt, Ra, imm); if (!rex.w) { RLDICL(Rt, Rt, 0, 32); } +// ADDIz: addi then optionally zero upper 32 bits (based on is32bits) +#define ADDIz(Rt, Ra, imm) \ + ADDI(Rt, Ra, imm); if (rex.is32bits) { RLDICL(Rt, Rt, 0, 32); } + +// ZEROUP: clear upper 32 bits +#define ZEROUP(Rd) RLDICL(Rd, Rd, 0, 32) + +// ZEROUP2: clear upper 32 bits (different src/dst) +#define ZEROUP2(Rd, Rs) RLDICL(Rd, Rs, 0, 32) + +// SEXT_W: sign extend 32-bit to 64-bit +#define SEXT_W(Rd, Rs) EXTSW(Rd, Rs) + +// MVxw: move, optionally zero upper 32 bits +#define MVxw(Rd, Rs) \ + if (rex.w) { MV(Rd, Rs); } else { RLDICL(Rd, Rs, 0, 32); } + +// MVz: move based on is32bits +#define MVz(Rd, Rs) \ + do { \ + if (rex.is32bits) { \ + ZEROUP2(Rd, Rs); \ + } else { \ + MV(Rd, Rs); \ + } \ + } while (0) + +// MOV64y: load constant 32 or 64 depending on is32bits/is67 +#define MOV64y(A, B) \ + do { \ + if (rex.is32bits || rex.is67) { \ + MOV32w(A, B); \ + } else { \ + MOV64x(A, B); \ + } \ + } while (0) + +// SLLIxw: shift left immediate, conditional 32/64 based on rex.w +#define SLLIxw(Rd, Rs, imm) \ + do { \ + if (rex.w) { \ + SLDI(Rd, Rs, imm); \ + } else { \ + SLWI(Rd, Rs, imm); \ + ZEROUP(Rd); \ + } \ + } while (0) + +// SRLIxw: shift right logical immediate, conditional 32/64 based on rex.w +#define SRLIxw(Rd, Rs, imm) \ + do { \ + if (rex.w) { \ + SRDI(Rd, Rs, imm); \ + } else { \ + SRWI(Rd, Rs, imm); \ + ZEROUP(Rd); \ + } \ + } while (0) + +// SRAIxw: shift right arithmetic immediate, conditional 32/64 based on rex.w +#define SRAIxw(Rd, Rs, imm) \ + do { \ + if (rex.w) { \ + SRADI(Rd, Rs, imm); \ + } else { \ + SRAWI(Rd, Rs, imm); \ + ZEROUP(Rd); \ + } \ + } while (0) + +// SLLxw: variable shift left, conditional 32/64 based on rex.w +#define SLLxw(Rd, Rs, Rb) \ + do { \ + if (rex.w) { \ + SLD(Rd, Rs, Rb); \ + } else { \ + SLW(Rd, Rs, Rb); \ + ZEROUP(Rd); \ + } \ + } while (0) + +// SRLxw: variable shift right logical, conditional 32/64 based on rex.w +#define SRLxw(Rd, Rs, Rb) \ + do { \ + if (rex.w) { \ + SRD(Rd, Rs, Rb); \ + } else { \ + SRW(Rd, Rs, Rb); \ + ZEROUP(Rd); \ + } \ + } while (0) + +// SRAxw: variable shift right arithmetic, conditional 32/64 based on rex.w +#define SRAxw(Rd, Rs, Rb) \ + do { \ + if (rex.w) { \ + SRAD(Rd, Rs, Rb); \ + } else { \ + SRAW(Rd, Rs, Rb); \ + ZEROUP(Rd); \ + } \ + } while (0) + +// SLLIy: shift left, zero-extend if 32-bit mode +#define SLLIy(Rd, Rs, imm) \ + do { \ + if (rex.is32bits || rex.is67) { \ + SLWI(Rd, Rs, imm); \ + ZEROUP(Rd); \ + } else \ + SLDI(Rd, Rs, imm); \ + } while (0) + +// ALSLy: add-shift-left (rd = (rj << imm) + rk), conditional 32/64 +// PPC64LE has no ALSL_D, so: if imm==0 then ADD, else SLDI+ADD 
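+// (illustrative: ALSLy(x1, x2, x3, 2) in 64-bit mode emits SLDI x1,x2,2 then ADD x1,x1,x3;
+// the 32-bit path uses SLWI and appends a ZEROUP to reproduce x86 32-bit address truncation)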
+// IMPORTANT: When rd==rk, shift into r0 (scratch) first to avoid clobbering rk. +// This includes the case rd==rj==rk (e.g. LEA rax,[rax+rax*4] => rax = rax*5). +// r0 is safe in X-form instructions (SLDI, SLWI, ADD) — only D-form base treats r0 as literal 0. +#define ALSLy(rd, rj, rk, imm) \ + do { \ + if ((imm) == 0) { \ + if (rex.is32bits || rex.is67) { \ + ADD(rd, rj, rk); \ + ZEROUP(rd); \ + } else { \ + ADD(rd, rj, rk); \ + } \ + } else if ((rd) == (rk)) { \ + /* rd==rk: shift rj into r0 to avoid clobbering rk */ \ + if (rex.is32bits || rex.is67) { \ + SLWI(0, rj, imm); \ + ADD(rd, 0, rk); \ + ZEROUP(rd); \ + } else { \ + SLDI(0, rj, imm); \ + ADD(rd, 0, rk); \ + } \ + } else { \ + if (rex.is32bits || rex.is67) { \ + SLWI(rd, rj, imm); \ + ADD(rd, rd, rk); \ + ZEROUP(rd); \ + } else { \ + SLDI(rd, rj, imm); \ + ADD(rd, rd, rk); \ + } \ + } \ + } while (0) + +// ADDy: add, conditional 32/64 +#define ADDy(rd, rj, rk) \ + do { \ + if (rex.is32bits || rex.is67) { \ + ADD(rd, rj, rk); \ + ZEROUP(rd); \ + } else \ + ADD(rd, rj, rk); \ + } while (0) + +// ADDIy: addi, conditional 32/64 +#define ADDIy(rd, rj, imm) \ + do { \ + if (rex.is32bits || rex.is67) { \ + ADDI(rd, rj, imm); \ + ZEROUP(rd); \ + } else \ + ADDI(rd, rj, imm); \ + } while (0) + +// ADDxREGy: add, zero-extending rk to 32-bit first if needed (for seg+reg in 32-bit mode) +#define ADDxREGy(rd, rj, rk, s1) \ + do { \ + if (rex.is32bits || rex.is67) { \ + ZEROUP2(s1, rk); \ + ADD(rd, rj, s1); \ + } else \ + ADD(rd, rj, rk); \ + } while (0) + +// BSTRPICK_D equivalent: extract bits hi:lo from Rs into Ra +// RLDICL(Ra, Rs, (64-lo)%64, 64-(hi-lo+1)) +#define BSTRPICK_D(Ra, Rs, hi, lo) \ + RLDICL(Ra, Rs, (64-(lo))%64, 64-((hi)-(lo)+1)) + +// BSTRINS_D equivalent: insert low bits of Rs into Ra at position hi:lo +// RLDIMI(Ra, Rs, lo, 63-hi) +// IMPORTANT: When Rs == xZR (r0), r0 is NOT a zero register on PPC64LE. +// We load 0 into x4 scratch and use that instead. +#define BSTRINS_D(Ra, Rs, hi, lo) \ + do { \ + if ((Rs) == xZR) { \ + LI(x4, 0); \ + RLDIMI(Ra, x4, lo, 63-(hi)); \ + } else { \ + RLDIMI(Ra, Rs, lo, 63-(hi)); \ + } \ + } while (0) + +// ======================================================================== +// GETGD get x64 register in gd +// ======================================================================== +#define GETGD gd = TO_NAT(((nextop & 0x38) >> 3) + (rex.r << 3)); +// GETVD get x64 register in vd +#define GETVD vd = TO_NAT(vex.v) + +// GETGW extract x64 register in gd, that is i +#define GETGW(i) \ + gd = TO_NAT(((nextop & 0x38) >> 3) + (rex.r << 3)); \ + BSTRPICK_D(i, gd, 15, 0); \ + gd = i; + +// GETED can use r1 for ed, and r2 for wback. wback is 0 if ed is xEAX..xEDI +#define GETED(D) \ + if (MODREG) { \ + ed = TO_NAT((nextop & 7) + (rex.b << 3)); \ + wback = 0; \ + } else { \ + SMREAD(); \ + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, D); \ + LDxw(x1, wback, fixedaddress); \ + ed = x1; \ + } + +#define GETEDz(D) \ + if (MODREG) { \ + ed = TO_NAT((nextop & 7) + (rex.b << 3)); \ + wback = 0; \ + } else { \ + SMREAD(); \ + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, D); \ + LDz(x1, wback, fixedaddress); \ + ed = x1; \ + } + +// GETEDH can use hint for wback and ret for ed. 
wback is 0 if ed is xEAX..xEDI +#define GETEDH(hint, ret, D) \ + if (MODREG) { \ + ed = TO_NAT((nextop & 7) + (rex.b << 3)); \ + wback = 0; \ + } else { \ + SMREAD(); \ + addr = geted(dyn, addr, ninst, nextop, &wback, hint, ret, &fixedaddress, rex, NULL, 1, D); \ + ed = ret; \ + LDxw(ed, wback, fixedaddress); \ + } +#define GETEDW(hint, ret, D) \ + if (MODREG) { \ + ed = TO_NAT((nextop & 7) + (rex.b << 3)); \ + MV(ret, ed); \ + wback = 0; \ + } else { \ + SMREAD(); \ + addr = geted(dyn, addr, ninst, nextop, &wback, hint, ret, &fixedaddress, rex, NULL, 0, D); \ + ed = ret; \ + LDxw(ed, wback, fixedaddress); \ + } +// GETEWW will use i for ed, and can use w for wback. +#define GETEWW(w, i, D) \ + if (MODREG) { \ + wback = TO_NAT((nextop & 7) + (rex.b << 3)); \ + BSTRPICK_D(i, wback, 15, 0); \ + ed = i; \ + wb1 = 0; \ + } else { \ + SMREAD(); \ + addr = geted(dyn, addr, ninst, nextop, &wback, w, i, &fixedaddress, rex, NULL, 1, D); \ + LHZ(i, fixedaddress, wback); \ + ed = i; \ + wb1 = 1; \ + } +// GETEW will use i for ed, and can use r3 for wback. +#define GETEW(i, D) GETEWW(x3, i, D) + +#define GETGWEW(i, j, D) \ + GETEW(j, D); \ + if (MODREG) { \ + gd = TO_NAT(((nextop & 0x38) >> 3) + (rex.r << 3)); \ + if (gd == wback) \ + gd = ed; \ + else { \ + GETGW(i); \ + } \ + } else { \ + GETGW(i); \ + } + +// GETSED can use r1 for ed, and r2 for wback. ed will be sign extended! +#define GETSED(D) \ + if (MODREG) { \ + ed = TO_NAT((nextop & 7) + (rex.b << 3)); \ + wback = 0; \ + if (!rex.w) { \ + EXTSW(x1, ed); \ + ed = x1; \ + } \ + } else { \ + SMREAD(); \ + addr = geted(dyn, addr, ninst, nextop, &wback, x2, x1, &fixedaddress, rex, NULL, 1, D); \ + if (rex.w) \ + LD(x1, fixedaddress, wback); \ + else \ + LWA(x1, fixedaddress, wback); \ + ed = x1; \ + } + +// FAKEED like GETED, but doesn't get anything +#define FAKEED \ + if (MODREG) { \ + ed = TO_NAT((nextop & 7) + (rex.b << 3)); \ + wback = 0; \ + } else { \ + addr = fakeed(dyn, addr, ninst, nextop); \ + } + +// GETSGW extract x64 register in gd, that is i, sign extended +#define GETSGW(i) \ + gd = TO_NAT(((nextop & 0x38) >> 3) + (rex.r << 3)); \ + EXTSH(i, gd); \ + gd = i; + +// Write back ed in wback (if wback not 0) +#define WBACK \ + if (wback) { \ + SDxw(ed, wback, fixedaddress); \ + SMWRITE(); \ + } + +#define WBACKO(O) \ + if (wback) { \ + if (rex.is32bits) { \ + ADDz(O, wback, O); \ + STW(ed, 0, O); \ + } else { \ + SDXxw(ed, wback, O); \ + } \ + SMWRITE2(); \ + } + +// GETSEW will use i for ed, and can use r3 for wback. This is the Signed version +#define GETSEW(i, D) \ + if (MODREG) { \ + wback = TO_NAT((nextop & 7) + (rex.b << 3)); \ + EXTSH(i, wback); \ + ed = i; \ + wb1 = 0; \ + } else { \ + SMREAD(); \ + addr = geted(dyn, addr, ninst, nextop, &wback, x3, i, &fixedaddress, rex, NULL, 1, D); \ + LHA(i, fixedaddress, wback); \ + ed = i; \ + wb1 = 1; \ + } +// Write w back to original register / memory (w needs to be 16bits only!) +#define EWBACKW(w) \ + if (wb1) { \ + STH(w, fixedaddress, wback); \ + SMWRITE(); \ + } else { \ + BSTRINS_D(wback, w, 15, 0); \ + } +// Write ed back to original register / memory +#define EWBACK EWBACKW(ed) + +// Write back gd in correct register +#define GWBACK BSTRINS_D((TO_NAT(((nextop & 0x38) >> 3) + (rex.r << 3))), gd, 15, 0); + +// GETEB will use i for ed, and can use r3 for wback. 
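+// (without a REX prefix, ModRM encodings 4-7 address AH/CH/DH/BH, so wb2 = (wback >> 2) * 8
+// selects the bit offset inside the base register given by wback & 3)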
+#define GETEB(i, D) \ + if (MODREG) { \ + if (rex.rex) { \ + wback = TO_NAT((nextop & 7) + (rex.b << 3)); \ + wb2 = 0; \ + } else { \ + wback = (nextop & 7); \ + wb2 = (wback >> 2) * 8; \ + wback = TO_NAT((wback & 3)); \ + } \ + BSTRPICK_D(i, wback, wb2 + 7, wb2); \ + wb1 = 0; \ + ed = i; \ + } else { \ + SMREAD(); \ + addr = geted(dyn, addr, ninst, nextop, &wback, x3, x2, &fixedaddress, rex, NULL, 1, D); \ + LBZ(i, fixedaddress, wback); \ + wb1 = 1; \ + ed = i; \ + } +// GETSEB sign extend EB, will use i for ed, and can use r3 for wback. +#define GETSEB(i, D) \ + if (MODREG) { \ + if (rex.rex) { \ + wback = TO_NAT((nextop & 7) + (rex.b << 3)); \ + wb2 = 0; \ + } else { \ + wback = (nextop & 7); \ + wb2 = (wback >> 2) * 8; \ + wback = TO_NAT(wback & 3); \ + } \ + if (wb2) { \ + SRDI(i, wback, wb2); \ + EXTSB(i, i); \ + } else { \ + EXTSB(i, wback); \ + } \ + wb1 = 0; \ + ed = i; \ + } else { \ + SMREAD(); \ + addr = geted(dyn, addr, ninst, nextop, &wback, x3, x2, &fixedaddress, rex, NULL, 1, D); \ + LBZ(i, fixedaddress, wback); \ + EXTSB(i, i); \ + wb1 = 1; \ + ed = i; \ + } +// GETEB32 will use i for ed, and can use r3 for wback. +#define GETEB32(i, D) \ + if (MODREG) { \ + if (rex.rex) { \ + wback = TO_NAT((nextop & 7) + (rex.b << 3)); \ + wb2 = 0; \ + } else { \ + wback = (nextop & 7); \ + wb2 = (wback >> 2) * 8; \ + wback = TO_NAT(wback & 3); \ + } \ + BSTRPICK_D(i, wback, wb2 + 7, wb2); \ + wb1 = 0; \ + ed = i; \ + } else { \ + SMREAD(); \ + addr = geted32(dyn, addr, ninst, nextop, &wback, x3, x2, &fixedaddress, rex, NULL, 1, D); \ + LBZ(i, fixedaddress, wback); \ + wb1 = 1; \ + ed = i; \ + } + +// GETGB will use i for gd +#define GETGB(i) \ + if (rex.rex) { \ + gb1 = TO_NAT(((nextop & 0x38) >> 3) + (rex.r << 3)); \ + gb2 = 0; \ + } else { \ + gd = (nextop & 0x38) >> 3; \ + gb2 = ((gd & 4) << 1); \ + gb1 = TO_NAT((gd & 3)); \ + } \ + gd = i; \ + BSTRPICK_D(gd, gb1, gb2 + 7, gb2); + +#define GETGBEB(i, j, D) \ + GETEB(j, D); \ + if (MODREG) { \ + if (rex.rex) { \ + gb1 = TO_NAT(((nextop & 0x38) >> 3) + (rex.r << 3)); \ + gb2 = 0; \ + } else { \ + gd = (nextop & 0x38) >> 3; \ + gb2 = ((gd & 4) << 1); \ + gb1 = TO_NAT((gd & 3)); \ + } \ + if (gb1 == wback && gb2 == wb2) \ + gd = ed; \ + else { \ + GETGB(i); \ + } \ + } else { \ + GETGB(i); \ + } + +// Get GX as a quad (might use x1) +#define GETGX(a, w) \ + gd = ((nextop & 0x38) >> 3) + (rex.r << 3); \ + a = sse_get_reg(dyn, ninst, x1, gd, w) + + +#define GETGX_empty(a) \ + gd = ((nextop & 0x38) >> 3) + (rex.r << 3); \ + a = sse_get_reg_empty(dyn, ninst, x1, gd) + +// Get EX as a quad, (x1 is used) +#define GETEX(a, w, D) \ + if (MODREG) { \ + a = sse_get_reg(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), w); \ + } else { \ + SMREAD(); \ + addr = geted(dyn, addr, ninst, nextop, &ed, x3, x2, &fixedaddress, rex, NULL, DQ_ALIGN|1, D); \ + a = fpu_get_scratch(dyn); \ + LXV(VSXREG(a), fixedaddress, ed); \ + } + +// Put Back EX if it was a memory and not an emm register +#define PUTEX(a) \ + if (!MODREG) { \ + STXV(VSXREG(a), fixedaddress, ed); \ + SMWRITE2(); \ + } + +// Get Ex as a double, not a quad (warning, x1 get used, x2 might too) +#define GETEXSD(a, w, D) \ + if (MODREG) { \ + a = sse_get_reg(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), w); \ + } else { \ + SMREAD(); /* TODO */ \ + a = fpu_get_scratch(dyn); \ + addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 1, D); \ + LD(x4, fixedaddress, ed); \ + MTVSRD(VSXREG(a), x4); \ + } + +// Get Ex as 64bits, not a quad (warning, x1 get used, x2 might too) +#define 
GETEX64(a, w, D) GETEXSD(a, w, D) + +// Get Ex as a single, not a quad (warning, x1 get used) +#define GETEXSS(a, w, D) \ + if (MODREG) { \ + a = sse_get_reg(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), w); \ + } else { \ + SMREAD(); \ + a = fpu_get_scratch(dyn); \ + addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 1, D); \ + LWZ(x4, fixedaddress, ed); \ + SLDI(x4, x4, 32); \ + MTVSRD(VSXREG(a), x4); \ + XSCVSPDPN(VSXREG(a), VSXREG(a)); \ + } + +// Get Ex as 32bits, not a quad (warning, x1 get used) +#define GETEX32(a, w, D) GETEXSS(a, w, D) + +// Get Ex as 16bits, not a quad (warning, x1 get used) +#define GETEX16(a, w, D) \ + if (MODREG) { \ + a = sse_get_reg(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), w); \ + } else { \ + SMREAD(); \ + a = fpu_get_scratch(dyn); \ + addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 1, D); \ + LHZ(x2, fixedaddress, ed); \ + /* TODO: move x2 to FPU reg a */ \ + } + +// Get GM, might use x1, x2 and x3 +#define GETGM(a) \ + gd = ((nextop & 0x38) >> 3); \ + a = mmx_get_reg(dyn, ninst, x1, x2, x3, gd) + +// Get EM, might use x1, x2 and x3 +#define GETEM(a, D) \ + if (MODREG) { \ + a = mmx_get_reg(dyn, ninst, x1, x2, x3, (nextop & 7)); \ + } else { \ + SMREAD(); \ + addr = geted(dyn, addr, ninst, nextop, &ed, x1, x2, &fixedaddress, rex, NULL, 1, D); \ + a = fpu_get_scratch(dyn); \ + LD(x3, fixedaddress, ed); \ + MTVSRDD(VSXREG(a), xZR, x3); \ + } + +// Put Back Em if it was a memory and not an emm register +#define PUTEM(a) \ + if (!MODREG) { \ + MFVSRLD(x3, VSXREG(a)); \ + STD(x3, fixedaddress, ed); \ + SMWRITE2(); \ + } + +// Write gb (gd) back to original register / memory, using s1 as scratch +#define GBBACK() BSTRINS_D(gb1, gd, gb2 + 7, gb2); + +// Generic get GD, but reg value in gd (R_RAX is not added) +#define GETG gd = ((nextop & 0x38) >> 3) + (rex.r << 3) + +// Write eb (ed) back to original register / memory, using s1 as scratch +#define EBBACK() \ + if (wb1) { \ + STB(ed, fixedaddress, wback); \ + SMWRITE(); \ + } else { \ + BSTRINS_D(wback, ed, wb2 + 7, wb2); \ + } + +#define YMM_UNMARK_UPPER_ZERO(a) \ + do { \ + dyn->v.avxcache[a].zero_upper = 0; \ + } while (0) + +// AVX helpers +// Get VX (might use x1) +#define GETVYx(a, w) \ + a = avx_get_reg(dyn, ninst, x1, vex.v, w, VMX_AVX_WIDTH_128) + +#define GETVYy(a, w) \ + a = avx_get_reg(dyn, ninst, x1, vex.v, w, VMX_AVX_WIDTH_256) + +// Get an empty VX (use x1) +#define GETVYx_empty(a) \ + a = avx_get_reg_empty(dyn, ninst, x1, vex.v, VMX_AVX_WIDTH_128) + +#define GETVYy_empty(a) \ + a = avx_get_reg_empty(dyn, ninst, x1, vex.v, VMX_AVX_WIDTH_256) + +// Get GX as a quad (might use x1) +#define GETGYx(a, w) \ + gd = ((nextop & 0x38) >> 3) + (rex.r << 3); \ + a = avx_get_reg(dyn, ninst, x1, gd, w, VMX_AVX_WIDTH_128) + +#define GETGYy(a, w) \ + gd = ((nextop & 0x38) >> 3) + (rex.r << 3); \ + a = avx_get_reg(dyn, ninst, x1, gd, w, VMX_AVX_WIDTH_256) + +#define GETGYx_empty(a) \ + gd = ((nextop & 0x38) >> 3) + (rex.r << 3); \ + a = avx_get_reg_empty(dyn, ninst, x1, gd, VMX_AVX_WIDTH_128) + +#define GETGYy_empty(a) \ + gd = ((nextop & 0x38) >> 3) + (rex.r << 3); \ + a = avx_get_reg_empty(dyn, ninst, x1, gd, VMX_AVX_WIDTH_256) + +// Get EY as a quad, (x1 is used) +#define GETEYx(a, w, D) \ + if (MODREG) { \ + a = avx_get_reg(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), w, VMX_AVX_WIDTH_128); \ + } else { \ + SMREAD(); \ + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, DQ_ALIGN|1, D); \ + a = fpu_get_scratch(dyn); \ + 
LXV(VSXREG(a), fixedaddress, ed); \ + } + +#define GETEYy(a, w, D) \ + if (MODREG) { \ + a = avx_get_reg(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), w, VMX_AVX_WIDTH_256); \ + } else { \ + SMREAD(); \ + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, DQ_ALIGN|1, D); \ + a = fpu_get_scratch(dyn); \ + /* lower 128 bits; upper 128 must be loaded inline by each 256-bit opcode */ \ + LXV(VSXREG(a), fixedaddress, ed); \ + } + +#define GETEYx_empty(a, D) \ + if (MODREG) { \ + a = avx_get_reg_empty(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), VMX_AVX_WIDTH_128); \ + } else { \ + SMREAD(); \ + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, DQ_ALIGN|1, D); \ + a = fpu_get_scratch(dyn); \ + } + +#define GETEYy_empty(a, D) \ + if (MODREG) { \ + a = avx_get_reg_empty(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), VMX_AVX_WIDTH_256); \ + } else { \ + SMREAD(); \ + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, DQ_ALIGN|1, D); \ + a = fpu_get_scratch(dyn); \ + } + +// Get EY as 32bits, (x1 is used) +#define GETEYSS(a, w, D) \ + if (MODREG) { \ + a = avx_get_reg(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), w, VMX_AVX_WIDTH_128); \ + } else { \ + SMREAD(); \ + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, D); \ + a = fpu_get_scratch(dyn); \ + LWZ(x4, fixedaddress, ed); \ + SLDI(x4, x4, 32); \ + MTVSRD(VSXREG(a), x4); \ + XSCVSPDPN(VSXREG(a), VSXREG(a)); \ + } + +#define PUTEYSS(a) \ + if (!MODREG) { \ + MFVSRLD(x4, VSXREG(a)); \ + STW(x4, fixedaddress, ed); \ + SMWRITE2(); \ + } + +// Get EY as 64bits, (x1 is used) +#define GETEYSD(a, w, D) \ + if (MODREG) { \ + a = avx_get_reg(dyn, ninst, x1, (nextop & 7) + (rex.b << 3), w, VMX_AVX_WIDTH_128); \ + } else { \ + SMREAD(); \ + addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, NULL, 1, D); \ + a = fpu_get_scratch(dyn); \ + LD(x4, fixedaddress, ed); \ + MTVSRD(VSXREG(a), x4); \ + } + +#define PUTEYSD(a) \ + if (!MODREG) { \ + MFVSRLD(x4, VSXREG(a)); \ + STD(x4, fixedaddress, ed); \ + SMWRITE2(); \ + } + +#define GETGYxy(a, w) \ + if (vex.l) { \ + GETGYy(a, w); \ + } else { \ + GETGYx(a, w); \ + } + +#define GETGYxy_empty(a) \ + if (vex.l) { \ + GETGYy_empty(a); \ + } else { \ + GETGYx_empty(a); \ + } + +#define GETVYxy(a, w) \ + if (vex.l) { \ + GETVYy(a, w); \ + } else { \ + GETVYx(a, w); \ + } + +#define GETVYxy_empty(a) \ + if (vex.l) { \ + GETVYy_empty(a); \ + } else { \ + GETVYx_empty(a); \ + } + +#define GETEYxy(a, w, D) \ + if (vex.l) { \ + GETEYy(a, w, D); \ + } else { \ + GETEYx(a, w, D); \ + } + +#define GETEYxy_empty(a, D) \ + if (vex.l) { \ + GETEYy_empty(a, D); \ + } else { \ + GETEYx_empty(a, D); \ + } + +// Put Back EY if it was a memory and not an emm register +#define PUTEYy(a) \ + if (!MODREG) { \ + /* lower 128 bits; upper 128 must be stored inline by each 256-bit opcode */ \ + STXV(VSXREG(a), fixedaddress, ed); \ + SMWRITE2(); \ + } + +#define PUTEYx(a) \ + if (!MODREG) { \ + STXV(VSXREG(a), fixedaddress, ed); \ + SMWRITE2(); \ + } + +#define PUTEYxy(a) \ + if (vex.l) { \ + PUTEYy(a); \ + } else { \ + PUTEYx(a); \ + } + +// Get empty GY, and non-written VY and EY +#define GETGY_empty_VYEY_xy(gx, vx, ex, D) \ + GETVYxy(vx, 0); \ + GETEYxy(ex, 0, D); \ + GETGYxy_empty(gx); + +// Get empty GY, and non-written EY +#define GETGY_empty_EY_xy(gx, ex, D) \ + GETEYxy(ex, 0, D); \ + GETGYxy_empty(gx); + +// Get writable GY, and non-written VY and EY +#define GETGY_VYEY_xy(gx, vx, ex, D) \ + GETVYxy(vx, 
0); \ + GETEYxy(ex, 0, D); \ + GETGYxy(gx, 1); + +// Get writable GY, and non-written EY +#define GETGY_EY_xy(gx, ex, D) \ + GETEYxy(ex, 0, D); \ + GETGYxy(gx, 1); + +// Get writable EY, and non-written VY and GY +#define GETEY_VYGY_xy(ex, vx, gx, D) \ + GETVYxy(vx, 0); \ + GETGYxy(gx, 0); \ + GETEYxy(ex, 1, D); + +// Get writable EY, and non-written GY +#define GETEY_GY_xy(ex, gx, D) \ + GETGYxy(gx, 0); \ + GETEYxy(ex, 1, D); + +// Get writable GYx, and non-written VYx, EYSD or EYSS, for FMA SD/SS insts. +#define GETGYx_VYx_EYxw(gx, vx, ex, D) \ + GETVYx(vx, 0); \ + if (rex.w) { \ + GETEYSD(ex, 0, D) \ + } else { \ + GETEYSS(ex, 0, D); \ + } \ + GETGYx(gx, 1); + +// Get direction with size Z and based on the F_DF flag, on register r ready for load/store fetching +// using s as scratch. +#define GETDIR(r, s, Z) \ + MOV32w(r, Z); /* mask=1<<10 */ \ + ANDId(s, xFlags, 1 << F_DF); \ + BEQ(4 + 4); \ + NEG(r, r); + +// CALL will use x6 for the call address. Return value can be put in ret (unless ret is -1) +// R0 will not be pushed/popped if ret is -2 +#define CALL(F, ret, arg1, arg2) call_c(dyn, ninst, F, x6, ret, 1, 0, arg1, arg2, 0, 0, 0, 0) +#define CALL4(F, ret, arg1, arg2, arg3, arg4) call_c(dyn, ninst, F, x6, ret, 1, 0, arg1, arg2, arg3, arg4, 0, 0) +#define CALL6(F, ret, arg1, arg2, arg3, arg4, arg5, arg6) call_c(dyn, ninst, F, x6, ret, 1, 0, arg1, arg2, arg3, arg4, arg5, arg6) +// CALL_ will use x6 for the call address. Return value can be put in ret (unless ret is -1) +// R0 will not be pushed/popped if ret is -2 +#define CALL_(F, ret, reg, arg1, arg2) call_c(dyn, ninst, F, x6, ret, 1, reg, arg1, arg2, 0, 0, 0, 0) +#define CALL4_(F, ret, reg, arg1, arg2, arg3, arg4) call_c(dyn, ninst, F, x6, ret, 1, reg, arg1, arg2, arg3, arg4, 0, 0) +// CALL_S will use x6 for the call address. Return value can be put in ret (unless ret is -1) +// R0 will not be pushed/popped if ret is -2. Flags are not saved/restored +#define CALL_S(F, ret, arg1) call_c(dyn, ninst, F, x6, ret, 0, 0, arg1, 0, 0, 0, 0, 0) + +// ======================================================================== +// MARK / GETMARK system +// ======================================================================== +#define MARKi(i) dyn->insts[ninst].mark[i] = dyn->native_size +#define GETMARKi(i) dyn->insts[ninst].mark[i] +#define MARK MARKi(0) +#define GETMARK GETMARKi(0) +#define MARK2 MARKi(1) +#define GETMARK2 GETMARKi(1) +#define MARK3 MARKi(2) +#define GETMARK3 GETMARKi(2) + +#define MARKFi(i) dyn->insts[ninst].markf[i] = dyn->native_size +#define GETMARKFi(i) dyn->insts[ninst].markf[i] +#define MARKF MARKFi(0) +#define GETMARKF GETMARKFi(0) +#define MARKF2 MARKFi(1) +#define GETMARKF2 GETMARKFi(1) + +#define MARKSEG dyn->insts[ninst].markseg = dyn->native_size +#define GETMARKSEG dyn->insts[ninst].markseg +#define MARKLOCK dyn->insts[ninst].marklock = dyn->native_size +#define GETMARKLOCK dyn->insts[ninst].marklock +#define MARKLOCK2 dyn->insts[ninst].marklock2 = dyn->native_size +#define GETMARKLOCK2 dyn->insts[ninst].marklock2 + +// ======================================================================== +// Branch-to-mark macros for PPC64LE +// ======================================================================== +// PPC64LE needs compare + branch (2 instructions) for register comparisons, +// but can use direct conditional branches on CR0 after ANDId. +// For reg-reg comparison: CMPD(reg1, reg2); BC(BO, BI, offset) +// The offset must be computed AFTER the compare instruction is emitted. 
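+// For example (illustrative only): BNE_MARK(x1, x2) below emits a cmpd on CR0 followed
+// by a bne whose displacement is GETMARK - dyn->native_size. The subtraction runs after
+// the compare has been emitted, so the displacement is measured from the branch
+// instruction itself, which is what PPC64 B-form branches encode.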
+ +// Bxx_gen: compare two registers, then branch +// Uses CMPD_ZR to handle xZR (r0) safely — see note at CMPD_ZR definition +#define Bxx_gen(COND, M, reg1, reg2) \ + CMPD_ZR(reg1, reg2); \ + j64 = GET##M - dyn->native_size; \ + B##COND(j64) + +// BxxZ_gen: compare register with zero, then branch +#define BxxZ_gen(COND, M, reg) \ + CMPDI(reg, 0); \ + j64 = GET##M - dyn->native_size; \ + B##COND(j64) + +// Branch to MARK if reg1!=reg2 (use j64) +#define BNE_MARK(reg1, reg2) Bxx_gen(NE, MARK, reg1, reg2) +// Branch to MARK2 if reg1!=reg2 (use j64) +#define BNE_MARK2(reg1, reg2) Bxx_gen(NE, MARK2, reg1, reg2) +// Branch to MARK3 if reg1!=reg2 (use j64) +#define BNE_MARK3(reg1, reg2) Bxx_gen(NE, MARK3, reg1, reg2) +// Branch to MARKLOCK if reg1!=reg2 (use j64) +#define BNE_MARKLOCK(reg1, reg2) Bxx_gen(NE, MARKLOCK, reg1, reg2) +// Branch to MARKLOCK2 if reg1!=reg2 (use j64) +#define BNE_MARKLOCK2(reg1, reg2) Bxx_gen(NE, MARKLOCK2, reg1, reg2) + +// Branch to MARK if reg1==reg2 (use j64) +#define BEQ_MARK(reg1, reg2) Bxx_gen(EQ, MARK, reg1, reg2) +// Branch to MARK2 if reg1==reg2 (use j64) +#define BEQ_MARK2(reg1, reg2) Bxx_gen(EQ, MARK2, reg1, reg2) +// Branch to MARK3 if reg1==reg2 (use j64) +#define BEQ_MARK3(reg1, reg2) Bxx_gen(EQ, MARK3, reg1, reg2) +// Branch to MARKLOCK if reg1==reg2 (use j64) +#define BEQ_MARKLOCK(reg1, reg2) Bxx_gen(EQ, MARKLOCK, reg1, reg2) +// Branch to MARKLOCK2 if reg1==reg2 (use j64) +#define BEQ_MARKLOCK2(reg1, reg2) Bxx_gen(EQ, MARKLOCK2, reg1, reg2) + +// Branch to MARK if reg1==0 (use j64) +#define BEQZ_MARK(reg) BxxZ_gen(EQ, MARK, reg) +// Branch to MARK2 if reg1==0 (use j64) +#define BEQZ_MARK2(reg) BxxZ_gen(EQ, MARK2, reg) +// Branch to MARK3 if reg1==0 (use j64) +#define BEQZ_MARK3(reg) BxxZ_gen(EQ, MARK3, reg) +// Branch to MARKLOCK if reg1==0 (use j64) +#define BEQZ_MARKLOCK(reg) BxxZ_gen(EQ, MARKLOCK, reg) +// Branch to MARKLOCK2 if reg1==0 (use j64) +#define BEQZ_MARKLOCK2(reg) BxxZ_gen(EQ, MARKLOCK2, reg) + +// Branch to MARK if reg1!=0 (use j64) +#define BNEZ_MARK(reg) BxxZ_gen(NE, MARK, reg) +// Branch to MARK2 if reg1!=0 (use j64) +#define BNEZ_MARK2(reg) BxxZ_gen(NE, MARK2, reg) +// Branch to MARK3 if reg1!=0 (use j64) +#define BNEZ_MARK3(reg) BxxZ_gen(NE, MARK3, reg) +// Branch to MARKLOCK if reg1!=0 (use j64) +#define BNEZ_MARKLOCK(reg) BxxZ_gen(NE, MARKLOCK, reg) +// Branch to MARKLOCK2 if reg1!=0 (use j64) +#define BNEZ_MARKLOCK2(reg) BxxZ_gen(NE, MARKLOCK2, reg) + +// Branch to MARK if reg1<reg2 (use j64) +#define BLT_MARK(reg1, reg2) \ + CMPD_ZR(reg1, reg2); \ + j64 = GETMARK - dyn->native_size; \ + BLT(j64) +// Branch to MARK if reg1>=reg2 (use j64) +#define BGE_MARK(reg1, reg2) Bxx_gen(GE, MARK, reg1, reg2) +// Branch to MARK2 if reg1>=reg2 (use j64) +#define BGE_MARK2(reg1, reg2) Bxx_gen(GE, MARK2, reg1, reg2) +// Branch to MARK3 if reg1>=reg2 (use j64) +#define BGE_MARK3(reg1, reg2) Bxx_gen(GE, MARK3, reg1, reg2) + +// Branch to MARKLOCK on CR0.EQ=0 (for STWCX./STDCX. retry loops) +#define BNE_MARKLOCK_CR0 \ + j64 = GETMARKLOCK - dyn->native_size; \ + BNE(j64) +// Branch to MARKLOCK2 on CR0.EQ=0 (for STWCX./STDCX. retry loops) +#define BNE_MARKLOCK2_CR0 \ + j64 = GETMARKLOCK2 - dyn->native_size; \ + BNE(j64) +// Branch to MARK2 on CR0.EQ=0 (for STWCX./STDCX. 
retry loops) +#define BNE_MARK2_CR0 \ + j64 = GETMARK2 - dyn->native_size; \ + BNE(j64) + +// Branch to MARK unconditional (use j64) +#define B_MARK_nocond \ + j64 = GETMARK - dyn->native_size; \ + B(j64) +// Branch to MARK2 unconditional (use j64) +#define B_MARK2_nocond \ + j64 = GETMARK2 - dyn->native_size; \ + B(j64) +// Branch to MARK3 unconditional (use j64) +#define B_MARK3_nocond \ + j64 = GETMARK3 - dyn->native_size; \ + B(j64) + +// Branch to NEXT if reg1==0 (use j64) +#define CBZ_NEXT(reg1) \ + CMPDI(reg1, 0); \ + j64 = (dyn->insts) ? (dyn->insts[ninst].epilog - (dyn->native_size)) : 0; \ + BEQ(j64) +// Branch to NEXT if reg1!=0 (use j64) +#define CBNZ_NEXT(reg1) \ + CMPDI(reg1, 0); \ + j64 = (dyn->insts) ? (dyn->insts[ninst].epilog - (dyn->native_size)) : 0; \ + BNE(j64) +#define B_NEXT_nocond \ + j64 = (dyn->insts) ? (dyn->insts[ninst].epilog - (dyn->native_size)) : 0; \ + B(j64) + +// Branch to MARKSEG if reg is 0 (use j64) +#define CBZ_MARKSEG(reg) \ + CMPDI(reg, 0); \ + j64 = GETMARKSEG - (dyn->native_size); \ + BEQ(j64); +// Branch to MARKSEG if reg is not 0 (use j64) +#define CBNZ_MARKSEG(reg) \ + CMPDI(reg, 0); \ + j64 = GETMARKSEG - (dyn->native_size); \ + BNE(j64); + +// ======================================================================== +// IFX / flags macros (undef pass0 versions that use set_flags, +// redefine for passes 1-3 using gen_flags) +// ======================================================================== +#undef IFX +#undef IFX_PENDOR0 +#undef IFXX +#undef IFX2X +#undef IFXN +#define IFX(A) if ((dyn->insts[ninst].x64.gen_flags & (A))) +#define IFXA(A, B) if ((dyn->insts[ninst].x64.gen_flags & (A)) && (B)) +#define IFXORNAT(A) if ((dyn->insts[ninst].x64.gen_flags & (A)) || dyn->insts[ninst].nat_flags_fusion) +#define IFX_PENDOR0 if ((dyn->insts[ninst].x64.gen_flags & (X_PEND) || (!dyn->insts[ninst].x64.gen_flags && !dyn->insts[ninst].nat_flags_fusion))) +#define IFXX(A) if ((dyn->insts[ninst].x64.gen_flags == (A))) +#define IFX2X(A, B) if ((dyn->insts[ninst].x64.gen_flags == (A) || dyn->insts[ninst].x64.gen_flags == (B) || dyn->insts[ninst].x64.gen_flags == ((A) | (B)))) +#define IFXN(A, B) if ((dyn->insts[ninst].x64.gen_flags & (A) && !(dyn->insts[ninst].x64.gen_flags & (B)))) + +#ifndef NATIVE_RESTORE_X87PC +#define NATIVE_RESTORE_X87PC() \ + if (dyn->need_x87check) { \ + LD(x87pc, offsetof(x64emu_t, cw), xEmu); \ + SRDI(x87pc, x87pc, 8); \ + ANDId(x87pc, x87pc, 0b11); \ + } +#endif +#ifndef X87_CHECK_PRECISION +#define X87_CHECK_PRECISION(A) \ + if (!ST_IS_F(0) && dyn->need_x87check) { \ + CMPDI(x87pc, 0); \ + BNE(4 + 4); \ + FRSP(A, A); \ + } +#endif + +// STORE_REG / LOAD_REG — PPC64LE operand order: STD(Rs, offset, Ra) +#define STORE_REG(A) STD(x##A, offsetof(x64emu_t, regs[_##A]), xEmu) +#define LOAD_REG(A) LD(x##A, offsetof(x64emu_t, regs[_##A]), xEmu) + +// Need to also store current value of some register, as they may be used by functions like setjmp +#define STORE_XEMU_CALL() \ + STORE_REG(RBX); \ + STORE_REG(RSP); \ + STORE_REG(RBP); \ + STORE_REG(R10); \ + STORE_REG(R11); \ + STORE_REG(R12); \ + STORE_REG(R13); \ + STORE_REG(R14); \ + STORE_REG(R15); + +#define LOAD_XEMU_CALL() + +#define LOAD_XEMU_REM() \ + LOAD_REG(RBX); \ + LOAD_REG(RSP); \ + LOAD_REG(RBP); \ + LOAD_REG(R10); \ + LOAD_REG(R11); \ + LOAD_REG(R12); \ + LOAD_REG(R13); \ + LOAD_REG(R14); \ + LOAD_REG(R15); + +// PPC64LE has no xZR register that can be stored — must use LI+STW for zero stores +// Use x4 (r6) as scratch instead of x1 (r3) since x1 often holds live 'ed' 
values +#define FORCE_DFNONE() \ + do { \ + LI(x4, 0); \ + STW(x4, offsetof(x64emu_t, df), xEmu); \ + } while (0) + +#define CHECK_DFNONE(N) \ + do { \ + if (dyn->f == status_none_pending) { \ + FORCE_DFNONE(); \ + if (N) dyn->f = status_none; \ + } \ + } while (0) + +#define SET_DFNONE() \ + do { \ + if (dyn->f != status_none) { \ + dyn->f = status_none_pending; \ + } \ + } while (0) + +#define SET_DF(S, N) \ + if ((N) != d_none) { \ + MOV32w(S, (N)); \ + STW(S, offsetof(x64emu_t, df), xEmu); \ + dyn->f = status_set; \ + } else \ + SET_DFNONE() + +#define CLEAR_FLAGS_(s) \ + MOV64x(s, (1UL << F_AF) | (1UL << F_CF) | (1UL << F_OF) | (1UL << F_ZF) | (1UL << F_SF) | (1UL << F_PF)); \ + ANDC(xFlags, xFlags, s); + +#define CLEAR_FLAGS(s) \ + IFX (X_ALL) { CLEAR_FLAGS_(s) } + +#define CALC_SUB_FLAGS(op1_, op2, res, scratch1, scratch2, width) \ + IFX (X_AF | X_CF | X_OF) { \ + /* calc borrow chain */ \ + /* bc = (res & (~op1 | op2)) | (~op1 & op2) */ \ + OR(scratch1, op1_, op2); \ + AND(scratch2, res, scratch1); \ + AND(op1_, op1_, op2); \ + OR(scratch2, scratch2, op1_); \ + IFX (X_AF) { \ + ANDId(scratch1, scratch2, 8); \ + SLDI(scratch1, scratch1, F_AF - 3); \ + OR(xFlags, xFlags, scratch1); \ + } \ + IFX (X_CF) { \ + /* cf = bc & (1<<(width-1)) */ \ + if ((width) == 8) { \ + ANDId(scratch1, scratch2, 0x80); \ + } else { \ + SRDI(scratch1, scratch2, (width) - 1); \ + if ((width) != 64) ANDId(scratch1, scratch1, 1); \ + } \ + CMPDI(scratch1, 0); \ + BEQ(8); \ + ORI(xFlags, xFlags, 1 << F_CF); \ + } \ + IFX (X_OF) { \ + /* of = ((bc >> (width-2)) ^ (bc >> (width-1))) & 0x1; */ \ + SRDI(scratch1, scratch2, (width) - 2); \ + SRDI(scratch2, scratch1, 1); \ + XOR(scratch1, scratch1, scratch2); \ + ANDId(scratch1, scratch1, 1); \ + CMPDI(scratch1, 0); \ + BEQ(8); \ + ORI(xFlags, xFlags, 1 << F_OF); \ + } \ + } + +// ======================================================================== +// X87 push/pop macros +// ======================================================================== +#if STEP == 0 +#define X87_PUSH_OR_FAIL(var, dyn, ninst, scratch, t) var = x87_do_push(dyn, ninst, scratch, t) +#define X87_PUSH_EMPTY_OR_FAIL(dyn, ninst, scratch) x87_do_push_empty(dyn, ninst, scratch) +#define X87_POP_OR_FAIL(dyn, ninst, scratch) x87_do_pop(dyn, ninst, scratch) +#else +#define X87_PUSH_OR_FAIL(var, dyn, ninst, scratch, t) \ + if ((dyn->v.x87stack == 8) || (dyn->v.pushed == 8)) { \ + if (dyn->need_dump) dynarec_log(LOG_NONE, " Warning, suspicious x87 Push, stack=%d/%d on inst %d\n", dyn->v.x87stack, dyn->v.pushed, ninst); \ + dyn->abort = 1; \ + return addr; \ + } \ + var = x87_do_push(dyn, ninst, scratch, t); + +#define X87_PUSH_EMPTY_OR_FAIL(dyn, ninst, scratch) \ + if ((dyn->v.x87stack == 8) || (dyn->v.pushed == 8)) { \ + if (dyn->need_dump) dynarec_log(LOG_NONE, " Warning, suspicious x87 Push, stack=%d/%d on inst %d\n", dyn->v.x87stack, dyn->v.pushed, ninst); \ + dyn->abort = 1; \ + return addr; \ + } \ + x87_do_push_empty(dyn, ninst, scratch); + +#define X87_POP_OR_FAIL(dyn, ninst, scratch) \ + if ((dyn->v.x87stack == -8) || (dyn->v.poped == 8)) { \ + if (dyn->need_dump) dynarec_log(LOG_NONE, " Warning, suspicious x87 Pop, stack=%d/%d on inst %d\n", dyn->v.x87stack, dyn->v.poped, ninst); \ + dyn->abort = 1; \ + return addr; \ + } \ + x87_do_pop(dyn, ninst, scratch); +#endif + +// ======================================================================== +// READFLAGS / SETFLAGS / GRABFLAGS +// ======================================================================== +#ifndef READFLAGS +#define 
READFLAGS(A) \ + if ((A) != X_PEND \ + && (dyn->f == status_unk)) { \ + CALL_(const_updateflags, -1, 0, 0, 0); \ + dyn->f = status_none; \ + } +#endif + +#ifndef READFLAGS_FUSION +#define READFLAGS_FUSION(A, s1, s2, s3, s4, s5) \ + if (dyn->insts[ninst].nat_flags_fusion) \ + get_free_scratch(dyn, ninst, &tmp1, &tmp2, &tmp3, s1, s2, s3, s4, s5); \ + else { \ + tmp1 = s1; \ + tmp2 = s2; \ + tmp3 = s3; \ + } \ + READFLAGS(A) +#endif + +#define NAT_FLAGS_OPS(op1, op2, s1, s2) \ + do { \ + dyn->insts[dyn->insts[ninst].nat_next_inst].nat_flags_op1 = op1; \ + dyn->insts[dyn->insts[ninst].nat_next_inst].nat_flags_op2 = op2; \ + if (dyn->insts[ninst + 1].no_scratch_usage && IS_GPR(op1)) { \ + MV(s1, op1); \ + dyn->insts[dyn->insts[ninst].nat_next_inst].nat_flags_op1 = s1; \ + } \ + if (dyn->insts[ninst + 1].no_scratch_usage && IS_GPR(op2)) { \ + MV(s2, op2); \ + dyn->insts[dyn->insts[ninst].nat_next_inst].nat_flags_op2 = s2; \ + } \ + } while (0) + +#define NAT_FLAGS_ENABLE_CARRY() dyn->insts[ninst].nat_flags_carry = 1 +#define NAT_FLAGS_ENABLE_SIGN() dyn->insts[ninst].nat_flags_sign = 1 +#define NAT_FLAGS_ENABLE_SF() dyn->insts[ninst].nat_flags_sf = 1 + +#define GRABFLAGS(A) \ + if ((A) != X_PEND \ + && ((dyn->f == status_unk) || (dyn->f == status_set))) { \ + CALL_(const_updateflags, -1, 0, 0, 0); \ + dyn->f = status_none; \ + } + +#ifndef SETFLAGS +#define SETFLAGS(A, B, FUSION) \ + do { \ + if (((B) & SF_SUB) \ + && (dyn->insts[ninst].x64.gen_flags & (~(A)))) \ + GRABFLAGS(((dyn->insts[ninst].x64.gen_flags & X_PEND) ? X_ALL : dyn->insts[ninst].x64.gen_flags) & (~(A))); \ + if (dyn->insts[ninst].x64.gen_flags) switch (B) { \ + case SF_SET_DF: dyn->f = status_set; break; \ + case SF_SET_NODF: SET_DFNONE(); break; \ + case SF_SUBSET: \ + case SF_SUBSET_PENDING: \ + case SF_SET: \ + case SF_PENDING: \ + case SF_SET_PENDING: \ + SET_DFNONE(); \ + break; \ + } \ + else \ + SET_DFNONE(); \ + dyn->insts[ninst].nat_flags_nofusion = (FUSION); \ + } while (0) +#endif +#ifndef JUMP +#define JUMP(A, C) +#endif +#ifndef BARRIER +#define BARRIER(A) +#endif +#ifndef SET_HASCALLRET +#define SET_HASCALLRET() +#endif + +// UFLAG macros +#define UFLAG_OP1(A) \ + if (dyn->insts[ninst].x64.gen_flags) { SDxw(A, xEmu, offsetof(x64emu_t, op1)); } +#define UFLAG_OP2(A) \ + if (dyn->insts[ninst].x64.gen_flags) { SDxw(A, xEmu, offsetof(x64emu_t, op2)); } +#define UFLAG_OP12(A1, A2) \ + if (dyn->insts[ninst].x64.gen_flags) { \ + SDxw(A1, xEmu, offsetof(x64emu_t, op1)); \ + SDxw(A2, xEmu, offsetof(x64emu_t, op2)); \ + } +#define UFLAG_RES(A) \ + if (dyn->insts[ninst].x64.gen_flags) { SDxw(A, xEmu, offsetof(x64emu_t, res)); } +#define UFLAG_DF(r, A) \ + if (dyn->insts[ninst].x64.gen_flags) { SET_DF(r, A); } +#undef UFLAG_IF +#undef UFLAG_IF2 +#define UFLAG_IF if (dyn->insts[ninst].x64.gen_flags) +#ifndef DEFAULT +#define DEFAULT \ + *ok = -1; \ + BARRIER(2) +#endif + +#ifndef CALLRET_LOOP +#define CALLRET_LOOP() NOP() +#endif + +#ifndef TABLE64 +#define TABLE64(A, V) +#endif +#ifndef FTABLE64 +#define FTABLE64(A, V) +#endif +#ifndef TABLE64C +#define TABLE64C(A, V) +#endif + +#ifndef TABLE64_ +#define TABLE64_(A, V) +#endif + +#define ARCH_INIT() SMSTART() + +#define ARCH_RESET() + +// ======================================================================== +// GETIP +// ======================================================================== +#if STEP < 2 +#define GETIP(A, scratch) +#define GETIP_(A, scratch) +#else + +// PPC64LE ADDI has 16-bit signed immediate (±32768), wider than LA64's 12-bit +#define GETIP_(A, scratch) 
\ + do { \ + ssize_t _delta_ip = (ssize_t)(A) - (ssize_t)dyn->last_ip; \ + if (!dyn->last_ip) { \ + if (dyn->need_reloc) { \ + TABLE64(xRIP, (A)); \ + } else { \ + MOV64x(xRIP, (A)); \ + } \ + } else if (_delta_ip == 0) { \ + } else if (_delta_ip >= -32768 && _delta_ip < 32768) { \ + ADDI(xRIP, xRIP, _delta_ip); \ + } else if (_delta_ip < 0 && _delta_ip >= -0xffffffffL) { \ + MOV32w(scratch, -_delta_ip); \ + SUB(xRIP, xRIP, scratch); \ + } else if (_delta_ip > 0 && _delta_ip <= 0xffffffffL) { \ + MOV32w(scratch, _delta_ip); \ + ADD(xRIP, xRIP, scratch); \ + } else { \ + if (dyn->need_reloc) { \ + TABLE64(xRIP, (A)); \ + } else { \ + MOV64x(xRIP, (A)); \ + } \ + } \ + } while (0) +#define GETIP(A, scratch) \ + GETIP_(A, scratch); \ + dyn->last_ip = (A); + +#endif +#define CLEARIP() dyn->last_ip = 0 + +// ======================================================================== +// PASS2IF / MODREG +// ======================================================================== +#if STEP < 2 +#define PASS2IF(A, B) if (A) +#elif STEP == 2 +#define PASS2IF(A, B) \ + if (A) dyn->insts[ninst].pass2choice = B; \ + if (dyn->insts[ninst].pass2choice == B) +#else +#define PASS2IF(A, B) if (dyn->insts[ninst].pass2choice == B) +#endif + +#define MODREG ((nextop & 0xC0) == 0xC0) + +// ======================================================================== +// STEPNAME mangling +// ======================================================================== +#ifndef STEPNAME +#define STEPNAME3(N, M) N##M +#define STEPNAME2(N, M) STEPNAME3(N, M) +#define STEPNAME(N) STEPNAME2(N, STEP) +#endif + +#define native_pass STEPNAME(native_pass) + +#define dynarec64_00 STEPNAME(dynarec64_00) +#define dynarec64_0F STEPNAME(dynarec64_0F) +#define dynarec64_66 STEPNAME(dynarec64_66) +#define dynarec64_F30F STEPNAME(dynarec64_F30F) +#define dynarec64_660F STEPNAME(dynarec64_660F) +#define dynarec64_66F0 STEPNAME(dynarec64_66F0) +#define dynarec64_66F20F STEPNAME(dynarec64_66F20F) +#define dynarec64_66F30F STEPNAME(dynarec64_66F30F) +#define dynarec64_F0 STEPNAME(dynarec64_F0) +#define dynarec64_F20F STEPNAME(dynarec64_F20F) +#define dynarec64_AVX STEPNAME(dynarec64_AVX) +#define dynarec64_AVX_0F STEPNAME(dynarec64_AVX_0F) +#define dynarec64_AVX_0F38 STEPNAME(dynarec64_AVX_0F38) +#define dynarec64_AVX_66_0F STEPNAME(dynarec64_AVX_66_0F) +#define dynarec64_AVX_66_0F38 STEPNAME(dynarec64_AVX_66_0F38) +#define dynarec64_AVX_66_0F3A STEPNAME(dynarec64_AVX_66_0F3A) +#define dynarec64_AVX_F2_0F STEPNAME(dynarec64_AVX_F2_0F) +#define dynarec64_AVX_F2_0F38 STEPNAME(dynarec64_AVX_F2_0F38) +#define dynarec64_AVX_F2_0F3A STEPNAME(dynarec64_AVX_F2_0F3A) +#define dynarec64_AVX_F3_0F STEPNAME(dynarec64_AVX_F3_0F) +#define dynarec64_AVX_F3_0F38 STEPNAME(dynarec64_AVX_F3_0F38) + +#define dynarec64_D8 STEPNAME(dynarec64_D8) +#define dynarec64_D9 STEPNAME(dynarec64_D9) +#define dynarec64_DA STEPNAME(dynarec64_DA) +#define dynarec64_DB STEPNAME(dynarec64_DB) +#define dynarec64_DC STEPNAME(dynarec64_DC) +#define dynarec64_DD STEPNAME(dynarec64_DD) +#define dynarec64_DE STEPNAME(dynarec64_DE) +#define dynarec64_DF STEPNAME(dynarec64_DF) +#define dynarec64_F0 STEPNAME(dynarec64_F0) + +#define geted STEPNAME(geted) +#define geted16 STEPNAME(geted16) +#define jump_to_epilog STEPNAME(jump_to_epilog) +#define jump_to_epilog_fast STEPNAME(jump_to_epilog_fast) +#define jump_to_next STEPNAME(jump_to_next) +#define ret_to_next STEPNAME(ret_to_next) +#define iret_to_next STEPNAME(iret_to_next) +#define call_c STEPNAME(call_c) +#define call_n 
STEPNAME(call_n) +#define grab_segdata STEPNAME(grab_segdata) +#define emit_adc16 STEPNAME(emit_adc16) +#define emit_adc32 STEPNAME(emit_adc32) +#define emit_adc8 STEPNAME(emit_adc8) +#define emit_adc8c STEPNAME(emit_adc8c) +#define emit_add16 STEPNAME(emit_add16) +#define emit_add32 STEPNAME(emit_add32) +#define emit_add32c STEPNAME(emit_add32c) +#define emit_add8 STEPNAME(emit_add8) +#define emit_add8c STEPNAME(emit_add8c) +#define emit_and16 STEPNAME(emit_and16) +#define emit_and32 STEPNAME(emit_and32) +#define emit_and32c STEPNAME(emit_and32c) +#define emit_and8 STEPNAME(emit_and8) +#define emit_and8c STEPNAME(emit_and8c) +#define emit_cmp16 STEPNAME(emit_cmp16) +#define emit_cmp16_0 STEPNAME(emit_cmp16_0) +#define emit_cmp32 STEPNAME(emit_cmp32) +#define emit_cmp32_0 STEPNAME(emit_cmp32_0) +#define emit_cmp8 STEPNAME(emit_cmp8) +#define emit_cmp8_0 STEPNAME(emit_cmp8_0) +#define emit_dec16 STEPNAME(emit_dec16) +#define emit_dec32 STEPNAME(emit_dec32) +#define emit_dec8 STEPNAME(emit_dec8) +#define emit_inc16 STEPNAME(emit_inc16) +#define emit_inc32 STEPNAME(emit_inc32) +#define emit_inc8 STEPNAME(emit_inc8) +#define emit_neg16 STEPNAME(emit_neg16) +#define emit_neg32 STEPNAME(emit_neg32) +#define emit_neg8 STEPNAME(emit_neg8) +#define emit_or16 STEPNAME(emit_or16) +#define emit_or32 STEPNAME(emit_or32) +#define emit_or32c STEPNAME(emit_or32c) +#define emit_or8 STEPNAME(emit_or8) +#define emit_or8c STEPNAME(emit_or8c) +#define emit_rcl16 STEPNAME(emit_rcl16) +#define emit_rcl16c STEPNAME(emit_rcl16c) +#define emit_rcl32 STEPNAME(emit_rcl32) +#define emit_rcl32c STEPNAME(emit_rcl32c) +#define emit_rcl8 STEPNAME(emit_rcl8) +#define emit_rcl8c STEPNAME(emit_rcl8c) +#define emit_rcr16 STEPNAME(emit_rcr16) +#define emit_rcr16c STEPNAME(emit_rcr16c) +#define emit_rcr32 STEPNAME(emit_rcr32) +#define emit_rcr32c STEPNAME(emit_rcr32c) +#define emit_rcr8 STEPNAME(emit_rcr8) +#define emit_rcr8c STEPNAME(emit_rcr8c) +#define emit_rol16 STEPNAME(emit_rol16) +#define emit_rol16c STEPNAME(emit_rol16c) +#define emit_rol32 STEPNAME(emit_rol32) +#define emit_rol32c STEPNAME(emit_rol32c) +#define emit_rol8 STEPNAME(emit_rol8) +#define emit_rol8c STEPNAME(emit_rol8c) +#define emit_ror16 STEPNAME(emit_ror16) +#define emit_ror16c STEPNAME(emit_ror16c) +#define emit_ror32 STEPNAME(emit_ror32) +#define emit_ror32c STEPNAME(emit_ror32c) +#define emit_ror8 STEPNAME(emit_ror8) +#define emit_ror8c STEPNAME(emit_ror8c) +#define emit_sar16 STEPNAME(emit_sar16) +#define emit_sar16c STEPNAME(emit_sar16c) +#define emit_sar32 STEPNAME(emit_sar32) +#define emit_sar32c STEPNAME(emit_sar32c) +#define emit_sar8 STEPNAME(emit_sar8) +#define emit_sar8c STEPNAME(emit_sar8c) +#define emit_sbb16 STEPNAME(emit_sbb16) +#define emit_sbb32 STEPNAME(emit_sbb32) +#define emit_sbb8 STEPNAME(emit_sbb8) +#define emit_sbb8c STEPNAME(emit_sbb8c) +#define emit_shl16 STEPNAME(emit_shl16) +#define emit_shl16c STEPNAME(emit_shl16c) +#define emit_shl32 STEPNAME(emit_shl32) +#define emit_shl32c STEPNAME(emit_shl32c) +#define emit_shl8 STEPNAME(emit_shl8) +#define emit_shl8c STEPNAME(emit_shl8c) +#define emit_shld16 STEPNAME(emit_shld16) +#define emit_shld16c STEPNAME(emit_shld16c) +#define emit_shld32 STEPNAME(emit_shld32) +#define emit_shld32c STEPNAME(emit_shld32c) +#define emit_shr16 STEPNAME(emit_shr16) +#define emit_shr16c STEPNAME(emit_shr16c) +#define emit_shr32 STEPNAME(emit_shr32) +#define emit_shr32c STEPNAME(emit_shr32c) +#define emit_shr8 STEPNAME(emit_shr8) +#define emit_shr8c STEPNAME(emit_shr8c) +#define emit_shrd16 
STEPNAME(emit_shrd16)
+#define emit_shrd16c STEPNAME(emit_shrd16c)
+#define emit_shrd32 STEPNAME(emit_shrd32)
+#define emit_shrd32c STEPNAME(emit_shrd32c)
+#define emit_sub16 STEPNAME(emit_sub16)
+#define emit_sub32 STEPNAME(emit_sub32)
+#define emit_sub32c STEPNAME(emit_sub32c)
+#define emit_sub8 STEPNAME(emit_sub8)
+#define emit_sub8c STEPNAME(emit_sub8c)
+#define emit_test16 STEPNAME(emit_test16)
+#define emit_test32 STEPNAME(emit_test32)
+#define emit_test32c STEPNAME(emit_test32c)
+#define emit_test8 STEPNAME(emit_test8)
+#define emit_test8c STEPNAME(emit_test8c)
+#define emit_xor16 STEPNAME(emit_xor16)
+#define emit_xor32 STEPNAME(emit_xor32)
+#define emit_xor32c STEPNAME(emit_xor32c)
+#define emit_xor8 STEPNAME(emit_xor8)
+#define emit_xor8c STEPNAME(emit_xor8c)
+
+#define emit_pf STEPNAME(emit_pf)
+
+#define x87_do_push STEPNAME(x87_do_push)
+#define x87_do_push_empty STEPNAME(x87_do_push_empty)
+#define x87_do_pop STEPNAME(x87_do_pop)
+#define x87_get_current_cache STEPNAME(x87_get_current_cache)
+#define x87_get_cache STEPNAME(x87_get_cache)
+#define x87_get_vmxcache STEPNAME(x87_get_vmxcache)
+#define x87_get_st STEPNAME(x87_get_st)
+#define x87_get_st_empty STEPNAME(x87_get_st_empty)
+#define x87_free STEPNAME(x87_free)
+#define x87_refresh STEPNAME(x87_refresh)
+#define x87_forget STEPNAME(x87_forget)
+#define x87_reget_st STEPNAME(x87_reget_st)
+#define x87_stackcount STEPNAME(x87_stackcount)
+#define x87_unstackcount STEPNAME(x87_unstackcount)
+#define x87_swapreg STEPNAME(x87_swapreg)
+#define x87_setround STEPNAME(x87_setround)
+#define x87_restoreround STEPNAME(x87_restoreround)
+#define x87_reflectcount STEPNAME(x87_reflectcount)
+#define x87_unreflectcount STEPNAME(x87_unreflectcount)
+#define x87_purgecache STEPNAME(x87_purgecache)
+
+#define sse_setround STEPNAME(sse_setround)
+#define mmx_get_reg STEPNAME(mmx_get_reg)
+#define mmx_get_reg_empty STEPNAME(mmx_get_reg_empty)
+#define sse_purge07cache STEPNAME(sse_purge07cache)
+#define sse_get_reg STEPNAME(sse_get_reg)
+#define sse_get_reg_empty STEPNAME(sse_get_reg_empty)
+#define sse_forget_reg STEPNAME(sse_forget_reg)
+#define sse_reflect_reg STEPNAME(sse_reflect_reg)
+
+#define avx_get_reg STEPNAME(avx_get_reg)
+#define avx_get_reg_empty STEPNAME(avx_get_reg_empty)
+#define avx_forget_reg STEPNAME(avx_forget_reg)
+#define avx_reflect_reg STEPNAME(avx_reflect_reg)
+#define avx_purgecache STEPNAME(avx_purgecache)
+#define avx_reflect_reg_upper128 STEPNAME(avx_reflect_reg_upper128)
+
+#define fpu_pushcache STEPNAME(fpu_pushcache)
+#define fpu_popcache STEPNAME(fpu_popcache)
+#define fpu_reset_cache STEPNAME(fpu_reset_cache)
+#define fpu_propagate_stack STEPNAME(fpu_propagate_stack)
+#define fpu_purgecache STEPNAME(fpu_purgecache)
+#define mmx_purgecache STEPNAME(mmx_purgecache)
+#define fpu_reflectcache STEPNAME(fpu_reflectcache)
+#define fpu_unreflectcache STEPNAME(fpu_unreflectcache)
+
+#define CacheTransform STEPNAME(CacheTransform)
+#define ppc64le_move64 STEPNAME(ppc64le_move64)
+#define ppc64le_move32 STEPNAME(ppc64le_move32)
+
+// ========================================================================
+// Function declarations
+// ========================================================================
+
+/* setup r2 to address pointed by */
+uintptr_t geted(dynarec_ppc64le_t* dyn, uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, uint8_t scratch, int64_t* fixaddress, rex_t rex, int* l, int i12, int delta);
+
+/* setup r2 to address pointed by */
+uintptr_t geted16(dynarec_ppc64le_t* dyn,
uintptr_t addr, int ninst, uint8_t nextop, uint8_t* ed, uint8_t hint, uint8_t scratch, int64_t* fixaddress, rex_t rex, int i12); + +// generic x64 helper +void jump_to_epilog(dynarec_ppc64le_t* dyn, uintptr_t ip, int reg, int ninst); +void jump_to_epilog_fast(dynarec_ppc64le_t* dyn, uintptr_t ip, int reg, int ninst); +void jump_to_next(dynarec_ppc64le_t* dyn, uintptr_t ip, int reg, int ninst, int is32bits); +void ret_to_next(dynarec_ppc64le_t* dyn, uintptr_t ip, int ninst, rex_t rex); +void iret_to_next(dynarec_ppc64le_t* dyn, uintptr_t ip, int ninst, int is32bits, int is64bits); +void call_c(dynarec_ppc64le_t* dyn, int ninst, ppc64le_consts_t fnc, int reg, int ret, int saveflags, int save_reg, int arg1, int arg2, int arg3, int arg4, int arg5, int arg6); +void call_n(dynarec_ppc64le_t* dyn, int ninst, void* fnc, int w); +void grab_segdata(dynarec_ppc64le_t* dyn, uintptr_t addr, int ninst, int reg, int segment); +void emit_adc16(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5); +void emit_adc32(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5, int s6); +void emit_adc8(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5); +void emit_adc8c(dynarec_ppc64le_t* dyn, int ninst, int s1, int32_t c, int s3, int s4, int s5, int s6); +void emit_add16(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5); +void emit_add32(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5); +void emit_add32c(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int64_t c, int s2, int s3, int s4, int s5); +void emit_add8(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4); +void emit_add8c(dynarec_ppc64le_t* dyn, int ninst, int s1, int32_t c, int s2, int s3, int s4); +void emit_and16(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4); +void emit_and32(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4); +void emit_and32c(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int64_t c, int s3, int s4); +void emit_and8(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4); +void emit_and8c(dynarec_ppc64le_t* dyn, int ninst, int s1, int32_t c, int s3, int s4); +void emit_cmp16_0(dynarec_ppc64le_t* dyn, int ninst, int s1, int s3, int s4); +void emit_cmp16(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5, int s6); +void emit_cmp32_0(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, uint8_t nextop, int s1, int s3, int s4, int s5); +void emit_cmp32(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5, int s6); +void emit_cmp8_0(dynarec_ppc64le_t* dyn, int ninst, int s1, int s3, int s4); +void emit_cmp8(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5, int s6); +void emit_dec16(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5); +void emit_dec32(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5); +void emit_dec8(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4); +void emit_inc16(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4); +void emit_inc32(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5); +void emit_inc8(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4); +void emit_neg16(dynarec_ppc64le_t* dyn, int ninst, int s1, int s3, int s4); +void emit_neg32(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int 
s1, int s2, int s3); +void emit_neg8(dynarec_ppc64le_t* dyn, int ninst, int s1, int s3, int s4); +void emit_or16(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4); +void emit_or32(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4); +void emit_or32c(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int64_t c, int s3, int s4); +void emit_or8(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4); +void emit_or8c(dynarec_ppc64le_t* dyn, int ninst, int s1, int32_t c, int s2, int s3, int s4); +void emit_rcl16(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5); +void emit_rcl16c(dynarec_ppc64le_t* dyn, int ninst, int s1, uint32_t c, int s3, int s4); +void emit_rcl32(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5); +void emit_rcl32c(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, int s3, int s4, int s5); +void emit_rcl8(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5); +void emit_rcl8c(dynarec_ppc64le_t* dyn, int ninst, int s1, uint32_t c, int s3, int s4); +void emit_rcr16(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5); +void emit_rcr16c(dynarec_ppc64le_t* dyn, int ninst, int s1, uint32_t c, int s3, int s4); +void emit_rcr32(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5); +void emit_rcr32c(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, int s3, int s4, int s5); +void emit_rcr8(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5); +void emit_rcr8c(dynarec_ppc64le_t* dyn, int ninst, int s1, uint32_t c, int s3, int s4); +void emit_rol16(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4); +void emit_rol16c(dynarec_ppc64le_t* dyn, int ninst, int s1, uint32_t c, int s3, int s4, int s5); +void emit_rol32(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4); +void emit_rol32c(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, int s3, int s4); +void emit_rol8(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4); +void emit_rol8c(dynarec_ppc64le_t* dyn, int ninst, int s1, uint32_t c, int s3, int s4, int s5); +void emit_ror16(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4); +void emit_ror16c(dynarec_ppc64le_t* dyn, int ninst, int s1, uint32_t c, int s3, int s4); +void emit_ror32(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4); +void emit_ror32c(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, int s3, int s4); +void emit_ror8(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4); +void emit_ror8c(dynarec_ppc64le_t* dyn, int ninst, int s1, uint32_t c, int s3, int s4); +void emit_sar16(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5); +void emit_sar16c(dynarec_ppc64le_t* dyn, int ninst, int s1, uint32_t c, int s3, int s4, int s5); +void emit_sar32(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4); +void emit_sar32c(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, int s3, int s4); +void emit_sar8(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5); +void emit_sar8c(dynarec_ppc64le_t* dyn, int ninst, int s1, uint32_t c, int s3, int s4); +void emit_sbb16(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5); +void emit_sbb32(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int s2, 
int s3, int s4, int s5); +void emit_sbb8(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5); +void emit_sbb8c(dynarec_ppc64le_t* dyn, int ninst, int s1, int32_t c, int s3, int s4, int s5, int s6); +void emit_shl16(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5); +void emit_shl16c(dynarec_ppc64le_t* dyn, int ninst, int s1, uint32_t c, int s3, int s4, int s5); +void emit_shl32(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5); +void emit_shl32c(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, int s3, int s4, int s5); +void emit_shl8(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5); +void emit_shl8c(dynarec_ppc64le_t* dyn, int ninst, int s1, uint32_t c, int s3, int s4); +void emit_shld16(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5, int s6); +void emit_shld16c(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int s2, uint32_t c, int s3, int s4, int s5); +void emit_shld32(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int s2, int s5, int s3, int s4, int s6); +void emit_shld32c(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int s2, uint32_t c, int s3, int s4); +void emit_shr16(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5); +void emit_shr16c(dynarec_ppc64le_t* dyn, int ninst, int s1, uint32_t c, int s3, int s4, int s5); +void emit_shr32(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4); +void emit_shr32c(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, uint32_t c, int s3, int s4); +void emit_shr8(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5); +void emit_shr8c(dynarec_ppc64le_t* dyn, int ninst, int s1, uint32_t c, int s3, int s4); +void emit_shrd16(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5, int s6); +void emit_shrd16c(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int s2, uint32_t c, int s3, int s4, int s5); +void emit_shrd32(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int s2, int s5, int s3, int s4, int s6); +void emit_shrd32c(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int s2, uint32_t c, int s3, int s4); +void emit_sub16(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5); +void emit_sub32(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5); +void emit_sub32c(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int64_t c, int s2, int s3, int s4, int s5); +void emit_sub8(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5); +void emit_sub8c(dynarec_ppc64le_t* dyn, int ninst, int s1, int32_t c, int s2, int s3, int s4, int s5); +void emit_test16(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5); +void emit_test32(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4, int s5); +void emit_test32c(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int64_t c, int s3, int s4, int s5); +void emit_test8(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5); +void emit_test8c(dynarec_ppc64le_t* dyn, int ninst, int s1, uint8_t c, int s3, int s4, int s5); +void emit_xor16(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4, int s5); +void emit_xor32(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int s2, int s3, int s4); +void emit_xor32c(dynarec_ppc64le_t* dyn, int ninst, rex_t rex, int s1, int64_t c, int s3, int 
s4); +void emit_xor8(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int s4); +void emit_xor8c(dynarec_ppc64le_t* dyn, int ninst, int s1, int32_t c, int s3, int s4); + +void emit_pf(dynarec_ppc64le_t* dyn, int ninst, int s1, int s3, int s4); + +// common coproc helpers + +// x87 helper +// cache of the local stack counter, to avoid update at every call +int x87_stackcount(dynarec_ppc64le_t* dyn, int ninst, int scratch); +// restore local stack counter +void x87_unstackcount(dynarec_ppc64le_t* dyn, int ninst, int scratch, int count); +// fpu push. Return the Dd value to be used +int x87_do_push(dynarec_ppc64le_t* dyn, int ninst, int s1, int t); +// fpu push. Do not allocate a cache register. Needs a scratch register to do x87stack synch (or 0 to not do it) +void x87_do_push_empty(dynarec_ppc64le_t* dyn, int ninst, int s1); +// fpu pop. All previous returned Dd should be considered invalid +void x87_do_pop(dynarec_ppc64le_t* dyn, int ninst, int s1); +// get cache index for a x87 reg, return -1 if cache doesn't exist +int x87_get_current_cache(dynarec_ppc64le_t* dyn, int ninst, int st, int t); +// get cache index for a x87 reg, create the entry if needed +int x87_get_cache(dynarec_ppc64le_t* dyn, int ninst, int populate, int s1, int s2, int a, int t); +// get extcache index for a x87 reg +int x87_get_vmxcache(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int a); +// get vfpu register for a x87 reg, create the entry if needed +int x87_get_st(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int a, int t); +// get vfpu register for a x87 reg, create the entry if needed. Do not fetch the Stx if not already in cache +int x87_get_st_empty(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int a, int t); +// Free st, using the FFREE opcode (so it's freed but stack is not moved) +void x87_free(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int st); +// refresh a value from the cache ->emu (nothing done if value is not cached) +void x87_refresh(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int st); +// refresh a value from the cache ->emu and then forget the cache (nothing done if value is not cached) +void x87_forget(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int st); +// refresh the cache value from emu +void x87_reget_st(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int st); +// swap 2 x87 regs +void x87_swapreg(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int a, int b); +// Set rounding according to cw flags, return reg to restore flags +int x87_setround(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2); +// Restore round flag +void x87_restoreround(dynarec_ppc64le_t* dyn, int ninst, int s1); +// Set rounding according to mxcsr flags, return reg to restore flags +void x87_reflectcount(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2); +void x87_unreflectcount(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2); +void x87_purgecache(dynarec_ppc64le_t* dyn, int ninst, int next, int s1, int s2, int s3); + +// reset the cache with n +void fpu_reset_cache(dynarec_ppc64le_t* dyn, int ninst, int reset_n); +void fpu_propagate_stack(dynarec_ppc64le_t* dyn, int ninst); +void fpu_purgecache(dynarec_ppc64le_t* dyn, int ninst, int next, int s1, int s2, int s3); +void mmx_purgecache(dynarec_ppc64le_t* dyn, int ninst, int next, int s1); +void fpu_reflectcache(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3); +void fpu_unreflectcache(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3); +void fpu_pushcache(dynarec_ppc64le_t* dyn, int ninst, 
int s1, int not07); +void fpu_popcache(dynarec_ppc64le_t* dyn, int ninst, int s1, int not07); +// Set rounding according to mxcsr flags, return reg to restore flags +int sse_setround(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2); + +// SSE/SSE2 helpers +// purge the XMM0..XMM7 cache (before function call) +void sse_purge07cache(dynarec_ppc64le_t* dyn, int ninst, int s1); +// get vmx register for a SSE reg, create the entry if needed +int sse_get_reg(dynarec_ppc64le_t* dyn, int ninst, int s1, int a, int forwrite); +// get vmx register for an SSE reg, but don't try to synch it if it needed to be created +int sse_get_reg_empty(dynarec_ppc64le_t* dyn, int ninst, int s1, int a); +// forget float register for a SSE reg, create the entry if needed +void sse_forget_reg(dynarec_ppc64le_t* dyn, int ninst, int a); +// Push current value to the cache +void sse_reflect_reg(dynarec_ppc64le_t* dyn, int ninst, int a); + +// MMX helpers +// get vmx register for a MMX reg, create the entry if needed +int mmx_get_reg(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int a); +// get vmx register for a MMX reg, but don't try to synch it if it needed to be created +int mmx_get_reg_empty(dynarec_ppc64le_t* dyn, int ninst, int s1, int s2, int s3, int a); + + +// AVX helpers +// get vmx register for a AVX reg, create the entry if needed +int avx_get_reg(dynarec_ppc64le_t* dyn, int ninst, int s1, int a, int forwrite, int width); +// get vmx register for an AVX reg, but don't try to synch it if it needed to be created +int avx_get_reg_empty(dynarec_ppc64le_t* dyn, int ninst, int s1, int a, int width); +// forget float register for a AVX reg, create the entry if needed +void avx_forget_reg(dynarec_ppc64le_t* dyn, int ninst, int a); +// Push current value to the cache +void avx_reflect_reg(dynarec_ppc64le_t* dyn, int ninst, int a); +void avx_reflect_reg_upper128(dynarec_ppc64le_t* dyn, int ninst, int a, int forwrite); + +void CacheTransform(dynarec_ppc64le_t* dyn, int ninst, int cacheupd, int s1, int s2, int s3); + +void ppc64le_move64(dynarec_ppc64le_t* dyn, int ninst, int reg, int64_t val); +void ppc64le_move32(dynarec_ppc64le_t* dyn, int ninst, int reg, int32_t val, int zeroup); + +// ======================================================================== +// CHECK_CACHE +// ======================================================================== +#if STEP < 2 +#define CHECK_CACHE() 0 +#else +#define CHECK_CACHE() (cacheupd = CacheNeedsTransform(dyn, ninst)) +#endif + +#define vmxcache_st_coherency STEPNAME(vmxcache_st_coherency) +int vmxcache_st_coherency(dynarec_ppc64le_t* dyn, int ninst, int a, int b); + +// ======================================================================== +// ST_IS_F / X87_COMBINE / X87_ST (using dyn->v for VMX cache) +// ======================================================================== +#if STEP == 0 +#define ST_IS_F(A) 0 +#define ST_IS_I64(A) 0 +#define X87_COMBINE(A, B) VMX_CACHE_ST_D +#define X87_ST0 VMX_CACHE_ST_D +#define X87_ST(A) VMX_CACHE_ST_D +#elif STEP == 1 +#define ST_IS_F(A) (vmxcache_get_current_st(dyn, ninst, A) == VMX_CACHE_ST_F) +#define ST_IS_I64(A) (vmxcache_get_current_st(dyn, ninst, A) == VMX_CACHE_ST_I64) +#define X87_COMBINE(A, B) vmxcache_combine_st(dyn, ninst, A, B) +#define X87_ST0 vmxcache_no_i64(dyn, ninst, 0, vmxcache_get_current_st(dyn, ninst, 0)) +#define X87_ST(A) vmxcache_no_i64(dyn, ninst, A, vmxcache_get_current_st(dyn, ninst, A)) +#else +#define ST_IS_F(A) (vmxcache_get_st(dyn, ninst, A) == VMX_CACHE_ST_F) +#define ST_IS_I64(A) 
(vmxcache_get_st(dyn, ninst, A) == VMX_CACHE_ST_I64) +#if STEP == 3 +#define X87_COMBINE(A, B) vmxcache_st_coherency(dyn, ninst, A, B) +#else +#define X87_COMBINE(A, B) vmxcache_get_st(dyn, ninst, A) +#endif +#define X87_ST0 vmxcache_get_st(dyn, ninst, 0) +#define X87_ST(A) vmxcache_get_st(dyn, ninst, A) +#endif + +// ======================================================================== +// Opcode decoder declarations +// ======================================================================== +uintptr_t dynarec64_00(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); +uintptr_t dynarec64_0F(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); +uintptr_t dynarec64_F30F(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); +uintptr_t dynarec64_64(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int seg, int* ok, int* need_epilog); +uintptr_t dynarec64_66(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); +uintptr_t dynarec64_6664(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int seg, int* ok, int* need_epilog); +uintptr_t dynarec64_67(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); +uintptr_t dynarec64_6764(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int seg, int* ok, int* need_epilog); +uintptr_t dynarec64_660F(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); +uintptr_t dynarec64_66F0(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); +uintptr_t dynarec64_66F20F(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); +uintptr_t dynarec64_66F30F(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); +uintptr_t dynarec64_F0(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); +uintptr_t dynarec64_F20F(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); +uintptr_t dynarec64_AVX(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog); +uintptr_t dynarec64_AVX_0F(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog); +uintptr_t dynarec64_AVX_0F38(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog); +uintptr_t dynarec64_AVX_66_0F(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog); +uintptr_t dynarec64_AVX_66_0F38(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog); +uintptr_t dynarec64_AVX_66_0F3A(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog); +uintptr_t dynarec64_AVX_F2_0F(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog); +uintptr_t dynarec64_AVX_F2_0F38(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog); +uintptr_t dynarec64_AVX_F2_0F3A(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog); 
+uintptr_t dynarec64_AVX_F3_0F(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog); +uintptr_t dynarec64_AVX_F3_0F38(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, vex_t vex, int* ok, int* need_epilog); +uintptr_t dynarec64_D8(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); +uintptr_t dynarec64_D9(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); +uintptr_t dynarec64_DA(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); +uintptr_t dynarec64_DB(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); +uintptr_t dynarec64_DC(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); +uintptr_t dynarec64_DD(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); +uintptr_t dynarec64_DE(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); +uintptr_t dynarec64_DF(dynarec_ppc64le_t* dyn, uintptr_t addr, uintptr_t ip, int ninst, rex_t rex, int* ok, int* need_epilog); + + +// ======================================================================== +// PASS3 +// ======================================================================== +#if STEP < 3 +#define PASS3(A) +#else +#define PASS3(A) A +#endif + +#if STEP < 3 +#define MAYUSE(A) (void)A +#else +#define MAYUSE(A) +#endif + +// ======================================================================== +// GOCOND — conditional jump/set/cmov dispatch +// ======================================================================== +// PPC64LE adaptation: SRLI_D → SRDI, ANDI → ANDId (which sets CR0) +// ANDId always sets CR0, so we can branch on it directly. +// But for the GO() macro pattern, we keep using BxxZ_gen / CBNZ etc. +// as the GO() caller provides the branch decision. 
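+// Worked example (illustrative only, not part of the macro): for the "L"
+// condition, x86 requires SF != OF. Assuming the usual EFLAGS bit layout
+// (F_SF = 7, F_OF = 11), the GO() prologue in case B + 0xC below is
+//   SRDI(tmp1, xFlags, 4)      // F_OF - F_SF = 4: align OF over SF
+//   XOR(tmp1, tmp1, xFlags)    // bit F_SF of tmp1 is now SF ^ OF
+//   ANDId(tmp1, tmp1, 1 << 7)  // isolate it; the dot form also sets CR0
+// leaving GO() to pick the taken/not-taken branch from tmp1 being zero or
+// non-zero (the EQZ/NEZ arguments).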
+#define GOCOND(B, T1, T2) \ + case B + 0x0: \ + INST_NAME(T1 "O " T2); \ + GO(ANDId(tmp1, xFlags, 1 << F_OF), EQZ, NEZ, _, _, X_OF, X64_JMP_JO); \ + break; \ + case B + 0x1: \ + INST_NAME(T1 "NO " T2); \ + GO(ANDId(tmp1, xFlags, 1 << F_OF), NEZ, EQZ, _, _, X_OF, X64_JMP_JNO); \ + break; \ + case B + 0x2: \ + INST_NAME(T1 "C " T2); \ + GO(ANDId(tmp1, xFlags, 1 << F_CF), EQZ, NEZ, GEU, LTU, X_CF, X64_JMP_JC); \ + break; \ + case B + 0x3: \ + INST_NAME(T1 "NC " T2); \ + GO(ANDId(tmp1, xFlags, 1 << F_CF), NEZ, EQZ, LTU, GEU, X_CF, X64_JMP_JNC); \ + break; \ + case B + 0x4: \ + INST_NAME(T1 "Z " T2); \ + GO(ANDId(tmp1, xFlags, 1 << F_ZF), EQZ, NEZ, NE, EQ, X_ZF, X64_JMP_JZ); \ + break; \ + case B + 0x5: \ + INST_NAME(T1 "NZ " T2); \ + GO(ANDId(tmp1, xFlags, 1 << F_ZF), NEZ, EQZ, EQ, NE, X_ZF, X64_JMP_JNZ); \ + break; \ + case B + 0x6: \ + INST_NAME(T1 "BE " T2); \ + GO(ANDId(tmp1, xFlags, (1 << F_CF) | (1 << F_ZF)), EQZ, NEZ, GTU, LEU, X_CF | X_ZF, X64_JMP_JBE); \ + break; \ + case B + 0x7: \ + INST_NAME(T1 "NBE " T2); \ + GO(ANDId(tmp1, xFlags, (1 << F_CF) | (1 << F_ZF)), NEZ, EQZ, LEU, GTU, X_CF | X_ZF, X64_JMP_JNBE); \ + break; \ + case B + 0x8: \ + INST_NAME(T1 "S " T2); \ + GO(ANDId(tmp1, xFlags, 1 << F_SF), EQZ, NEZ, GE, LT, X_SF, X64_JMP_JS); \ + break; \ + case B + 0x9: \ + INST_NAME(T1 "NS " T2); \ + GO(ANDId(tmp1, xFlags, 1 << F_SF), NEZ, EQZ, LT, GE, X_SF, X64_JMP_JNS); \ + break; \ + case B + 0xA: \ + INST_NAME(T1 "P " T2); \ + GO(ANDId(tmp1, xFlags, 1 << F_PF), EQZ, NEZ, _, _, X_PF, X64_JMP_JP); \ + break; \ + case B + 0xB: \ + INST_NAME(T1 "NP " T2); \ + GO(ANDId(tmp1, xFlags, 1 << F_PF), NEZ, EQZ, _, _, X_PF, X64_JMP_JNP); \ + break; \ + case B + 0xC: \ + INST_NAME(T1 "L " T2); \ + GO(SRDI(tmp1, xFlags, F_OF - F_SF); \ + XOR(tmp1, tmp1, xFlags); \ + ANDId(tmp1, tmp1, 1 << F_SF), EQZ, NEZ, GE, LT, X_SF | X_OF, X64_JMP_JL); \ + break; \ + case B + 0xD: \ + INST_NAME(T1 "GE " T2); \ + GO(SRDI(tmp1, xFlags, F_OF - F_SF); \ + XOR(tmp1, tmp1, xFlags); \ + ANDId(tmp1, tmp1, 1 << F_SF), NEZ, EQZ, LT, GE, X_SF | X_OF, X64_JMP_JGE); \ + break; \ + case B + 0xE: \ + INST_NAME(T1 "LE " T2); \ + GO(SRDI(tmp1, xFlags, F_OF - F_SF); \ + XOR(tmp1, tmp1, xFlags); \ + ANDId(tmp1, tmp1, 1 << F_SF); \ + ANDId(tmp3, xFlags, 1 << F_ZF); \ + OR(tmp1, tmp1, tmp3); \ + ANDId(tmp1, tmp1, (1 << F_SF) | (1 << F_ZF)), EQZ, NEZ, GT, LE, X_SF | X_OF | X_ZF, X64_JMP_JLE); \ + break; \ + case B + 0xF: \ + INST_NAME(T1 "G " T2); \ + GO(SRDI(tmp1, xFlags, F_OF - F_SF); \ + XOR(tmp1, tmp1, xFlags); \ + ANDId(tmp1, tmp1, 1 << F_SF); \ + ANDId(tmp3, xFlags, 1 << F_ZF); \ + OR(tmp1, tmp1, tmp3); \ + ANDId(tmp1, tmp1, (1 << F_SF) | (1 << F_ZF)), NEZ, EQZ, LE, GT, X_SF | X_OF | X_ZF, X64_JMP_JG); \ + break + +// ======================================================================== +// Native flag fusion macros for PPC64LE +// ======================================================================== +// PPC64LE needs separate compare + branch instructions (unlike LA64 which +// has combined compare-and-branch). So: +// B##COND##_safe(r1, r2, imm) = 3 instructions (matches BNEZ_safe/BEZ_safe sizing) +// B##COND##_(r1, r2, imm) = 2 instructions (matches BxxZ_gen sizing) +// +// Signed conditions use CMPD, unsigned use CMPLD. +// Near/far branch ranges: conditional ±32KB, unconditional ±32MB. +// +// IMPORTANT: xZR == r0 == 0, but r0 is NOT a hardware zero register on PPC64LE. +// When r2 == xZR (0), we must use CMPDI/CMPLDI with immediate 0 instead of +// CMPD/CMPLD against r0 (which may hold a non-zero value). 
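+// Example of the trap (sketch): a fused compare against zero such as
+// BEQ_(s1, xZR, imm) must not emit CMPD(s1, r0), since r0 is an ordinary
+// GPR on PPC64LE and may hold live data; the _ZR wrappers below rewrite
+// that case to the immediate form CMPDI(s1, 0) instead.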
+#define CMPD_ZR(r1, r2) do { if ((r2) == xZR) CMPDI(r1, 0); else CMPD(r1, r2); } while(0) +#define CMPLD_ZR(r1, r2) do { if ((r2) == xZR) CMPLDI(r1, 0); else CMPLD(r1, r2); } while(0) + +// --- 3-arg B##COND##_safe: always 3 instructions --- +// Pattern: CMPD/CMPLD + near(Bcond+NOP) or far(inv_Bcond+B) +// imm is offset from start of this macro to target. +// Bcond sits at +4, so near Bcond offset = imm-4. +// For far case: inv_Bcond skips over B (offset=8), B sits at +8 so B offset = imm-8. + +#define BEQ_safe(r1, r2, imm) \ + do { \ + CMPD_ZR(r1, r2); \ + if ((imm) > -0x7000 && (imm) < 0x7000) { \ + BEQ((imm) - 4); \ + NOP(); \ + } else { \ + BNE(2 * 4); \ + B((imm) - 8); \ + } \ + } while (0) + +#define BNE_safe(r1, r2, imm) \ + do { \ + CMPD_ZR(r1, r2); \ + if ((imm) > -0x7000 && (imm) < 0x7000) { \ + BNE((imm) - 4); \ + NOP(); \ + } else { \ + BEQ(2 * 4); \ + B((imm) - 8); \ + } \ + } while (0) + +#define BLT_safe(r1, r2, imm) \ + do { \ + CMPD_ZR(r1, r2); \ + if ((imm) > -0x7000 && (imm) < 0x7000) { \ + BLT((imm) - 4); \ + NOP(); \ + } else { \ + BGE(2 * 4); \ + B((imm) - 8); \ + } \ + } while (0) + +#define BGE_safe(r1, r2, imm) \ + do { \ + CMPD_ZR(r1, r2); \ + if ((imm) > -0x7000 && (imm) < 0x7000) { \ + BGE((imm) - 4); \ + NOP(); \ + } else { \ + BLT(2 * 4); \ + B((imm) - 8); \ + } \ + } while (0) + +#define BGT_safe(r1, r2, imm) \ + do { \ + CMPD_ZR(r1, r2); \ + if ((imm) > -0x7000 && (imm) < 0x7000) { \ + BGT((imm) - 4); \ + NOP(); \ + } else { \ + BLE(2 * 4); \ + B((imm) - 8); \ + } \ + } while (0) + +#define BLE_safe(r1, r2, imm) \ + do { \ + CMPD_ZR(r1, r2); \ + if ((imm) > -0x7000 && (imm) < 0x7000) { \ + BLE((imm) - 4); \ + NOP(); \ + } else { \ + BGT(2 * 4); \ + B((imm) - 8); \ + } \ + } while (0) + +// Unsigned variants use CMPLD +#define BLTU_safe(r1, r2, imm) \ + do { \ + CMPLD_ZR(r1, r2); \ + if ((imm) > -0x7000 && (imm) < 0x7000) { \ + BLT((imm) - 4); \ + NOP(); \ + } else { \ + BGE(2 * 4); \ + B((imm) - 8); \ + } \ + } while (0) + +#define BGEU_safe(r1, r2, imm) \ + do { \ + CMPLD_ZR(r1, r2); \ + if ((imm) > -0x7000 && (imm) < 0x7000) { \ + BGE((imm) - 4); \ + NOP(); \ + } else { \ + BLT(2 * 4); \ + B((imm) - 8); \ + } \ + } while (0) + +#define BGTU_safe(r1, r2, imm) \ + do { \ + CMPLD_ZR(r1, r2); \ + if ((imm) > -0x7000 && (imm) < 0x7000) { \ + BGT((imm) - 4); \ + NOP(); \ + } else { \ + BLE(2 * 4); \ + B((imm) - 8); \ + } \ + } while (0) + +#define BLEU_safe(r1, r2, imm) \ + do { \ + CMPLD_ZR(r1, r2); \ + if ((imm) > -0x7000 && (imm) < 0x7000) { \ + BLE((imm) - 4); \ + NOP(); \ + } else { \ + BGT(2 * 4); \ + B((imm) - 8); \ + } \ + } while (0) + +// B__safe: unconditional placeholder (dead code, never reached in fusion path) +// Must be 3 instructions to match _safe sizing. +#define B__safe(a, b, imm) \ + do { \ + NOP(); \ + NOP(); \ + NOP(); \ + } while (0) + +// --- 3-arg B##COND##_: always 2 instructions (matches BxxZ_gen sizing) --- +// Pattern: CMPD/CMPLD + Bcond +// imm is offset from start of this macro to target. Bcond at +4, so offset = imm-4. 
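+// Sizing example (illustrative): a far BNE_safe(r1, r2, 0x9000), with the
+// target outside the ±32KB conditional range, expands to
+//   CMPD r1, r2      // +0
+//   BEQ  +8          // +4: inverted condition hops over the far branch
+//   B    0x9000 - 8  // +8: unconditional, ±32MB reach
+// while a near target emits CMPD + Bcond + NOP. Both paths are 12 bytes,
+// and the 2-instruction forms below are always 8, so the fused path takes
+// exactly the space of the BxxZ_safe / BxxZ_gen path it replaces and the
+// pass 2 size bookkeeping stays valid in pass 3.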
+ +#define BEQ_(r1, r2, imm) \ + do { \ + CMPD_ZR(r1, r2); \ + BEQ((imm) - 4); \ + } while (0) + +#define BNE_(r1, r2, imm) \ + do { \ + CMPD_ZR(r1, r2); \ + BNE((imm) - 4); \ + } while (0) + +#define BLT_(r1, r2, imm) \ + do { \ + CMPD_ZR(r1, r2); \ + BLT((imm) - 4); \ + } while (0) + +#define BGE_(r1, r2, imm) \ + do { \ + CMPD_ZR(r1, r2); \ + BGE((imm) - 4); \ + } while (0) + +#define BGT_(r1, r2, imm) \ + do { \ + CMPD_ZR(r1, r2); \ + BGT((imm) - 4); \ + } while (0) + +#define BLE_(r1, r2, imm) \ + do { \ + CMPD_ZR(r1, r2); \ + BLE((imm) - 4); \ + } while (0) + +// Unsigned variants use CMPLD +#define BLTU_(r1, r2, imm) \ + do { \ + CMPLD_ZR(r1, r2); \ + BLT((imm) - 4); \ + } while (0) + +#define BGEU_(r1, r2, imm) \ + do { \ + CMPLD_ZR(r1, r2); \ + BGE((imm) - 4); \ + } while (0) + +#define BGTU_(r1, r2, imm) \ + do { \ + CMPLD_ZR(r1, r2); \ + BGT((imm) - 4); \ + } while (0) + +#define BLEU_(r1, r2, imm) \ + do { \ + CMPLD_ZR(r1, r2); \ + BLE((imm) - 4); \ + } while (0) + +// B__: unconditional placeholder (dead code). Must be 2 instructions. +#define B__(a, b, imm) \ + do { \ + NOP(); \ + NOP(); \ + } while (0) + +// --- 3-arg S##COND##_: NATIVESET (set rd = (op1 COND op2) ? 1 : 0) --- +// Pattern: CMPD/CMPLD + MFCR + extract CR0 bit +// CR0 bits in MFCR result (32-bit): LT=bit31, GT=bit30, EQ=bit29, SO=bit28 +// RLWINM(dst, dst, shift, 31, 31) extracts one bit to position 0. +// LT: shift=1, GT: shift=2, EQ: shift=3 + +// Direct conditions: extract the relevant bit +#define SLT_(dst, r1, r2) \ + do { \ + CMPD_ZR(r1, r2); \ + MFCR(dst); \ + RLWINM(dst, dst, 1, 31, 31); \ + } while (0) + +#define SGT_(dst, r1, r2) \ + do { \ + CMPD_ZR(r1, r2); \ + MFCR(dst); \ + RLWINM(dst, dst, 2, 31, 31); \ + } while (0) + +#define SEQ_(dst, r1, r2) \ + do { \ + CMPD_ZR(r1, r2); \ + MFCR(dst); \ + RLWINM(dst, dst, 3, 31, 31); \ + } while (0) + +// Inverted conditions: extract the opposite bit and XOR with 1 +#define SGE_(dst, r1, r2) \ + do { \ + CMPD_ZR(r1, r2); \ + MFCR(dst); \ + RLWINM(dst, dst, 1, 31, 31); \ + XORI(dst, dst, 1); \ + } while (0) + +#define SLE_(dst, r1, r2) \ + do { \ + CMPD_ZR(r1, r2); \ + MFCR(dst); \ + RLWINM(dst, dst, 2, 31, 31); \ + XORI(dst, dst, 1); \ + } while (0) + +#define SNE_(dst, r1, r2) \ + do { \ + CMPD_ZR(r1, r2); \ + MFCR(dst); \ + RLWINM(dst, dst, 3, 31, 31); \ + XORI(dst, dst, 1); \ + } while (0) + +// Unsigned variants use CMPLD +#define SLTU_(dst, r1, r2) \ + do { \ + CMPLD_ZR(r1, r2); \ + MFCR(dst); \ + RLWINM(dst, dst, 1, 31, 31); \ + } while (0) + +#define SGTU_(dst, r1, r2) \ + do { \ + CMPLD_ZR(r1, r2); \ + MFCR(dst); \ + RLWINM(dst, dst, 2, 31, 31); \ + } while (0) + +#define SGEU_(dst, r1, r2) \ + do { \ + CMPLD_ZR(r1, r2); \ + MFCR(dst); \ + RLWINM(dst, dst, 1, 31, 31); \ + XORI(dst, dst, 1); \ + } while (0) + +#define SLEU_(dst, r1, r2) \ + do { \ + CMPLD_ZR(r1, r2); \ + MFCR(dst); \ + RLWINM(dst, dst, 2, 31, 31); \ + XORI(dst, dst, 1); \ + } while (0) + +// S__: unconditional placeholder (dead code). Sets dst=1 unconditionally. +#define S__(dst, r1, r2) \ + do { \ + LI(dst, 1); \ + } while (0) + +// --- 4-arg MV##COND##_: NATIVEMV (dst = (op1 COND op2) ? src : dst) --- +// Not used by PPC64LE opcode tables currently. Stub with NOPs. 
+#define MV__(a, b, c, d) NOP()
+#define MVGT_(a, b, c, d) NOP()
+#define MVLE_(a, b, c, d) NOP()
+#define MVLT_(a, b, c, d) NOP()
+#define MVGE_(a, b, c, d) NOP()
+#define MVGTU_(a, b, c, d) NOP()
+#define MVLEU_(a, b, c, d) NOP()
+#define MVLTU_(a, b, c, d) NOP()
+#define MVGEU_(a, b, c, d) NOP()
+#define MVEQ_(a, b, c, d) NOP()
+#define MVNE_(a, b, c, d) NOP()
+
+#define NATIVEJUMP_safe(COND, val) \
+    B##COND##_safe(dyn->insts[ninst].nat_flags_op1, dyn->insts[ninst].nat_flags_op2, val);
+
+#define NATIVEJUMP(COND, val) \
+    B##COND##_(dyn->insts[ninst].nat_flags_op1, dyn->insts[ninst].nat_flags_op2, val);
+
+#define NATIVESET(COND, rd) \
+    S##COND##_(rd, dyn->insts[ninst].nat_flags_op1, dyn->insts[ninst].nat_flags_op2);
+
+#define NATIVEMV(COND, rd, rs) \
+    MV##COND##_(rd, rs, dyn->insts[ninst].nat_flags_op1, dyn->insts[ninst].nat_flags_op2);
+
+// ========================================================================
+// NOTEST / SKIPTEST / GOTEST
+// ========================================================================
+// PPC64LE: cannot store xZR, must use LI(scratch, 0) + STW
+#define NOTEST(s1) \
+    if (BOX64ENV(dynarec_test)) { \
+        LI(s1, 0); \
+        STW(s1, offsetof(x64emu_t, test.test), xEmu); \
+        STW(s1, offsetof(x64emu_t, test.clean), xEmu); \
+    }
+
+#define SKIPTEST(s1) \
+    if (BOX64ENV(dynarec_test)) { \
+        LI(s1, 0); \
+        STW(s1, offsetof(x64emu_t, test.clean), xEmu); \
+    }
+
+#define GOTEST(s1, s2) \
+    if (BOX64ENV(dynarec_test)) { \
+        MOV32w(s2, 1); \
+        STW(s2, offsetof(x64emu_t, test.test), xEmu); \
+    }
+
+#define GETREX() \
+    rex.rex = 0; \
+    if (!rex.is32bits) \
+        while (opcode >= 0x40 && opcode <= 0x4f) { \
+            rex.rex = opcode; \
+            opcode = F8; \
+        }
+
+// ========================================================================
+// FCOM — set x87 Status Word C0/C2/C3 from FP comparison
+// ========================================================================
+// x87 SW bits: C0=bit8, C1=bit9, C2=bit10, C3=bit14
+// GT: C3=0,C2=0,C0=0 -> 0x0000
+// LT: C3=0,C2=0,C0=1 -> 0x0100
+// EQ: C3=1,C2=0,C0=0 -> 0x4000
+// UN: C3=1,C2=1,C0=1 -> 0x4500
+// Mask to clear C0,C1,C2,C3: ~(0x4700) = 0xB8FF
+// The implementation is branchless: FCMPU sets CR0, then each outcome's
+// bit pattern is selected with ISEL (s1 holds 0 for the not-taken case)
+// and OR-ed into the masked status word.
+#define FCOM(w, v1, v2, s1, s2, s3) \
+    LHZ(s3, offsetof(x64emu_t, sw), xEmu); \
+    ANDId(s3, s3, 0xB8FF); /* mask off C0(8),C1(9),C2(10),C3(14) */ \
+    FCMPU(0, v1, v2); \
+    /* Build result branchlessly using ISEL */ \
+    LI(s1, 0); /* zero for ISEL false case */ \
+    /* LT → C0 (0x0100) */ \
+    LI(s2, 0x0100); \
+    ISEL(s2, s2, s1, BI(CR0, CR_LT)); /* s2 = LT ? 0x0100 : 0 */ \
+    OR(s3, s3, s2); \
+    /* EQ → C3 (0x4000) */ \
+    LI(s2, 0x4000); \
+    ISEL(s2, s2, s1, BI(CR0, CR_EQ)); /* s2 = EQ ? 0x4000 : 0 */ \
+    OR(s3, s3, s2); \
+    /* SO → C0|C2|C3 (0x4500) */ \
+    LI(s2, 0x4500); \
+    ISEL(s2, s2, s1, BI(CR0, CR_SO)); /* s2 = SO ?
0x4500 : 0 */ \ + OR(s3, s3, s2); \ + STH(s3, offsetof(x64emu_t, sw), xEmu); + +#define FCOMS(v1, v2, s1, s2, s3) FCOM(S, v1, v2, s1, s2, s3) +#define FCOMD(v1, v2, s1, s2, s3) FCOM(D, v1, v2, s1, s2, s3) + +// ======================================================================== +// FCOMI — set EFLAGS CF/PF/ZF from FP comparison +// ======================================================================== +// GT: CF=0,PF=0,ZF=0 -> 0x00 +// LT: CF=1,PF=0,ZF=0 -> 0x01 (bit 0) +// EQ: CF=0,PF=0,ZF=1 -> 0x40 (bit 6) +// UN: CF=1,PF=1,ZF=1 -> 0x45 (bits 0,2,6) +// Also clear OF(bit11), AF(bit4), SF(bit7) +// Layout after FCMPU (all values fit in signed 16-bit = 1 insn each): +// +0: BC SO -> +5*4 (-> UN handler at +20) +// +4: BEQ -> +6*4 (-> EQ handler at +28) +// +8: BLT -> +7*4 (-> LT handler at +36) +// +12: LI s1, 0 (GT) +// +16: B +6*4 (-> end at +40) +// +20: MR s1, s2 (UN: s2 still has 0x45) +// +24: B +4*4 (-> end at +40) +// +28: LI s1, 0x40 (EQ) +// +32: B +2*4 (-> end at +40) +// +36: LI s1, 1 (LT) +// +40: OR (end) +#define FCOMI(w, v1, v2, s1, s2) \ + IFX (X_OF | X_AF | X_SF | X_PEND) { \ + MOV64x(s2, ((1 << F_OF) | (1 << F_AF) | (1 << F_SF))); \ + ANDC(xFlags, xFlags, s2); \ + } \ + IFX (X_CF | X_PF | X_ZF | X_PEND) { \ + MOV32w(s2, 0b01000101); /* CF|PF|ZF mask = 0x45 */ \ + ANDC(xFlags, xFlags, s2); \ + FCMPU(0, v1, v2); \ + BC(BO_TRUE, BI(CR0, CR_SO), 5*4); /* -> UN at +20 */ \ + BEQ(6*4); /* -> EQ at +28 */ \ + BLT(7*4); /* -> LT at +36 */ \ + /* GT: s1 = 0 */ \ + LI(s1, 0); \ + B(6*4); /* -> end at +40 */ \ + /* UN: CF|PF|ZF = 0x45 */ \ + MR(s1, s2); /* s2 already has 0x45 */ \ + B(4*4); /* -> end at +40 */ \ + /* EQ: ZF = 0x40 */ \ + LI(s1, 0b01000000); \ + B(2*4); /* -> end at +40 */ \ + /* LT: CF = 1 */ \ + LI(s1, 1); \ + /* end: merge into flags */ \ + OR(xFlags, xFlags, s1); \ + } \ + SET_DFNONE() + +#define FCOMIS(v1, v2, s1, s2) FCOMI(S, v1, v2, s1, s2) +#define FCOMID(v1, v2, s1, s2) FCOMI(D, v1, v2, s1, s2) + +// PPC64LE has no LBT — RESTORE_EFLAGS / SPILL_EFLAGS are empty +#define RESTORE_EFLAGS(s) +#define SPILL_EFLAGS() + +// SNEZ: Set if Not Equal to Zero — dst = (src != 0) ? 1 : 0 +// Uses SUBFIC+SUBFE+NEG sequence: carry-based detection +#define SNEZ(dst, src) \ + do { \ + SUBFIC(dst, src, 0); \ + SUBFE(dst, dst, dst); \ + NEG(dst, dst); \ + } while (0) + +// SEQZ: Set Equal Zero — dst = (src == 0) ? 1 : 0 +#define SEQZ(dst, src) \ + do { \ + CNTLZD(dst, src); \ + SRDI(dst, dst, 6); \ + } while (0) + +// SLTI: Set Less Than Immediate — dst = (src < imm) ? 
1 : 0 +// PPC64LE has no direct SLTI; use CMPDI + extract LT bit from CR0 +#define SLTI(dst, src, imm) \ + do { \ + CMPDI(src, imm); \ + MFCR(dst); \ + RLWINM(dst, dst, 1, 31, 31); \ + } while (0) + +// LDXxw: Indexed load, 32/64 conditional on rex.w +#define LDXxw(rd, base, idx) \ + do { \ + if (rex.w) \ + LDX(rd, base, idx); \ + else \ + LWZX(rd, base, idx); \ + } while (0) + +// MOD_DU: unsigned 64-bit modulo (dst = dst % divisor), uses x3 as scratch +// PPC64LE has no modulo — implement as: q = dst / divisor; dst = dst - q * divisor +#define MOD_DU(dst, dst2, divisor) \ + do { \ + DIVDU(x3, dst, divisor); \ + MULLD(x3, x3, divisor); \ + SUB(dst, dst, x3); \ + } while (0) + +// MOD_D: signed 64-bit modulo (dst = dst % divisor), uses x3 as scratch +#define MOD_D(dst, dst2, divisor) \ + do { \ + DIVD(x3, dst, divisor); \ + MULLD(x3, x3, divisor); \ + SUB(dst, dst, x3); \ + } while (0) + +// MOD_WU: unsigned 32-bit modulo (dst = dst % divisor), uses x3 as scratch +#define MOD_WU(dst, dst2, divisor) \ + do { \ + DIVWU(x3, dst, divisor); \ + MULLW(x3, x3, divisor); \ + SUB(dst, dst, x3); \ + } while (0) + +// MOD_W: signed 32-bit modulo (dst = dst % divisor), uses x3 as scratch +#define MOD_W(dst, dst2, divisor) \ + do { \ + DIVW(x3, dst, divisor); \ + MULLW(x3, x3, divisor); \ + SUB(dst, dst, x3); \ + } while (0) + +#define PURGE_YMM() + +#define ALIGNED_ATOMICxw ((fixedaddress && !(fixedaddress & (((1 << (2 + rex.w)) - 1)))) || BOX64ENV(dynarec_aligned_atomics)) +#define ALIGNED_ATOMICH ((fixedaddress && !(fixedaddress & 1)) || BOX64ENV(dynarec_aligned_atomics)) + +// ======================================================================== +// LOCK_* macros for PPC64LE using LL/SC (lwarx/stwcx., ldarx/stdcx.) +// ======================================================================== +// PPC64LE has byte/half/word/dword LL/SC, so byte ops can use LBARX/STBCXd directly. 
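+// Semantic model of the LL/SC loops below (illustrative C, not emitted
+// code; "store_conditional" is a stand-in for stbcx./stwcx./stdcx.):
+//
+//     do {
+//         old = *addr;    /* lbarx: load and acquire reservation */
+//         new = op(old);  /* the macro body computes the result  */
+//     } while (!store_conditional(addr, new)); /* retry if reservation lost */
+//
+// The LWSYNC barriers before and after the loop provide the acquire/release
+// ordering that x86 LOCK-prefixed operations guarantee.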
+
+// LOCK_8_OP: atomic byte operation using LBARX/STBCXd
+// s1 = old byte value (output), wback = address, op uses s1 as input and puts result in s4
+// Typical: LOCK_8_OP(ADD(s4, s1, gd), s1, wback, s3, s4, s5, s6)
+#define LOCK_8_OP(op, s1, wback, s3, s4, s5, s6) \
+    do { \
+        LWSYNC(); \
+        MARKLOCK; \
+        LBARX(s1, 0, wback); \
+        op; \
+        STBCXd(s4, 0, wback); \
+        BNE_MARKLOCK_CR0; \
+        LWSYNC(); \
+    } while (0)
+
+// LOCK_32_IN_8BYTE: atomic 32-bit op on an unaligned address within an 8-byte block
+// Uses LDARX/STDCXd on the containing 8-byte-aligned address
+// s1 = old 32-bit value (output), wback = original address, op computes new value in s4
+#define LOCK_32_IN_8BYTE(op, s1, wback, s3, s4, s5, s6) \
+    do { \
+        ANDI(s3, wback, 0b100);   /* byte offset within 8B: 0 or 4 */ \
+        SLDI(s3, s3, 3);          /* bit offset: 0 or 32 */ \
+        RLDICR(s6, wback, 0, 60); /* align to 8 bytes: wback & ~7 */ \
+        LWSYNC(); \
+        MARKLOCK; \
+        LDARX(s5, 0, s6);         /* load-linked 8 bytes */ \
+        SRD(s1, s5, s3);          /* shift old 32-bit value down */ \
+        RLDICL(s1, s1, 0, 32);    /* zero upper 32 bits */ \
+        op;                       /* s4 = f(s1, operand) */ \
+        /* insert s4 back: clear 32-bit field, OR in new value */ \
+        LI(s1, -1);               /* temp all-ones */ \
+        RLDICL(s1, s1, 0, 32);    /* mask = 0x00000000FFFFFFFF */ \
+        SLD(s1, s1, s3);          /* shift mask to position */ \
+        ANDC(s5, s5, s1);         /* clear old field */ \
+        RLDICL(s4, s4, 0, 32);    /* zero upper bits of new value */ \
+        SLD(s4, s4, s3);          /* shift new value to position */ \
+        OR(s5, s5, s4);           /* insert new value */ \
+        STDCXd(s5, 0, s6);        /* store-conditional */ \
+        BNE_MARKLOCK_CR0; \
+        LWSYNC(); \
+        /* restore s1 = old 32-bit value */ \
+        SRD(s1, s5, s3); \
+        RLDICL(s1, s1, 0, 32); \
+    } while (0)
+
+// LOCK_3264_CROSS_8BYTE: best-effort atomic for values crossing an 8-byte boundary
+// Uses LDARX/STDCXd as a lock on the lower aligned 8B, then a non-atomic load/modify/store
+#define LOCK_3264_CROSS_8BYTE(op, s1, wback, s4, s5, s6) \
+    do { \
+        RLDICR(s6, wback, 0, 60); /* align to 8 bytes */ \
+        LWSYNC(); \
+        MARKLOCK; \
+        LDARX(s5, 0, s6);         /* reservation lock */ \
+        if (rex.w) { LD(s1, 0, wback); } else { LWZ(s1, 0, wback); } \
+        op;                       /* s4 = f(s1, operand) */ \
+        if (rex.w) { STD(s4, 0, wback); } else { STW(s4, 0, wback); } \
+        STDCXd(s5, 0, s6);        /* release reservation */ \
+        BNE_MARKLOCK_CR0; \
+        LWSYNC(); \
+    } while (0)
+
+#ifndef SCRATCH_USAGE
+#define SCRATCH_USAGE(usage)
+#endif
+
+// ========================================================================
+// REVBxw — byte-reverse (BSWAP) for 32 or 64 bits
+// ========================================================================
+// Uses the stack red zone + a scratch register for a store/load byte-reverse
+// Note: tmp is a scratch GPR
+#define REVBxw(Rd, Rs, tmp) \
+    do { \
+        if (rex.w) { \
+            STD(Rs, -8, xSP); \
+            LI(tmp, -8); \
+            LDBRX(Rd, xSP, tmp); \
+        } else { \
+            STW(Rs, -4, xSP); \
+            LI(tmp, -4); \
+            LWBRX(Rd, xSP, tmp); \
+            ZEROUP(Rd); \
+        } \
+    } while (0)
+
+#endif //__DYNAREC_PPC64LE_HELPER_H__
diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_jmpnext.c b/src/dynarec/ppc64le/dynarec_ppc64le_jmpnext.c
new file mode 100644
index 0000000000..6661a14ded
--- /dev/null
+++ b/src/dynarec/ppc64le/dynarec_ppc64le_jmpnext.c
@@ -0,0 +1,30 @@
+#include <stdint.h>
+
+#include "ppc64le_emitter.h"
+
+#define EMIT(A) \
+    do { \
+        *block = (A); \
+        ++block; \
+    } while (0)
+void CreateJmpNext(void* addr, void* next)
+{
+    uint32_t* block = (uint32_t*)addr;
+    // PC-relative load of the target address stored at 'next'.
+ // BCL 20,31,.+4 sets LR = address of the next instruction (addr+4). + // We compute the offset from LR to 'next' and use LD to load it. + // BCL 20,31,.+4 -> LR = addr+4 + // MFLR r12 -> r12 = addr+4 + // LD r12, offset(r12) -> r12 = *(addr+4+offset) = *(next) = target + // MTCTR r12 + // BCTR + // Total: 5 instructions = 20 bytes, fits in JMPNEXT_SIZE (40 bytes) + // with 8 bytes for block ptr at start and 8 bytes for target ptr at end. + int reg = 12; // r12 is scratch + intptr_t offset = (intptr_t)next - ((intptr_t)addr + 4); + BCL(20, 31, 4); + MFLR(reg); + LD(reg, (int16_t)offset, reg); + MTCTR(reg); + BCTR(); +} diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_pass0.h b/src/dynarec/ppc64le/dynarec_ppc64le_pass0.h new file mode 100644 index 0000000000..daa668a46d --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_pass0.h @@ -0,0 +1,97 @@ + +#define INIT uintptr_t sav_addr=addr +#define FINI \ + dyn->isize = addr-sav_addr; \ + dyn->insts[ninst].x64.addr = addr; \ + if(ninst) dyn->insts[ninst-1].x64.size = dyn->insts[ninst].x64.addr - dyn->insts[ninst-1].x64.addr + +#define MESSAGE(A, ...) do {} while (0) +#define READFLAGS(A) \ + do { \ + dyn->insts[ninst].x64.use_flags = A; \ + if (dyn->f != status_none_pending) dyn->f = status_none; \ + if (!BOX64ENV(dynarec_df) && (A) & X_PEND) dyn->insts[ninst].x64.use_flags = X_ALL; \ + dyn->f = status_none; \ + } while (0) + +#define READFLAGS_FUSION(A, s1, s2, s3, s4, s5) \ + if (BOX64ENV(dynarec_nativeflags) && !BOX64ENV(dynarec_test) && ninst > 0) { \ + int prev = ninst - 1; \ + while (prev && dyn->insts[prev].no_scratch_usage) \ + prev -= 1; \ + if (!dyn->insts[prev].nat_flags_nofusion) { \ + if ((A) == (X_ZF)) \ + dyn->insts[ninst].nat_flags_fusion = 1; \ + else if (dyn->insts[prev].nat_flags_carry && ((A) == (X_CF) || (A) == (X_CF | X_ZF))) \ + dyn->insts[ninst].nat_flags_fusion = 1; \ + else if (dyn->insts[prev].nat_flags_sign && ((A) == (X_SF | X_OF) || (A) == (X_SF | X_OF | X_ZF))) \ + dyn->insts[ninst].nat_flags_fusion = 1; \ + else if (dyn->insts[prev].nat_flags_sf && dyn->insts[prev].nat_flags_sign && (A) == X_SF) \ + dyn->insts[ninst].nat_flags_fusion = 1; \ + } \ + } \ + READFLAGS(A); +#define SETFLAGS(A, B, FUSION) \ + do { \ + dyn->insts[ninst].x64.set_flags = A; \ + dyn->insts[ninst].x64.state_flags = (B) & ~SF_DF; \ + dyn->f = ((B) & SF_SET) ? (((B) == SF_SET_NODF) ? dyn->f : status_none_pending) : (((B) & SF_SET_PENDING) ? 
status_set : status_none_pending); \ + if (!BOX64ENV(dynarec_df)) { \ + dyn->f = status_none; \ + if ((A) == SF_PENDING) { \ + printf_log(LOG_INFO, "Warning, some opcode use SF_PENDING, forcing deferedflags ON\n"); \ + SET_BOX64ENV(dynarec_df, 1); \ + } \ + } \ + dyn->insts[ninst].nat_flags_nofusion = (FUSION); \ + } while (0) +#define EMIT(A) dyn->native_size+=4 +#define JUMP(A, C) add_jump(dyn, ninst); add_next(dyn, (uintptr_t)A); SMEND(); dyn->insts[ninst].x64.jmp = A; dyn->insts[ninst].x64.jmp_cond = C; dyn->insts[ninst].x64.jmp_insts = 0 +#define BARRIER(A) if(A!=BARRIER_MAYBE) {fpu_purgecache(dyn, ninst, 0, x1, x2, x3); dyn->insts[ninst].x64.barrier = A;} else dyn->insts[ninst].barrier_maybe = 1 +#define SET_HASCALLRET() dyn->insts[ninst].x64.has_callret = 1 +#define NEW_INST \ + ++dyn->size; \ + dyn->insts[ninst].x64.addr = ip; \ + dyn->v.combined1 = dyn->v.combined2 = 0;\ + dyn->v.swapped = 0; dyn->v.barrier = 0; \ + dyn->insts[ninst].f_entry = dyn->f; \ + if(ninst) {dyn->insts[ninst-1].x64.size = dyn->insts[ninst].x64.addr - dyn->insts[ninst-1].x64.addr;} \ + AREFLAGSNEEDED() + +#define INST_EPILOG \ + dyn->insts[ninst].f_exit = dyn->f; \ + dyn->insts[ninst].v = dyn->v; \ + dyn->insts[ninst].x64.has_next = (ok>0)?1:0; +#define INST_NAME(name) +#define DEFAULT \ + --dyn->size; \ + *ok = -1; \ + if (ninst) { dyn->insts[ninst - 1].x64.size = ip - dyn->insts[ninst - 1].x64.addr; } \ + if (BOX64ENV(dynarec_log) >= LOG_INFO || dyn->need_dump || BOX64ENV(dynarec_missing) == 1) \ + if (!dyn->size || BOX64ENV(dynarec_log) > LOG_INFO || dyn->need_dump) { \ + dynarec_stopped(dyn->insts[ninst].x64.addr, rex.is32bits); \ + } + +#define UFLAG_IF if(dyn->insts[ninst].x64.set_flags) +#define UFLAG_IF2(A) if(dyn->insts[ninst].x64.set_flags A) +#define IFX(A) if((dyn->insts[ninst].x64.set_flags&(A))) +#define IFX2(A, B) if((dyn->insts[ninst].x64.set_flags&(A)) B) +#define IFX_PENDOR0 if((dyn->insts[ninst].x64.set_flags&(X_PEND) || !dyn->insts[ninst].x64.set_flags)) +#define IFXX(A) if((dyn->insts[ninst].x64.set_flags==(A))) +#define IFX2X(A, B) if((dyn->insts[ninst].x64.set_flags==(A) || dyn->insts[ninst].x64.set_flags==(B) || dyn->insts[ninst].x64.set_flags==((A)|(B)))) +#define IFXN(A, B) if((dyn->insts[ninst].x64.set_flags&(A) && !(dyn->insts[ninst].x64.set_flags&(B)))) + +// mark opcode as "unaligned" possible only if the current address is not marked as already unaligned +#define IF_UNALIGNED(A) if((dyn->insts[ninst].unaligned=is_addr_unaligned(A))) +#define IF_ALIGNED(A) if(!(dyn->insts[ninst].unaligned=is_addr_unaligned(A))) + +#define NATIVE_RESTORE_X87PC() +#define X87_CHECK_PRECISION(A) \ + do { \ + if (dyn->need_x87check) dyn->insts[ninst].x87precision = 1; \ + } while (0) + +#define SCRATCH_USAGE(usage) \ + do { \ + dyn->insts[ninst].no_scratch_usage = !usage; \ + } while (0) diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_pass1.h b/src/dynarec/ppc64le/dynarec_ppc64le_pass1.h new file mode 100644 index 0000000000..2322694398 --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_pass1.h @@ -0,0 +1,31 @@ +#define INIT +#define FINI +#define MESSAGE(A, ...) 
do {} while (0) +#define EMIT(A) do {} while (0) + +#define SETFLAGS(A,B,FUSION) \ + dyn->insts[ninst].x64.set_flags = A; \ + dyn->insts[ninst].x64.state_flags = (B)&~SF_DF; \ + dyn->f=((B)&SF_SET)?(((B)==SF_SET_NODF)?dyn->f:status_none_pending):(((B)&SF_SET_PENDING)?status_set:status_none_pending); \ + if(!BOX64ENV(dynarec_df)) {dyn->f = status_none; } +#define NEW_INST \ + dyn->insts[ninst].f_entry = dyn->f; \ + dyn->v.combined1 = dyn->v.combined2 = 0;\ + dyn->v.swapped = 0; dyn->v.barrier = 0; \ + AREFLAGSNEEDED() + +#define INST_EPILOG \ + dyn->insts[ninst].v = dyn->v; \ + dyn->insts[ninst].f_exit = dyn->f + +#define INST_NAME(name) + +#define NATIVE_RESTORE_X87PC() +#define X87_CHECK_PRECISION(A) \ + do { \ + if (dyn->need_x87check) { \ + dyn->insts[ninst].x87precision = 1; \ + if (!ST_IS_F(0)) \ + dyn->need_x87check = 2; \ + } \ + } while (0) diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_pass2.h b/src/dynarec/ppc64le/dynarec_ppc64le_pass2.h new file mode 100644 index 0000000000..3fcbdaad08 --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_pass2.h @@ -0,0 +1,43 @@ +#define INIT dyn->native_size = 0 +#define FINI \ + if(ninst) { \ + dyn->insts[ninst].address = (dyn->insts[ninst-1].address+dyn->insts[ninst-1].size); \ + dyn->insts_size += 1+((dyn->insts[ninst-1].x64.size>(dyn->insts[ninst-1].size/4))?dyn->insts[ninst-1].x64.size:(dyn->insts[ninst-1].size/4))/15; \ + } + +#define MESSAGE(A, ...) do {} while (0) +#define EMIT(A) do{dyn->insts[ninst].size+=4; dyn->native_size+=4;}while(0) +#define ENDPREFIX dyn->prefixsize = dyn->native_size; dyn->insts[ninst].address = dyn->native_size; dyn->insts[ninst].size = 0 +#define NEW_INST \ + if(ninst) { \ + dyn->insts[ninst].address = (dyn->insts[ninst-1].address+dyn->insts[ninst-1].size); \ + dyn->insts_size += 1+((dyn->insts[ninst-1].x64.size>(dyn->insts[ninst-1].size/4))?dyn->insts[ninst-1].x64.size:(dyn->insts[ninst-1].size/4))/15; \ + dyn->insts[ninst].ymm0_pass2 = dyn->ymm_zero; \ + } \ + AREFLAGSNEEDED() +#define INST_EPILOG dyn->insts[ninst].epilog = dyn->native_size; +#define INST_NAME(name) +// TABLE64: PPC64LE uses 4 instructions to load from the constant pool: +// bcl 20,31,.+4 (get PC into LR) +// mflr Rd +// addis Rd,Rd,hi(offset-4) +// ld Rd,lo(offset-4)(Rd) +#define TABLE64(A, V) do { \ + if(dyn->need_reloc) \ + AddRelocTable64Addr(dyn, ninst, (V), 2); \ + Table64(dyn, (V), 2); EMIT(0); EMIT(0); EMIT(0); EMIT(0); \ + } while(0) +#define TABLE64_(A, V) do { \ + Table64(dyn, (V), 2); EMIT(0); EMIT(0); EMIT(0); EMIT(0); \ + } while(0) +#define TABLE64C(A, V) do { \ + if(dyn->need_reloc) \ + AddRelocTable64Const(dyn, ninst, (V), 2); \ + Table64(dyn, getConst(V), 2); EMIT(0); EMIT(0); EMIT(0); EMIT(0); \ + } while(0) +#define FTABLE64(A, V) do { mmx87_regs_t v = {.d = V}; Table64(dyn, v.q, 2); EMIT(0); EMIT(0); EMIT(0); EMIT(0); } while(0) +#define CALLRET_RET(A) do { \ + if((A) && BOX64DRENV(dynarec_callret)) { dyn->insts[ninst].size+=sizeof(void*); dyn->native_size+=sizeof(void*); ++dyn->sep_size; } \ + if(BOX64DRENV(dynarec_callret)>1) {dyn->callrets[dyn->callret_size].type = 0; dyn->callrets[dyn->callret_size++].offs = dyn->native_size; EMIT(ARCH_NOP);} \ + } while(0) +#define CALLRET_LOOP() do { dyn->callrets[dyn->callret_size].type = 1; dyn->callrets[dyn->callret_size++].offs = dyn->native_size; EMIT(ARCH_NOP); } while(0) diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_pass3.h b/src/dynarec/ppc64le/dynarec_ppc64le_pass3.h new file mode 100644 index 0000000000..cbe705442a --- /dev/null +++ 
b/src/dynarec/ppc64le/dynarec_ppc64le_pass3.h @@ -0,0 +1,98 @@ +#define INIT +#define FINI \ + if(ninst) \ + addInst(dyn->instsize, &dyn->insts_size, dyn->insts[ninst-1].x64.size, dyn->insts[ninst-1].size/4); \ + addInst(dyn->instsize, &dyn->insts_size, 0, 0); +#define EMIT(A) \ + do{ \ + if(dyn->need_dump) print_opcode(dyn, ninst, (uint32_t)(A)); \ + if((uintptr_t)dyn->block < dyn->tablestart) \ + *(uint32_t*)(dyn->block) = (uint32_t)(A); \ + dyn->block += 4; dyn->native_size += 4; \ + dyn->insts[ninst].size2 += 4; \ + }while(0) + +#define MESSAGE(A, ...) \ + do { \ + if (dyn->need_dump) dynarec_log(LOG_NONE, __VA_ARGS__); \ + } while (0) +#define ENDPREFIX dyn->insts[ninst].size2 = 0 +#define NEW_INST \ + if(ninst) { \ + if(dyn->insts[ninst].address!=(uintptr_t)dyn->block-(uintptr_t)dyn->native_start) dyn->abort = 1; \ + addInst(dyn->instsize, &dyn->insts_size, dyn->insts[ninst-1].x64.size, dyn->insts[ninst-1].size/4); \ + dyn->insts[ninst].ymm0_pass3 = dyn->ymm_zero; \ + } \ + AREFLAGSNEEDED() +#define INST_EPILOG +#define INST_NAME(name) inst_name_pass3(dyn, ninst, name, rex) +// TABLE64: PPC64LE loads from constant pool using 4 instructions: +// bcl 20,31,.+4 (get PC into LR, BO=20 "always", BI=31) +// mflr Rd (move LR to target register) +// addis Rd,Rd,ha(delta) (add upper 16 bits of adjusted offset) +// ld Rd,lo(delta)(Rd) (load 64-bit value with lower offset) +// where delta = table_entry_addr - (addr_of_bcl + 4) +// We use helper macros to split the offset: +// PPC64_HI16(x) = upper 16 bits with sign adjustment for lower half +// PPC64_LO16(x) = lower 16 bits (sign-extended by hardware) +#define PPC64_LO16(x) ((int16_t)((x) & 0xFFFF)) +#define PPC64_HI16(x) ((int16_t)((((x) >> 16) + (((x) & 0x8000) ? 1 : 0)) & 0xFFFF)) +#define TABLE64(A, V) do { \ + if(dyn->need_reloc) \ + AddRelocTable64Addr(dyn, ninst, (V), 3); \ + int val64offset = Table64(dyn, (V), 3); \ + int _delta = val64offset - 4; \ + MESSAGE(LOG_DUMP, " Table64: 0x%lx (offset %d)\n", (V), val64offset); \ + BCL(20, 31, 4); \ + MFLR(A); \ + ADDIS(A, A, PPC64_HI16(_delta)); \ + LD(A, PPC64_LO16(_delta), A); \ + } while(0) +#define TABLE64_(A, V) { \ + int val64offset = Table64(dyn, (V), 3); \ + int _delta = val64offset - 4; \ + MESSAGE(LOG_DUMP, " Table64: 0x%lx (offset %d)\n", (V), val64offset); \ + BCL(20, 31, 4); \ + MFLR(A); \ + ADDIS(A, A, PPC64_HI16(_delta)); \ + LD(A, PPC64_LO16(_delta), A); \ + } +#define TABLE64C(A, V) do { \ + if(dyn->need_reloc) \ + AddRelocTable64Const(dyn, ninst, (V), 3); \ + int val64offset = Table64(dyn, getConst(V), 3); \ + int _delta = val64offset - 4; \ + MESSAGE(LOG_DUMP, " Table64C: 0x%lx (offset %d)\n", (V), val64offset); \ + BCL(20, 31, 4); \ + MFLR(A); \ + ADDIS(A, A, PPC64_HI16(_delta)); \ + LD(A, PPC64_LO16(_delta), A); \ + } while(0) +#define FTABLE64(A, V) do { \ + mmx87_regs_t v = {.d = V}; \ + int val64offset = Table64(dyn, v.q, 3); \ + int _delta = val64offset - 4; \ + MESSAGE(LOG_DUMP, " FTable64: %g (offset %d)\n", v.d, val64offset); \ + BCL(20, 31, 4); \ + MFLR(x1); \ + ADDIS(x1, x1, PPC64_HI16(_delta)); \ + LFD(A, PPC64_LO16(_delta), x1); \ + } while(0) +#define CALLRET_RET(A) \ + do { \ + if((A) && BOX64DRENV(dynarec_callret)) { \ + MESSAGE(LOG_DUMP, " Dynablock*\n"); \ + dyn->block += sizeof(void*); \ + dyn->native_size+=sizeof(void*); \ + dyn->insts[ninst].size2 += sizeof(void*); \ + dyn->sep[dyn->sep_size].x64_offs = addr - dyn->start; \ + dyn->sep[dyn->sep_size].nat_offs = dyn->native_size; \ + ++dyn->sep_size; \ + } \ + if(BOX64DRENV(dynarec_callret)>1) { \ +
dyn->callrets[dyn->callret_size].type = 0; \ + dyn->callrets[dyn->callret_size++].offs = dyn->native_size; \ + EMIT(ARCH_NOP); \ + } \ + } while(0) +#define CALLRET_LOOP() do {dyn->callrets[dyn->callret_size].type = 1; dyn->callrets[dyn->callret_size++].offs = dyn->native_size; EMIT(ARCH_NOP); } while(0) diff --git a/src/dynarec/ppc64le/dynarec_ppc64le_private.h b/src/dynarec/ppc64le/dynarec_ppc64le_private.h new file mode 100644 index 0000000000..97c7a9c5a8 --- /dev/null +++ b/src/dynarec/ppc64le/dynarec_ppc64le_private.h @@ -0,0 +1,227 @@ +#ifndef __DYNAREC_PPC64LE_PRIVATE_H_ +#define __DYNAREC_PPC64LE_PRIVATE_H_ + +#include <stdint.h> + +#include "../dynarec_private.h" + +typedef struct x64emu_s x64emu_t; +typedef struct dynablock_s dynablock_t; +typedef struct instsize_s instsize_t; +typedef struct box64env_s box64env_t; + +#define BARRIER_MAYBE 8 + +// VMX cache types (mirrors ARM64's NEON cache / LA64's LSX cache) +// PPC64LE uses VMX (128-bit) registers vr0-vr31 (= vs32-vs63) for SIMD caching +#define VMX_CACHE_NONE 0 +#define VMX_CACHE_ST_D 1 +#define VMX_CACHE_ST_F 2 +#define VMX_CACHE_ST_I64 3 +#define VMX_CACHE_MM 4 +#define VMX_CACHE_XMMW 5 +#define VMX_CACHE_XMMR 6 +#define VMX_CACHE_YMMW 7 +#define VMX_CACHE_YMMR 8 +#define VMX_CACHE_SCR 9 + +typedef union vmx_cache_s { + int8_t v; + struct { + uint8_t t:4; // reg type + uint8_t n:4; // reg number + }; +} vmx_cache_t; + +typedef union sse_cache_s { + int8_t v; + struct { + uint8_t reg:7; + uint8_t write:1; + }; +} sse_cache_t; + +typedef union avx_cache_s { + int8_t v; + struct { + uint8_t reg : 5; + uint8_t width : 1; + uint8_t zero_upper : 1; + uint8_t write : 1; + }; +} avx_cache_t; + +typedef struct vmxcache_s { + // VMX cache + vmx_cache_t vmxcache[32]; + int8_t stack; + int8_t stack_next; + int8_t stack_pop; + int8_t stack_push; + uint8_t combined1; + uint8_t combined2; + uint8_t swapped; // the combined regs were swapped + uint8_t barrier; // is there a barrier at instruction epilog?
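+ // The two counters below are per-opcode deltas, kept as positive values so x87 stack overflow/underflow can be checked independently for each instruction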
+ uint8_t pushed; // positive pushed value (to check for overflow) + uint8_t poped; // positive poped value (to check for underflow) + uint32_t news; // bitmask, which vmxcache are new for this opcode + // fpu cache + int8_t x87cache[8]; // cache status for the 8 x87 registers behind the fpu stack + int8_t x87reg[8]; // reg used for x87cache entry + int16_t tags; // similar to fpu_tags + int8_t mmxcache[8]; // cache status for the 8 MMX registers + sse_cache_t ssecache[16]; // cache status for the 16 SSE(2) registers + avx_cache_t avxcache[16]; // cache status for the 16 AVX registers + int8_t fpuused[32]; // all vmx regs, used by x87, mmx, sse and avx + int8_t x87stack; // cache stack counter + int8_t mmxcount; // number of mmx registers used (not both mmx and x87 at the same time) + int8_t fpu_scratch; // scratch counter + uint16_t xmm_write; // 1bit of xmmXX removed write + uint16_t xmm_removed; // 1bit if xmmXX was removed + uint16_t xmm_used; // mask of the xmm regs used in this opcode + uint16_t ymm_used; // mask of the ymm regs used in this opcode + uint16_t ymm_write; // 1bit of ymmXX removed write + uint16_t ymm_removed; // 1bit if ymmXX was removed + uint16_t xmm_unneeded; // 1bit for xmmXX whose value is not needed + uint16_t ymm_unneeded; // 1bit for ymmXX whose value is not needed + uint64_t ymm_regs; // 4bits (0-15) position of 16 ymmXX regs removed +} vmxcache_t; + +typedef enum flagcache_s { + status_unk = 0, // unknown deferred flags status + status_set, // deferred flags set to something (not 0) + status_none_pending, // deferred flags set to 0, but still pending the write to x64emu_t + status_none, // deferred flags set to 0, written to x64emu_t +} flagcache_t; + +typedef struct callret_s callret_t; +typedef struct sep_s sep_t; + +typedef struct instruction_ppc64le_s { + instruction_x64_t x64; + uintptr_t address; // (start) address of the ppc64le emitted instruction + uintptr_t epilog; // epilog of current instruction (can be start of next, or barrier stuff) + int size; // size of the ppc64le emitted instruction + int size2; // size of the ppc64le emitted instruction after pass2 + int pred_sz; // size of predecessor list + int *pred; // predecessor array + uintptr_t mark[3]; + uintptr_t markf[2]; + uintptr_t markseg; + uintptr_t marklock; + uintptr_t marklock2; + int pass2choice;// value for choices that are fixed on pass2 for pass3 + uintptr_t natcall; + uint16_t retn; + uint16_t purge_ymm; // need to purge some ymm + uint16_t ymm0_in; // bitmap of ymm to zero at purge + uint16_t ymm0_add; // the ymm0 added by the opcode + uint16_t ymm0_sub; // the ymm0 removed by the opcode + uint16_t ymm0_out; // the ymm0 at the end of the opcode + uint16_t ymm0_pass2, ymm0_pass3; + uint8_t barrier_maybe; + uint8_t will_write:2; // [strongmem] will write to memory + uint8_t will_read:1; // [strongmem] will read from memory + uint8_t last_write:1; // [strongmem] the last write in a SEQ + uint8_t lock:1; // [strongmem] lock semantic + uint8_t df_notneeded:1; + uint8_t nat_flags_fusion:1; + uint8_t nat_flags_nofusion:1; + uint8_t nat_flags_carry:1; + uint8_t nat_flags_sign:1; + uint8_t nat_flags_sf:1; + uint8_t nat_flags_needsign:1; + uint8_t no_scratch_usage:1; // this opcode does not use scratch register + uint8_t nat_flags_op1; + uint8_t nat_flags_op2; + uint8_t unaligned:1; // this opcode can be re-generated for unaligned special case + uint8_t x87precision:1; // this opcode can handle x87pc + unsigned mmx_used:1; // no fine tracking, just a global "any reg used" + unsigned x87_used:1;
// no fine tracking, just a global "any reg used" + unsigned fpu_used:1; // any xmm/ymm/x87/mmx reg used + unsigned fpupurge:1; // this opcode will purge all fpu regs + uint16_t nat_next_inst; + flagcache_t f_exit; // flags status at end of instruction + vmxcache_t v; // vmxcache at end of instruction (but before poping) + flagcache_t f_entry; // flags status before the instruction begin + int cacheupd; +} instruction_ppc64le_t; + +typedef struct dynarec_ppc64le_s { + instruction_ppc64le_t* insts; + int32_t size; + int32_t cap; + uintptr_t start; // start of the block + uintptr_t end; // maximum end of the block (only used in pass0) + uint32_t isize; // size in bytes of x64 instructions included + uint32_t prefixsize; // size in byte of the prefix of the block + void* block; // memory pointer where next instruction is emitted + uintptr_t native_start; // start of the ppc64le code + size_t native_size; // size of emitted ppc64le code + uintptr_t last_ip; // last set IP in RIP (or NULL if unclean state) + uint64_t* table64; // table of 64bits values + int table64size;// size of table (will be appended at end of executable code) + int table64cap; + uintptr_t tablestart; + uintptr_t jmp_next; // address of the jump_next address + flagcache_t f; + vmxcache_t v; // cache for VMX regs from fpu, plus x87 stack delta + uintptr_t* next; // variable array of "next" jump address + int next_sz; + int next_cap; + int* jmps; // variable array of jump instructions + int jmp_sz; + int jmp_cap; + int* predecessor;// single array of all predecessor + dynablock_t* dynablock; + instsize_t* instsize; + size_t insts_size; // size of the instruction size array (calculated) + int callret_size; // size of the array + int sep_size; // size of the array + callret_t* callrets; // array of callret return, with NOP / UDF depending if the block is clean or dirty + sep_t* sep; // array of secondary entry point + uintptr_t forward; // address of the last end of code while testing forward + uintptr_t forward_to; // address of the next jump to (to check if everything is ok) + int32_t forward_size; // size at the forward point + int forward_ninst; // ninst at the forward point + uint16_t ymm_zero; // bitmap of ymm to zero at purge + uint8_t smwrite; // for strongmem model emulation + uint8_t doublepush; + uint8_t doublepop; + uint8_t always_test; + uint8_t abort; // abort the creation of the block + uint8_t use_x87:1; // set if x87 regs are used + uint8_t use_mmx:1; + uint8_t use_xmm:1; + uint8_t use_ymm:1; + uint8_t have_purge:1; // set to 1 if block can be purged + void* gdbjit_block; + uint32_t need_x87check; // needs x87 precision control check if non-null, or 0 if not + uint32_t need_dump; // need to dump the block + int need_reloc; // does the dynablock need relocations + int reloc_size; + uint32_t* relocs; + box64env_t* env; +} dynarec_ppc64le_t; + +void add_next(dynarec_ppc64le_t *dyn, uintptr_t addr); +uintptr_t get_closest_next(dynarec_ppc64le_t *dyn, uintptr_t addr); +void add_jump(dynarec_ppc64le_t *dyn, int ninst); +int get_first_jump(dynarec_ppc64le_t *dyn, int next); +int get_first_jump_addr(dynarec_ppc64le_t *dyn, uintptr_t next); +int is_nops(dynarec_ppc64le_t *dyn, uintptr_t addr, int n); + +int isTable64(dynarec_ppc64le_t *dyn, uint64_t val); // return 1 if val already in Table64 +int Table64(dynarec_ppc64le_t *dyn, uint64_t val, int pass); // add a value to table64 (if needed) and gives back the offset to use + +void CreateJmpNext(void* addr, void* next); + +#define GO_TRACE(A, B, s0) \ + GETIP(addr, s0); 
\ + MV(x1, xRIP); \ + STORE_XEMU_CALL(); \ + MOV64x(x2, B); \ + CALL(const_##A, -1, x1, x2); \ + LOAD_XEMU_CALL() + +#endif //__DYNAREC_PPC64LE_PRIVATE_H_ diff --git a/src/dynarec/ppc64le/ppc64le_emitter.h b/src/dynarec/ppc64le/ppc64le_emitter.h new file mode 100644 index 0000000000..fbbfee7ad1 --- /dev/null +++ b/src/dynarec/ppc64le/ppc64le_emitter.h @@ -0,0 +1,1704 @@ +#ifndef __PPC64LE_EMITTER_H__ +#define __PPC64LE_EMITTER_H__ +/* + PPC64LE Emitter + + PowerPC64 Little-Endian instruction encoding macros. + Target ISA: POWER9 (ISA 3.0) minimum. + + Instruction formats: + - I-form: OPCD | LI(24) | AA | LK (unconditional branch) + - B-form: OPCD | BO(5) | BI(5) | BD(14) | AA | LK (conditional branch) + - D-form: OPCD | RT(5) | RA(5) | D/SI(16) (load/store, addi, etc.) + - DS-form: OPCD | RT(5) | RA(5) | DS(14) | XO(2) (ld, std, lwa) + - X-form: OPCD | RT(5) | RA(5) | RB(5) | XO(10) | Rc (indexed load/store, ALU) + - XO-form: OPCD | RT(5) | RA(5) | RB(5) | OE | XO(9) | Rc (arithmetic) + - M-form: OPCD | RS(5) | RA(5) | RB/SH(5) | MB(5) | ME(5) | Rc (rotate 32) + - MD-form: OPCD | RS(5) | RA(5) | sh(5) | mb/me(6) | XO(3) | sh(1) | Rc (rotate 64) + - XFX-form: OPCD | RT(5) | spr(10) | XO(10) (mtspr, mfspr) + - VX-form: OPCD(6) | VRT(5) | VRA(5) | VRB(5) | XO(11) (VMX) + + Note: PPC is big-endian at the instruction level — bit 0 is MSB. + All encoding macros produce a uint32_t in native (little-endian) byte order + since the assembler/CPU handles instruction endianness transparently in LE mode. +*/ + +#include "ppc64le_mapping.h" + +// =========================================================================== +// Helper to load a 64-bit constant (up to 5 instructions) +// =========================================================================== +#define MOV64x(Rd, imm64) ppc64le_move64(dyn, ninst, Rd, (uint64_t)(imm64)) +#define MOV32w(Rd, imm32) ppc64le_move32(dyn, ninst, Rd, (uint32_t)(imm32), 1) +#define MOV64xw(Rd, imm64) if(rex.w) {MOV64x(Rd, imm64);} else {MOV32w(Rd, (uint32_t)(imm64));} +#define MOV64z(Rd, imm64) if(rex.is32bits) {MOV32w(Rd, (uint32_t)(imm64));} else {MOV64x(Rd, imm64);} + +// TABLE64 is defined per-pass in pass0.h/pass1.h/pass2.h/pass3.h and helper.h fallback +// Do NOT define it here — each pass has a different implementation + +// =========================================================================== +// Instruction Encoding Macros — I-form (unconditional branch) +// =========================================================================== +// I-form: OPCD(6) | LI(24) | AA(1) | LK(1) +// LI is a signed 24-bit value, shifted left 2 to give ±32MB range +#define I_gen(opcd, li, aa, lk) \ + ((uint32_t)(opcd) << 26 | ((li) & 0x00FFFFFF) << 2 | ((aa) & 1) << 1 | ((lk) & 1)) + +// B — branch (relative) +#define B(offset) EMIT(I_gen(18, ((offset) >> 2) & 0x00FFFFFF, 0, 0)) +// BA — branch absolute +#define BA(addr) EMIT(I_gen(18, ((addr) >> 2) & 0x00FFFFFF, 1, 0)) +// BL — branch and link (relative, for calls) +#define BL(offset) EMIT(I_gen(18, ((offset) >> 2) & 0x00FFFFFF, 0, 1)) +// BLA — branch and link absolute +#define BLA(addr) EMIT(I_gen(18, ((addr) >> 2) & 0x00FFFFFF, 1, 1)) + +// =========================================================================== +// B-form (conditional branch) +// =========================================================================== +// B-form: OPCD(6) | BO(5) | BI(5) | BD(14) | AA(1) | LK(1) +// BD is a signed 14-bit value, shifted left 2 to give ±32KB range +#define B_form_gen(opcd, bo, bi, bd, aa, lk) \ + 
((uint32_t)(opcd) << 26 | ((bo) & 0x1F) << 21 | ((bi) & 0x1F) << 16 | (((bd) >> 2) & 0x3FFF) << 2 | ((aa) & 1) << 1 | ((lk) & 1)) + +// BC — branch conditional (relative) +#define BC(bo, bi, offset) EMIT(B_form_gen(16, bo, bi, offset, 0, 0)) +// BCL — branch conditional and link +#define BCL(bo, bi, offset) EMIT(B_form_gen(16, bo, bi, offset, 0, 1)) + +// Simplified conditional branches using CR0 +// BEQ: branch if CR0[EQ] is set +#define BEQ(offset) BC(BO_TRUE, BI(CR0, CR_EQ), offset) +// BNE: branch if CR0[EQ] is clear +#define BNE(offset) BC(BO_FALSE, BI(CR0, CR_EQ), offset) +// BLT: branch if CR0[LT] is set +#define BLT(offset) BC(BO_TRUE, BI(CR0, CR_LT), offset) +// BGE: branch if CR0[LT] is clear +#define BGE(offset) BC(BO_FALSE, BI(CR0, CR_LT), offset) +// BGT: branch if CR0[GT] is set +#define BGT(offset) BC(BO_TRUE, BI(CR0, CR_GT), offset) +// BLE: branch if CR0[GT] is clear +#define BLE(offset) BC(BO_FALSE, BI(CR0, CR_GT), offset) + +// Conditional branches using arbitrary CR field +#define BEQ_CR(cr, offset) BC(BO_TRUE, BI(cr, CR_EQ), offset) +#define BNE_CR(cr, offset) BC(BO_FALSE, BI(cr, CR_EQ), offset) +#define BLT_CR(cr, offset) BC(BO_TRUE, BI(cr, CR_LT), offset) +#define BGE_CR(cr, offset) BC(BO_FALSE, BI(cr, CR_LT), offset) +#define BGT_CR(cr, offset) BC(BO_TRUE, BI(cr, CR_GT), offset) +#define BLE_CR(cr, offset) BC(BO_FALSE, BI(cr, CR_GT), offset) + +// =========================================================================== +// XL-form (branch to LR/CTR) +// =========================================================================== +// XL-form: OPCD(6) | BO/BT(5) | BI/BA(5) | BB(5) | XO(10) | LK(1) +// For the LR/CTR branches the 5-bit BB slot holds 000(3) | BH(2); the CR +// logical ops (CRAND etc., further down) reuse XL_gen with a full 5-bit BB, +// so the third field must be masked to 5 bits, not 2. +#define XL_gen(opcd, bo, bi, bh, xo, lk) \ + ((uint32_t)(opcd) << 26 | ((bo) & 0x1F) << 21 | ((bi) & 0x1F) << 16 | ((bh) & 0x1F) << 11 | ((xo) & 0x3FF) << 1 | ((lk) & 1)) + +// BCLR — branch conditional to LR +#define BCLR(bo, bi, bh) EMIT(XL_gen(19, bo, bi, bh, 16, 0)) +// BCLRL — branch conditional to LR and link +#define BCLRL(bo, bi, bh) EMIT(XL_gen(19, bo, bi, bh, 16, 1)) +// BLR — branch to LR (unconditional) +#define BLR() BCLR(BO_ALWAYS, 0, 0) +// BLRL — branch to LR and link +#define BLRL() BCLRL(BO_ALWAYS, 0, 0) + +// BCCTR — branch conditional to CTR +#define BCCTR(bo, bi, bh) EMIT(XL_gen(19, bo, bi, bh, 528, 0)) +// BCCTRL — branch conditional to CTR and link +#define BCCTRL(bo, bi, bh) EMIT(XL_gen(19, bo, bi, bh, 528, 1)) +// BCTR — branch to CTR (unconditional) +#define BCTR() BCCTR(BO_ALWAYS, 0, 0) +// BCTRL — branch to CTR and link +#define BCTRL() BCCTRL(BO_ALWAYS, 0, 0) + +// Conditional branches to LR +#define BEQLR() BCLR(BO_TRUE, BI(CR0, CR_EQ), 0) +#define BNELR() BCLR(BO_FALSE, BI(CR0, CR_EQ), 0) +#define BLTLR() BCLR(BO_TRUE, BI(CR0, CR_LT), 0) +#define BGELR() BCLR(BO_FALSE, BI(CR0, CR_LT), 0) +#define BGTLR() BCLR(BO_TRUE, BI(CR0, CR_GT), 0) +#define BLELR() BCLR(BO_FALSE, BI(CR0, CR_GT), 0) + +// =========================================================================== +// D-form (load/store with 16-bit displacement, addi, etc.)
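+// Encoding sanity check for this family: D_form_gen(14, 3, 1, (-8) & 0xFFFF)
+// evaluates to 0x3861FFF8, which should match what gas emits for
+// "addi r3,r1,-8" (ADDI is defined just below).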
+// =========================================================================== +// D-form: OPCD(6) | RT/RS(5) | RA(5) | D/SI(16) +#define D_form_gen(opcd, rt, ra, d) \ + ((uint32_t)(opcd) << 26 | ((rt) & 0x1F) << 21 | ((ra) & 0x1F) << 16 | ((d) & 0xFFFF)) + +// ADDI — RT = RA + SI (if RA=0, RT = SI) +#define ADDI(Rt, Ra, si) EMIT(D_form_gen(14, Rt, Ra, (si) & 0xFFFF)) +// LI — load immediate (pseudo: addi Rt, 0, si) +#define LI(Rt, si) ADDI(Rt, 0, si) +// ADDIS — RT = RA + (SI << 16) +#define ADDIS(Rt, Ra, si) EMIT(D_form_gen(15, Rt, Ra, (si) & 0xFFFF)) +// LIS — load immediate shifted (pseudo: addis Rt, 0, si) +#define LIS(Rt, si) ADDIS(Rt, 0, si) + +// Arithmetic immediate +// ADDIC — RT = RA + SI, sets CA (XER carry) +#define ADDIC(Rt, Ra, si) EMIT(D_form_gen(12, Rt, Ra, (si) & 0xFFFF)) +// ADDIC. — RT = RA + SI, sets CA and CR0 +#define ADDICd(Rt, Ra, si) EMIT(D_form_gen(13, Rt, Ra, (si) & 0xFFFF)) +// SUBFIC — RT = SI - RA (note: operand order!) +#define SUBFIC(Rt, Ra, si) EMIT(D_form_gen(8, Rt, Ra, (si) & 0xFFFF)) + +// Compare immediate +// CMPI — compare RA with SI (signed), result in CR field +// D-form with BF(3) | 0 | L(1) in RT field +#define CMPI_gen(bf, l, ra, si) \ + D_form_gen(11, ((bf) & 0x7) << 2 | ((l) & 1), ra, (si) & 0xFFFF) +#define CMPWI(Ra, si) EMIT(CMPI_gen(0, 0, Ra, si)) +#define CMPWI_CR(cr, Ra, si) EMIT(CMPI_gen(cr, 0, Ra, si)) +#define CMPDI(Ra, si) EMIT(CMPI_gen(0, 1, Ra, si)) +#define CMPDI_CR(cr, Ra, si) EMIT(CMPI_gen(cr, 1, Ra, si)) + +// CMPLI — compare RA with UI (unsigned), result in CR field +#define CMPLI_gen(bf, l, ra, ui) \ + D_form_gen(10, ((bf) & 0x7) << 2 | ((l) & 1), ra, (ui) & 0xFFFF) +#define CMPLWI(Ra, ui) EMIT(CMPLI_gen(0, 0, Ra, ui)) +#define CMPLWI_CR(cr, Ra, ui) EMIT(CMPLI_gen(cr, 0, Ra, ui)) +#define CMPLDI(Ra, ui) EMIT(CMPLI_gen(0, 1, Ra, ui)) +#define CMPLDI_CR(cr, Ra, ui) EMIT(CMPLI_gen(cr, 1, Ra, ui)) + +// Logical immediate +// ORI — RA = RS | UI +#define ORI(Ra, Rs, ui) EMIT(D_form_gen(24, Rs, Ra, (ui) & 0xFFFF)) +// ORIS — RA = RS | (UI << 16) +#define ORIS(Ra, Rs, ui) EMIT(D_form_gen(25, Rs, Ra, (ui) & 0xFFFF)) +// NOP — ori 0,0,0 +#define NOP() ORI(0, 0, 0) +// XORI — RA = RS ^ UI +#define XORI(Ra, Rs, ui) EMIT(D_form_gen(26, Rs, Ra, (ui) & 0xFFFF)) +// XORIS — RA = RS ^ (UI << 16) +#define XORIS(Ra, Rs, ui) EMIT(D_form_gen(27, Rs, Ra, (ui) & 0xFFFF)) +// ANDI. — RA = RS & UI, sets CR0 +#define ANDId(Ra, Rs, ui) EMIT(D_form_gen(28, Rs, Ra, (ui) & 0xFFFF)) +// ANDI alias (LA64 compat) — note: PPC64LE andi. always sets CR0 +#define ANDI(Ra, Rs, ui) ANDId(Ra, Rs, ui) +// ANDIS. 
— RA = RS & (UI << 16), sets CR0 +#define ANDISd(Ra, Rs, ui) EMIT(D_form_gen(29, Rs, Ra, (ui) & 0xFFFF)) + +// Load byte/halfword/word (D-form) +// LBZ — load byte and zero +#define LBZ(Rt, offset, Ra) EMIT(D_form_gen(34, Rt, Ra, (offset) & 0xFFFF)) +// LBZU — load byte and zero with update +#define LBZU(Rt, offset, Ra) EMIT(D_form_gen(35, Rt, Ra, (offset) & 0xFFFF)) +// LHZ — load halfword and zero +#define LHZ(Rt, offset, Ra) EMIT(D_form_gen(40, Rt, Ra, (offset) & 0xFFFF)) +// LHA — load halfword algebraic (sign-extend) +#define LHA(Rt, offset, Ra) EMIT(D_form_gen(42, Rt, Ra, (offset) & 0xFFFF)) +// LWZ — load word and zero +#define LWZ(Rt, offset, Ra) EMIT(D_form_gen(32, Rt, Ra, (offset) & 0xFFFF)) + +// Store byte/halfword/word (D-form) +// STB — store byte +#define STB(Rs, offset, Ra) EMIT(D_form_gen(38, Rs, Ra, (offset) & 0xFFFF)) +// STBU — store byte with update +#define STBU(Rs, offset, Ra) EMIT(D_form_gen(39, Rs, Ra, (offset) & 0xFFFF)) +// STH — store halfword +#define STH(Rs, offset, Ra) EMIT(D_form_gen(44, Rs, Ra, (offset) & 0xFFFF)) +// STW — store word +#define STW(Rs, offset, Ra) EMIT(D_form_gen(36, Rs, Ra, (offset) & 0xFFFF)) +// STWU — store word with update +#define STWU(Rs, offset, Ra) EMIT(D_form_gen(37, Rs, Ra, (offset) & 0xFFFF)) + +// Load/store word algebraic (D-form, opcode 58/62) +// These use DS-form (14-bit displacement, must be multiple of 4) + +// =========================================================================== +// DS-form (load/store doubleword, load word algebraic) +// =========================================================================== +// DS-form: OPCD(6) | RT/RS(5) | RA(5) | DS(14) | XO(2) +#define DS_form_gen(opcd, rt, ra, ds, xo) \ + ((uint32_t)(opcd) << 26 | ((rt) & 0x1F) << 21 | ((ra) & 0x1F) << 16 | ((ds) & 0xFFFC) | ((xo) & 0x3)) + +// LD — load doubleword +#define LD(Rt, offset, Ra) EMIT(DS_form_gen(58, Rt, Ra, (offset) & 0xFFFC, 0)) +// LDU — load doubleword with update +#define LDU(Rt, offset, Ra) EMIT(DS_form_gen(58, Rt, Ra, (offset) & 0xFFFC, 1)) +// LWA — load word algebraic (sign-extend word to doubleword) +#define LWA(Rt, offset, Ra) EMIT(DS_form_gen(58, Rt, Ra, (offset) & 0xFFFC, 2)) + +// STD — store doubleword +#define STD(Rs, offset, Ra) EMIT(DS_form_gen(62, Rs, Ra, (offset) & 0xFFFC, 0)) +// STDU — store doubleword with update +#define STDU(Rs, offset, Ra) EMIT(DS_form_gen(62, Rs, Ra, (offset) & 0xFFFC, 1)) + +// =========================================================================== +// X-form (indexed load/store, register ALU, compare) +// =========================================================================== +// X-form: OPCD(6) | RT/RS/BF(5) | RA(5) | RB(5) | XO(10) | Rc(1) +#define X_form_gen(opcd, rt, ra, rb, xo, rc) \ + ((uint32_t)(opcd) << 26 | ((rt) & 0x1F) << 21 | ((ra) & 0x1F) << 16 | ((rb) & 0x1F) << 11 | ((xo) & 0x3FF) << 1 | ((rc) & 1)) + +// Compare register +#define CMP_gen(bf, l, ra, rb) X_form_gen(31, ((bf) & 0x7) << 2 | ((l) & 1), ra, rb, 0, 0) +#define CMPW(Ra, Rb) EMIT(CMP_gen(0, 0, Ra, Rb)) +#define CMPW_CR(cr, Ra, Rb) EMIT(CMP_gen(cr, 0, Ra, Rb)) +#define CMPD(Ra, Rb) EMIT(CMP_gen(0, 1, Ra, Rb)) +#define CMPD_CR(cr, Ra, Rb) EMIT(CMP_gen(cr, 1, Ra, Rb)) + +#define CMPL_gen(bf, l, ra, rb) X_form_gen(31, ((bf) & 0x7) << 2 | ((l) & 1), ra, rb, 32, 0) +#define CMPLW(Ra, Rb) EMIT(CMPL_gen(0, 0, Ra, Rb)) +#define CMPLW_CR(cr, Ra, Rb) EMIT(CMPL_gen(cr, 0, Ra, Rb)) +#define CMPLD(Ra, Rb) EMIT(CMPL_gen(0, 1, Ra, Rb)) +#define CMPLD_CR(cr, Ra, Rb) EMIT(CMPL_gen(cr, 1, Ra, Rb)) + +// Indexed 
load/store +// LBZX — load byte and zero indexed +#define LBZX(Rt, Ra, Rb) EMIT(X_form_gen(31, Rt, Ra, Rb, 87, 0)) +// LHZX — load halfword and zero indexed +#define LHZX(Rt, Ra, Rb) EMIT(X_form_gen(31, Rt, Ra, Rb, 279, 0)) +// LHAX — load halfword algebraic indexed +#define LHAX(Rt, Ra, Rb) EMIT(X_form_gen(31, Rt, Ra, Rb, 343, 0)) +// LWZX — load word and zero indexed +#define LWZX(Rt, Ra, Rb) EMIT(X_form_gen(31, Rt, Ra, Rb, 23, 0)) +// LWAX — load word algebraic indexed +#define LWAX(Rt, Ra, Rb) EMIT(X_form_gen(31, Rt, Ra, Rb, 341, 0)) +// LDX — load doubleword indexed +#define LDX(Rt, Ra, Rb) EMIT(X_form_gen(31, Rt, Ra, Rb, 21, 0)) + +// STBX — store byte indexed +#define STBX(Rs, Ra, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 215, 0)) +// STHX — store halfword indexed +#define STHX(Rs, Ra, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 407, 0)) +// STWX — store word indexed +#define STWX(Rs, Ra, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 151, 0)) +// STDX — store doubleword indexed +#define STDX(Rs, Ra, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 149, 0)) + +// Byte-reverse load/store +// LHBRX — load halfword byte-reverse indexed +#define LHBRX(Rt, Ra, Rb) EMIT(X_form_gen(31, Rt, Ra, Rb, 790, 0)) +// LWBRX — load word byte-reverse indexed +#define LWBRX(Rt, Ra, Rb) EMIT(X_form_gen(31, Rt, Ra, Rb, 534, 0)) +// LDBRX — load doubleword byte-reverse indexed +#define LDBRX(Rt, Ra, Rb) EMIT(X_form_gen(31, Rt, Ra, Rb, 532, 0)) +// STHBRX — store halfword byte-reverse indexed +#define STHBRX(Rs, Ra, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 918, 0)) +// STWBRX — store word byte-reverse indexed +#define STWBRX(Rs, Ra, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 662, 0)) +// STDBRX — store doubleword byte-reverse indexed +#define STDBRX(Rs, Ra, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 660, 0)) + +// Extend / count / population +// EXTSB — extend sign byte (Rc=0) +#define EXTSB(Ra, Rs) EMIT(X_form_gen(31, Rs, Ra, 0, 954, 0)) +// EXTSBd — extend sign byte (Rc=1, sets CR0) +#define EXTSBd(Ra, Rs) EMIT(X_form_gen(31, Rs, Ra, 0, 954, 1)) +// EXTSH — extend sign halfword +#define EXTSH(Ra, Rs) EMIT(X_form_gen(31, Rs, Ra, 0, 922, 0)) +// EXTSHd — extend sign halfword (Rc=1) +#define EXTSHd(Ra, Rs) EMIT(X_form_gen(31, Rs, Ra, 0, 922, 1)) +// EXTSW — extend sign word +#define EXTSW(Ra, Rs) EMIT(X_form_gen(31, Rs, Ra, 0, 986, 0)) +// EXTSWd — extend sign word (Rc=1) +#define EXTSWd(Ra, Rs) EMIT(X_form_gen(31, Rs, Ra, 0, 986, 1)) + +// CNTLZW — count leading zeros word +#define CNTLZW(Ra, Rs) EMIT(X_form_gen(31, Rs, Ra, 0, 26, 0)) +// CNTLZD — count leading zeros doubleword +#define CNTLZD(Ra, Rs) EMIT(X_form_gen(31, Rs, Ra, 0, 58, 0)) +// CNTTZW — count trailing zeros word (POWER9) +#define CNTTZW(Ra, Rs) EMIT(X_form_gen(31, Rs, Ra, 0, 538, 0)) +// CNTTZD — count trailing zeros doubleword (POWER9) +#define CNTTZD(Ra, Rs) EMIT(X_form_gen(31, Rs, Ra, 0, 570, 0)) +// POPCNTW — population count word +#define POPCNTW(Ra, Rs) EMIT(X_form_gen(31, Rs, Ra, 0, 378, 0)) +// POPCNTD — population count doubleword +#define POPCNTD(Ra, Rs) EMIT(X_form_gen(31, Rs, Ra, 0, 506, 0)) +// POPCNTB — population count bytes +#define POPCNTB(Ra, Rs) EMIT(X_form_gen(31, Rs, Ra, 0, 122, 0)) + +// Logical register (X-form, opcode 31) +// AND +#define AND(Ra, Rs, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 28, 0)) +#define ANDd(Ra, Rs, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 28, 1)) +// ANDC — and with complement +#define ANDC(Ra, Rs, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 60, 0)) +// OR +#define OR(Ra, Rs, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 444, 0)) +#define ORd(Ra, Rs, Rb) EMIT(X_form_gen(31, 
Rs, Ra, Rb, 444, 1)) +// MR — move register (pseudo: or Ra, Rs, Rs) +#define MR(Ra, Rs) OR(Ra, Rs, Rs) +// MV — alias for MR +#define MV(Rd, Rs) MR(Rd, Rs) +// ORC — or with complement +#define ORC(Ra, Rs, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 412, 0)) +// XOR +#define XOR(Ra, Rs, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 316, 0)) +#define XORd(Ra, Rs, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 316, 1)) +// NAND +#define NAND(Ra, Rs, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 476, 0)) +// NOR +#define NOR(Ra, Rs, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 124, 0)) +// NOT — pseudo: nor Ra, Rs, Rs +#define NOT(Ra, Rs) NOR(Ra, Rs, Rs) +// EQV — equivalence (xnor) +#define EQV(Ra, Rs, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 284, 0)) + +// =========================================================================== +// XO-form (arithmetic register) +// =========================================================================== +// XO-form: OPCD(6) | RT(5) | RA(5) | RB(5) | OE(1) | XO(9) | Rc(1) +#define XO_form_gen(opcd, rt, ra, rb, oe, xo, rc) \ + ((uint32_t)(opcd) << 26 | ((rt) & 0x1F) << 21 | ((ra) & 0x1F) << 16 | ((rb) & 0x1F) << 11 | ((oe) & 1) << 10 | ((xo) & 0x1FF) << 1 | ((rc) & 1)) + +// ADD — RT = RA + RB +#define ADD(Rt, Ra, Rb) EMIT(XO_form_gen(31, Rt, Ra, Rb, 0, 266, 0)) +#define ADDd(Rt, Ra, Rb) EMIT(XO_form_gen(31, Rt, Ra, Rb, 0, 266, 1)) +#define ADDo(Rt, Ra, Rb) EMIT(XO_form_gen(31, Rt, Ra, Rb, 1, 266, 0)) + +// ADDC — RT = RA + RB, set CA +#define ADDC(Rt, Ra, Rb) EMIT(XO_form_gen(31, Rt, Ra, Rb, 0, 10, 0)) +#define ADDCd(Rt, Ra, Rb) EMIT(XO_form_gen(31, Rt, Ra, Rb, 0, 10, 1)) + +// ADDE — RT = RA + RB + CA +#define ADDE(Rt, Ra, Rb) EMIT(XO_form_gen(31, Rt, Ra, Rb, 0, 138, 0)) +#define ADDEd(Rt, Ra, Rb) EMIT(XO_form_gen(31, Rt, Ra, Rb, 0, 138, 1)) + +// ADDZE — RT = RA + CA +#define ADDZE(Rt, Ra) EMIT(XO_form_gen(31, Rt, Ra, 0, 0, 202, 0)) + +// SUBF — RT = RB - RA (subtract from: note operand order!) 
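+// e.g., SUBF(r3, r4, r5) assembles subf r3,r4,r5, i.e. r3 = r5 - r4; the SUB
+// pseudo just below swaps the operands to recover the usual Rt = Ra - Rb.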
+#define SUBF(Rt, Ra, Rb) EMIT(XO_form_gen(31, Rt, Ra, Rb, 0, 40, 0)) +#define SUBFd(Rt, Ra, Rb) EMIT(XO_form_gen(31, Rt, Ra, Rb, 0, 40, 1)) +// SUB — pseudo: RT = Ra - Rb = subf Rt, Rb, Ra +#define SUB(Rt, Ra, Rb) SUBF(Rt, Rb, Ra) +#define SUBd(Rt, Ra, Rb) SUBFd(Rt, Rb, Ra) + +// SUBFC — RT = RB - RA, set CA +#define SUBFC(Rt, Ra, Rb) EMIT(XO_form_gen(31, Rt, Ra, Rb, 0, 8, 0)) +#define SUBFCd(Rt, Ra, Rb) EMIT(XO_form_gen(31, Rt, Ra, Rb, 0, 8, 1)) + +// SUBFE — RT = ~RA + RB + CA +#define SUBFE(Rt, Ra, Rb) EMIT(XO_form_gen(31, Rt, Ra, Rb, 0, 136, 0)) +#define SUBFEd(Rt, Ra, Rb) EMIT(XO_form_gen(31, Rt, Ra, Rb, 0, 136, 1)) + +// SUBFZE — RT = ~RA + CA +#define SUBFZE(Rt, Ra) EMIT(XO_form_gen(31, Rt, Ra, 0, 0, 200, 0)) + +// NEG — RT = -RA +#define NEG(Rt, Ra) EMIT(XO_form_gen(31, Rt, Ra, 0, 0, 104, 0)) +#define NEGd(Rt, Ra) EMIT(XO_form_gen(31, Rt, Ra, 0, 0, 104, 1)) + +// MULLW — multiply low word +#define MULLW(Rt, Ra, Rb) EMIT(XO_form_gen(31, Rt, Ra, Rb, 0, 235, 0)) +#define MULLWd(Rt, Ra, Rb) EMIT(XO_form_gen(31, Rt, Ra, Rb, 0, 235, 1)) +// MULHW — multiply high word (signed) +#define MULHW(Rt, Ra, Rb) EMIT(XO_form_gen(31, Rt, Ra, Rb, 0, 75, 0)) +// MULHWU — multiply high word (unsigned) +#define MULHWU(Rt, Ra, Rb) EMIT(XO_form_gen(31, Rt, Ra, Rb, 0, 11, 0)) +// MULLD — multiply low doubleword +#define MULLD(Rt, Ra, Rb) EMIT(XO_form_gen(31, Rt, Ra, Rb, 0, 233, 0)) +#define MULLDd(Rt, Ra, Rb) EMIT(XO_form_gen(31, Rt, Ra, Rb, 0, 233, 1)) +// MULHD — multiply high doubleword (signed) +#define MULHD(Rt, Ra, Rb) EMIT(XO_form_gen(31, Rt, Ra, Rb, 0, 73, 0)) +// MULHDU — multiply high doubleword (unsigned) +#define MULHDU(Rt, Ra, Rb) EMIT(XO_form_gen(31, Rt, Ra, Rb, 0, 9, 0)) + +// DIVW — divide word (signed) +#define DIVW(Rt, Ra, Rb) EMIT(XO_form_gen(31, Rt, Ra, Rb, 0, 491, 0)) +// DIVWU — divide word (unsigned) +#define DIVWU(Rt, Ra, Rb) EMIT(XO_form_gen(31, Rt, Ra, Rb, 0, 459, 0)) +// DIVD — divide doubleword (signed) +#define DIVD(Rt, Ra, Rb) EMIT(XO_form_gen(31, Rt, Ra, Rb, 0, 489, 0)) +// DIVDU — divide doubleword (unsigned) +#define DIVDU(Rt, Ra, Rb) EMIT(XO_form_gen(31, Rt, Ra, Rb, 0, 457, 0)) +// MODUW — modulo unsigned word (POWER9) +#define MODUW(Rt, Ra, Rb) EMIT(X_form_gen(31, Rt, Ra, Rb, 267, 0)) +// MODSW — modulo signed word (POWER9) +#define MODSW(Rt, Ra, Rb) EMIT(X_form_gen(31, Rt, Ra, Rb, 779, 0)) +// MODUD — modulo unsigned doubleword (POWER9) +#define MODUD(Rt, Ra, Rb) EMIT(X_form_gen(31, Rt, Ra, Rb, 265, 0)) +// MODSD — modulo signed doubleword (POWER9) +#define MODSD(Rt, Ra, Rb) EMIT(X_form_gen(31, Rt, Ra, Rb, 777, 0)) + +// MADDLD — multiply-add low doubleword (POWER9, VA-form opcode 4, xo=51) +// RT = (RA * RB) + RC (low 64 bits) +#define VA_form_gen(opcd, rt, ra, rb, rc, xo) \ + ((uint32_t)(opcd) << 26 | ((rt) & 0x1F) << 21 | ((ra) & 0x1F) << 16 | ((rb) & 0x1F) << 11 | ((rc) & 0x1F) << 6 | ((xo) & 0x3F)) +#define MADDLD(Rt, Ra, Rb, Rc) EMIT(VA_form_gen(4, Rt, Ra, Rb, Rc, 51)) + +// =========================================================================== +// Shift and Rotate instructions +// =========================================================================== + +// M-form (32-bit rotate): OPCD(6) | RS(5) | RA(5) | SH/RB(5) | MB(5) | ME(5) | Rc(1) +#define M_form_gen(opcd, rs, ra, sh, mb, me, rc) \ + ((uint32_t)(opcd) << 26 | ((rs) & 0x1F) << 21 | ((ra) & 0x1F) << 16 | ((sh) & 0x1F) << 11 | ((mb) & 0x1F) << 6 | ((me) & 0x1F) << 1 | ((rc) & 1)) + +// RLWINM — rotate left word immediate then AND with mask +#define RLWINM(Ra, Rs, sh, mb, me) EMIT(M_form_gen(21, Rs, 
Ra, sh, mb, me, 0)) +#define RLWINMd(Ra, Rs, sh, mb, me) EMIT(M_form_gen(21, Rs, Ra, sh, mb, me, 1)) +// RLWIMI — rotate left word immediate then mask insert +#define RLWIMI(Ra, Rs, sh, mb, me) EMIT(M_form_gen(20, Rs, Ra, sh, mb, me, 0)) +// RLWNM — rotate left word then AND with mask (shift amount in RB) +#define RLWNM(Ra, Rs, Rb, mb, me) EMIT(M_form_gen(23, Rs, Ra, Rb, mb, me, 0)) + +// Shift word (pseudo-ops using rlwinm) +// SLWI — shift left word immediate: rlwinm Ra, Rs, n, 0, 31-n +#define SLWI(Ra, Rs, n) RLWINM(Ra, Rs, n, 0, 31-(n)) +// SRWI — shift right word immediate: rlwinm Ra, Rs, 32-n, n, 31 +#define SRWI(Ra, Rs, n) RLWINM(Ra, Rs, 32-(n), n, 31) +// CLRLWI — clear left n bits: rlwinm Ra, Rs, 0, n, 31 +#define CLRLWI(Ra, Rs, n) RLWINM(Ra, Rs, 0, n, 31) +// CLRRWI — clear right n bits: rlwinm Ra, Rs, 0, 0, 31-n +#define CLRRWI(Ra, Rs, n) RLWINM(Ra, Rs, 0, 0, 31-(n)) +// ROTLWI — rotate left word immediate: rlwinm Ra, Rs, n, 0, 31 +#define ROTLWI(Ra, Rs, n) RLWINM(Ra, Rs, n, 0, 31) +// ROTRWI — rotate right word immediate: rlwinm Ra, Rs, 32-n, 0, 31 +#define ROTRWI(Ra, Rs, n) RLWINM(Ra, Rs, 32-(n), 0, 31) + +// MD-form (64-bit rotate): OPCD(6) | RS(5) | RA(5) | sh[0:4](5) | mb/me(6) | XO(3) | sh[5](1) | Rc(1) +// sh is 6 bits: sh[5] is bit 1, sh[0:4] is bits 11-15 +// MD form: the 6-bit mb/me field is stored with a scrambled bit order. +// The hardware interprets it as b = mb[5] || mb[0:4], where mb[0] is at +// Power bit 21 (C bit 10, the MSB of the 6-bit field in the instruction) +// and mb[5] is at Power bit 26 (C bit 5, the LSB). +// So to encode logical value 'mbe', we must place mbe[0:4] in the upper 5 bits +// and mbe[5] in the lower bit: raw = ((mbe & 0x1F) << 1) | ((mbe >> 5) & 1) +#define MD_form_gen(opcd, rs, ra, sh5, mbe, xo, rc) \ + ((uint32_t)(opcd) << 26 | ((rs) & 0x1F) << 21 | ((ra) & 0x1F) << 16 | (((sh5) & 0x1F)) << 11 | ((((mbe) & 0x1F) << 1 | (((mbe) >> 5) & 1))) << 5 | ((xo) & 0x7) << 2 | ((((sh5) >> 5) & 1)) << 1 | ((rc) & 1)) + +// RLDICL — rotate left doubleword immediate then clear left +#define RLDICL(Ra, Rs, sh, mb) EMIT(MD_form_gen(30, Rs, Ra, (sh) & 0x3F, (mb) & 0x3F, 0, 0)) +#define RLDICLd(Ra, Rs, sh, mb) EMIT(MD_form_gen(30, Rs, Ra, (sh) & 0x3F, (mb) & 0x3F, 0, 1)) +// RLDICR — rotate left doubleword immediate then clear right +#define RLDICR(Ra, Rs, sh, me) EMIT(MD_form_gen(30, Rs, Ra, (sh) & 0x3F, (me) & 0x3F, 1, 0)) +// RLDIC — rotate left doubleword immediate then clear +#define RLDIC(Ra, Rs, sh, mb) EMIT(MD_form_gen(30, Rs, Ra, (sh) & 0x3F, (mb) & 0x3F, 2, 0)) +// RLDIMI — rotate left doubleword immediate then mask insert +#define RLDIMI(Ra, Rs, sh, mb) EMIT(MD_form_gen(30, Rs, Ra, (sh) & 0x3F, (mb) & 0x3F, 3, 0)) + +// MDS-form (doubleword rotate, variable shift amount in RB) +// MDS form: OPCD[0:5] | RS[6:10] | RA[11:15] | RB[16:20] | mb[21:26] | XO[27:30] | Rc[31] +// MDS form: same mb/me scrambling as MD form (6-bit field, b = mb[5]||mb[0:4]) +#define MDS_form_gen(opcd, rs, ra, rb, mbe, xo, rc) \ + ((uint32_t)(opcd) << 26 | ((rs) & 0x1F) << 21 | ((ra) & 0x1F) << 16 | ((rb) & 0x1F) << 11 | ((((mbe) & 0x1F) << 1 | (((mbe) >> 5) & 1))) << 5 | ((xo) & 0xF) << 1 | ((rc) & 1)) + +// RLDCL — rotate left doubleword then clear left (variable, Rb has rotate amount) +#define RLDCL(Ra, Rs, Rb, mb) EMIT(MDS_form_gen(30, Rs, Ra, Rb, (mb) & 0x3F, 8, 0)) +// RLDCR — rotate left doubleword then clear right (variable, Rb has rotate amount) +#define RLDCR(Ra, Rs, Rb, me) EMIT(MDS_form_gen(30, Rs, Ra, Rb, (me) & 0x3F, 9, 0)) + +// Shift doubleword 
(pseudo-ops using rldicl/rldicr) +// SLDI — shift left doubleword immediate: rldicr Ra, Rs, n, 63-n +#define SLDI(Ra, Rs, n) RLDICR(Ra, Rs, n, 63-(n)) +// SRDI — shift right doubleword immediate: rldicl Ra, Rs, 64-n, n +#define SRDI(Ra, Rs, n) RLDICL(Ra, Rs, 64-(n), n) +// CLRLDI — clear left n bits of doubleword: rldicl Ra, Rs, 0, n +#define CLRLDI(Ra, Rs, n) RLDICL(Ra, Rs, 0, n) +// ROTLDI — rotate left doubleword immediate: rldicl Ra, Rs, n, 0 +#define ROTLDI(Ra, Rs, n) RLDICL(Ra, Rs, n, 0) + +// Shift register (X-form, opcode 31) +// SLW — shift left word +#define SLW(Ra, Rs, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 24, 0)) +// SRW — shift right word +#define SRW(Ra, Rs, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 536, 0)) +// SRAW — shift right algebraic word +#define SRAW(Ra, Rs, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 792, 0)) +#define SRAWd(Ra, Rs, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 792, 1)) +// SRAWI — shift right algebraic word immediate +#define SRAWI(Ra, Rs, sh) EMIT(X_form_gen(31, Rs, Ra, sh, 824, 0)) +#define SRAWId(Ra, Rs, sh) EMIT(X_form_gen(31, Rs, Ra, sh, 824, 1)) +// SLD — shift left doubleword +#define SLD(Ra, Rs, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 27, 0)) +// SRD — shift right doubleword +#define SRD(Ra, Rs, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 539, 0)) +// SRAD — shift right algebraic doubleword +#define SRAD(Ra, Rs, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 794, 0)) +#define SRADd(Ra, Rs, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 794, 1)) + +// SRADI — shift right algebraic doubleword immediate (XS-form) +// XS-form: like X-form but sh is 6 bits: sh[0:4] in bits 11-15, sh[5] in bit 1 +#define XS_form_gen(opcd, rs, ra, sh5, xo, rc) \ + ((uint32_t)(opcd) << 26 | ((rs) & 0x1F) << 21 | ((ra) & 0x1F) << 16 | (((sh5) & 0x1F)) << 11 | ((xo) & 0x1FF) << 2 | ((((sh5) >> 5) & 1)) << 1 | ((rc) & 1)) +#define SRADI(Ra, Rs, sh) EMIT(XS_form_gen(31, Rs, Ra, (sh) & 0x3F, 413, 0)) +#define SRADId(Ra, Rs, sh) EMIT(XS_form_gen(31, Rs, Ra, (sh) & 0x3F, 413, 1)) + +// EXTSWSLI — extend sign word and shift left immediate (POWER9, XS-form, xo=445) +#define EXTSWSLI(Ra, Rs, sh) EMIT(XS_form_gen(31, Rs, Ra, (sh) & 0x3F, 445, 0)) + +// =========================================================================== +// XFX-form (move to/from special registers) +// =========================================================================== +// XFX-form: OPCD(6) | RT(5) | spr(10) | XO(10) | 0 +// Note: spr field is swapped: spr[5:9] | spr[0:4] +#define XFX_form_gen(opcd, rt, spr, xo) \ + ((uint32_t)(opcd) << 26 | ((rt) & 0x1F) << 21 | ((spr) & 0x3FF) << 11 | ((xo) & 0x3FF) << 1) + +// SPR numbers (in the swapped encoding PPC uses: spr[5:9]||spr[0:4]) +#define SPR_LR (8 << 5 | 0) // LR = SPR 8, encoded as 0x100 +#define SPR_CTR (9 << 5 | 0) // CTR = SPR 9, encoded as 0x120 +#define SPR_XER (1 << 5 | 0) // XER = SPR 1, encoded as 0x020 +#define SPR_VRSAVE (0 << 5 | 8) // VRSAVE = SPR 256 + +// MFSPR — move from special purpose register +#define MFSPR(Rt, spr) EMIT(XFX_form_gen(31, Rt, spr, 339)) +// MTSPR — move to special purpose register +#define MTSPR(spr, Rs) EMIT(XFX_form_gen(31, Rs, spr, 467)) + +// MFLR — move from LR +#define MFLR(Rt) MFSPR(Rt, SPR_LR) +// MTLR — move to LR +#define MTLR(Rs) MTSPR(SPR_LR, Rs) +// MFCTR — move from CTR +#define MFCTR(Rt) MFSPR(Rt, SPR_CTR) +// MTCTR — move to CTR +#define MTCTR(Rs) MTSPR(SPR_CTR, Rs) +// MFXER — move from XER +#define MFXER(Rt) MFSPR(Rt, SPR_XER) +// MTXER — move to XER +#define MTXER(Rs) MTSPR(SPR_XER, Rs) +// MFTB — move from time base (SPR 268, reads TBR) +#define 
SPR_TB (12 << 5 | 8) // TBR = SPR 268, encoded as 0x188 +#define MFTB(Rt) MFSPR(Rt, SPR_TB) + +// MFCR — move from condition register (X-form, xo=19) +#define MFCR(Rt) EMIT(X_form_gen(31, Rt, 0, 0, 19, 0)) +// MTCRF — move to condition register fields (XFX-form variant) +// FXM is 8-bit field mask: bit 0 = CR0, bit 7 = CR7 +#define MTCRF_gen(fxm, rs) \ + ((uint32_t)(31) << 26 | ((rs) & 0x1F) << 21 | ((fxm) & 0xFF) << 12 | (144) << 1) +#define MTCRF(fxm, Rs) EMIT(MTCRF_gen(fxm, Rs)) +// MTCR — move to full CR (pseudo: mtcrf 0xFF, Rs) +#define MTCR(Rs) MTCRF(0xFF, Rs) + +// MFOCRF — move from one condition register field +#define MFOCRF(Rt, fxm) \ + EMIT((uint32_t)(31) << 26 | ((Rt) & 0x1F) << 21 | 1 << 20 | ((fxm) & 0xFF) << 12 | (19) << 1) +// MTOCRF — move to one condition register field +#define MTOCRF(fxm, Rs) \ + EMIT((uint32_t)(31) << 26 | ((Rs) & 0x1F) << 21 | 1 << 20 | ((fxm) & 0xFF) << 12 | (144) << 1) + +// =========================================================================== +// CR logical operations (XL-form, opcode 19) +// =========================================================================== +// CRAND — CR[BT] = CR[BA] & CR[BB] +#define CRAND(bt, ba, bb) EMIT(XL_gen(19, bt, ba, bb, 257, 0)) +// CRANDC — CR[BT] = CR[BA] & ~CR[BB] +#define CRANDC(bt, ba, bb) EMIT(XL_gen(19, bt, ba, bb, 129, 0)) +// CROR — CR[BT] = CR[BA] | CR[BB] +#define CROR(bt, ba, bb) EMIT(XL_gen(19, bt, ba, bb, 449, 0)) +// CRORC — CR[BT] = CR[BA] | ~CR[BB] +#define CRORC(bt, ba, bb) EMIT(XL_gen(19, bt, ba, bb, 417, 0)) +// CRXOR — CR[BT] = CR[BA] ^ CR[BB] +#define CRXOR(bt, ba, bb) EMIT(XL_gen(19, bt, ba, bb, 193, 0)) +// CRNAND — CR[BT] = ~(CR[BA] & CR[BB]) +#define CRNAND(bt, ba, bb) EMIT(XL_gen(19, bt, ba, bb, 225, 0)) +// CRNOR — CR[BT] = ~(CR[BA] | CR[BB]) +#define CRNOR(bt, ba, bb) EMIT(XL_gen(19, bt, ba, bb, 33, 0)) +// CREQV — CR[BT] = ~(CR[BA] ^ CR[BB]) +#define CREQV(bt, ba, bb) EMIT(XL_gen(19, bt, ba, bb, 289, 0)) +// CRSET — set CR bit (pseudo: creqv bt, bt, bt) +#define CRSET(bt) CREQV(bt, bt, bt) +// CRCLR — clear CR bit (pseudo: crxor bt, bt, bt) +#define CRCLR(bt) CRXOR(bt, bt, bt) +// CRMOVE — copy CR bit (pseudo: cror bt, ba, ba) +#define CRMOVE(bt, ba) CROR(bt, ba, ba) +// CRNOT — complement CR bit (pseudo: crnor bt, ba, ba) +#define CRNOT(bt, ba) CRNOR(bt, ba, ba) + +// =========================================================================== +// Synchronization / Memory barriers +// =========================================================================== +// SYNC — full memory barrier (X-form, opcode 31, xo=598) +#define SYNC() EMIT(X_form_gen(31, 0, 0, 0, 598, 0)) +// LWSYNC — lightweight sync (load-load, load-store and store-store ordering; like ARM64 dmb ish) +#define LWSYNC() EMIT(X_form_gen(31, 1, 0, 0, 598, 0)) +// ISYNC — instruction synchronization +#define ISYNC() EMIT(XL_gen(19, 0, 0, 0, 150, 0)) +// EIEIO — enforce in-order execution of I/O +#define EIEIO() EMIT(X_form_gen(31, 0, 0, 0, 854, 0)) + +// DMB compatibility macros (match ARM64/LA64/RV64 naming used by strong memory model code) +// PPC64LE mapping: LWSYNC covers load-load, load-store, store-store ordering +// SYNC (heavyweight) covers all ordering including store-load +#define DMB_ISH() LWSYNC() +#define DMB_ISHLD() LWSYNC() +#define DMB_ISHST() LWSYNC() + +// Cache management +// DCBST — data cache block store +#define DCBST(Ra, Rb) EMIT(X_form_gen(31, 0, Ra, Rb, 54, 0)) +// DCBF — data cache block flush +#define DCBF(Ra, Rb) EMIT(X_form_gen(31, 0, Ra, Rb, 86, 0)) +// ICBI — instruction cache block invalidate +#define ICBI(Ra, Rb)
EMIT(X_form_gen(31, 0, Ra, Rb, 982, 0)) + +// =========================================================================== +// Load/store with reservation (atomics, LL/SC) +// =========================================================================== +// LBARX — load byte and reserve indexed (POWER8+) +#define LBARX(Rt, Ra, Rb) EMIT(X_form_gen(31, Rt, Ra, Rb, 52, 0)) +// LHARX — load halfword and reserve indexed (POWER8+) +#define LHARX(Rt, Ra, Rb) EMIT(X_form_gen(31, Rt, Ra, Rb, 116, 0)) +// LWARX — load word and reserve indexed +#define LWARX(Rt, Ra, Rb) EMIT(X_form_gen(31, Rt, Ra, Rb, 20, 0)) +// LDARX — load doubleword and reserve indexed +#define LDARX(Rt, Ra, Rb) EMIT(X_form_gen(31, Rt, Ra, Rb, 84, 0)) +// LQARX — load quadword and reserve indexed +#define LQARX(Rt, Ra, Rb) EMIT(X_form_gen(31, Rt, Ra, Rb, 276, 0)) + +// STBCX. — store byte conditional indexed (always Rc=1) +#define STBCXd(Rs, Ra, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 694, 1)) +// STHCX. — store halfword conditional indexed (always Rc=1) +#define STHCXd(Rs, Ra, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 726, 1)) +// STWCX. — store word conditional indexed (always Rc=1) +#define STWCXd(Rs, Ra, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 150, 1)) +// STDCX. — store doubleword conditional indexed (always Rc=1) +#define STDCXd(Rs, Ra, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 214, 1)) +// STQCX. — store quadword conditional indexed (always Rc=1) +#define STQCXd(Rs, Ra, Rb) EMIT(X_form_gen(31, Rs, Ra, Rb, 182, 1)) + +// =========================================================================== +// Floating-point load/store (D-form) +// =========================================================================== +// LFS — load floating-point single +#define LFS(Frt, offset, Ra) EMIT(D_form_gen(48, Frt, Ra, (offset) & 0xFFFF)) +// LFD — load floating-point double +#define LFD(Frt, offset, Ra) EMIT(D_form_gen(50, Frt, Ra, (offset) & 0xFFFF)) +// STFS — store floating-point single +#define STFS(Frs, offset, Ra) EMIT(D_form_gen(52, Frs, Ra, (offset) & 0xFFFF)) +// STFD — store floating-point double +#define STFD(Frs, offset, Ra) EMIT(D_form_gen(54, Frs, Ra, (offset) & 0xFFFF)) + +// Indexed floating-point load/store (X-form) +// LFSX — load floating-point single indexed +#define LFSX(Frt, Ra, Rb) EMIT(X_form_gen(31, Frt, Ra, Rb, 535, 0)) +// LFDX — load floating-point double indexed +#define LFDX(Frt, Ra, Rb) EMIT(X_form_gen(31, Frt, Ra, Rb, 599, 0)) +// STFSX — store floating-point single indexed +#define STFSX(Frs, Ra, Rb) EMIT(X_form_gen(31, Frs, Ra, Rb, 663, 0)) +// STFDX — store floating-point double indexed +#define STFDX(Frs, Ra, Rb) EMIT(X_form_gen(31, Frs, Ra, Rb, 727, 0)) + +// =========================================================================== +// VMX (Altivec/VSX) load/store — DQ-form (POWER9 lxv/stxv) +// =========================================================================== +// +// *** PPC64LE-SPECIFIC SILENT TRUNCATION HAZARD *** +// +// DQ-form instructions (LXV, STXV) encode the displacement as a 12-bit field +// that stores offset>>4 — only multiples of 16 are representable. The hardware +// SILENTLY TRUNCATES the low 4 bits of any non-aligned displacement: +// +// LXV vs0, 0x24(r1) → encodes as 0x20 → loads from r1+32, NOT r1+36 +// +// There is NO trap, NO exception, NO architectural indication of the truncation. 
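+// Worked example against the DQ_form_gen encoder below: for offset 0x24,
+// ((0x24 >> 4) & 0xFFF) << 4 == 0x20, so the +4 is dropped at encode time and
+// the emitted instruction quietly accesses base+0x20.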
+// This is a PPC64LE-specific quirk not present in other box64 backend ISAs: +// - ARM64: LDR Qn uses scaled 12-bit unsigned offsets (assembler rejects misalignment) +// - LA64: VLD/VST use byte-addressable 12-bit signed immediates (any offset valid) +// - RV64: No similar constraint +// +// MITIGATION: +// All callers that pass a guest-derived displacement to LXV/STXV must ensure +// 16-byte alignment. In the dynarec, this is enforced by passing DQ_ALIGN|1 +// (defined in dynarec_ppc64le_helper.h) to geted(), which sets align_mask=15 +// and forces non-16-byte-aligned displacements to be materialized in a register +// instead of used as an inline immediate. +// +// Safe callers that do NOT need DQ_ALIGN: +// - offsetof(x64emu_t, xmm[n]) — struct offsets, always 16-byte aligned +// - Literal 0 displacement — trivially aligned +// +// If you add a new LXV/STXV callsite with a displacement derived from geted(), +// you MUST pass DQ_ALIGN|1 to geted(). Failure to do so will cause silent data +// corruption that is extremely difficult to diagnose (wrong values, not crashes). +// +// DQ-form: OPCD(6) | TX||T(5) | RA(5) | DQ(12) | XO(4) +// TX is the high bit of the 6-bit target register, T is the low 5 bits +#define DQ_form_gen(opcd, rt6, ra, dq, xo) \ + ((uint32_t)(opcd) << 26 | (((rt6) & 0x1F)) << 21 | ((ra) & 0x1F) << 16 | (((dq) >> 4) & 0xFFF) << 4 | ((xo) & 0xF) | ((((rt6) >> 5) & 1) << 3 & 0x8)) + +// LXV — load VSX vector (16 bytes, DQ-form, POWER9) +// Note: target is vs0-vs63, DQ must be multiple of 16 (see truncation hazard above) +// Using opcode 61, xo = 1 for lxv +#define LXV(Vrt, offset, Ra) \ + do { \ + if (((offset) & 0xF) != 0) \ + dynarec_log(LOG_NONE, "BUG: LXV displacement 0x%lx not 16-byte aligned at %s:%d\n", \ + (unsigned long)(offset), __FILE__, __LINE__); \ + EMIT(DQ_form_gen(61, Vrt, Ra, offset, 1)); \ + } while(0) +// STXV — store VSX vector (16 bytes, DQ-form, POWER9) +// Note: source is vs0-vs63, DQ must be multiple of 16 (see truncation hazard above) +// Using opcode 61, xo = 5 for stxv +#define STXV(Vrs, offset, Ra) \ + do { \ + if (((offset) & 0xF) != 0) \ + dynarec_log(LOG_NONE, "BUG: STXV displacement 0x%lx not 16-byte aligned at %s:%d\n", \ + (unsigned long)(offset), __FILE__, __LINE__); \ + EMIT(DQ_form_gen(61, Vrs, Ra, offset, 5)); \ + } while(0) + +// Indexed VMX load/store (XX1-form, opcode 31 — TX bit in bit 0) +// LXVX — load VSX vector indexed (POWER9, supports vs0-vs63) +#define LXVX(Vrt, Ra, Rb) EMIT(XX1_form_gen(31, Vrt, Ra, Rb, 268)) +// STXVX — store VSX vector indexed (POWER9, supports vs0-vs63) +#define STXVX(Vrs, Ra, Rb) EMIT(XX1_form_gen(31, Vrs, Ra, Rb, 396)) + +// LVX — load vector indexed (VMX, old-style 128-bit load) +#define LVX(Vrt, Ra, Rb) EMIT(X_form_gen(31, Vrt, Ra, Rb, 103, 0)) +// STVX — store vector indexed (VMX, old-style 128-bit store) +#define STVX(Vrs, Ra, Rb) EMIT(X_form_gen(31, Vrs, Ra, Rb, 231, 0)) + +// =========================================================================== +// VMX (Altivec) integer SIMD instructions — VX-form (opcode 4) +// =========================================================================== +// VX-form: OPCD(6) | VRT(5) | VRA(5) | VRB(5) | XO(11) +// NOTE: These operate on VMX registers vr0-vr31. In our backend, the vmxcache +// maps XMM/x87/MMX slots to VSX vs0-vs31 (which overlap with FPR f0-f31). +// VMX vr0-vr31 are VSX vs32-vs63. Since our SSE cache uses vs0-vs31 (FPR space), +// we generally use VSX instructions (XXLOR, etc.) for 128-bit ops. 
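+// When a VMX-only integer op (see the note just below) must touch data cached
+// in vs0-vs31, the pattern is a round-trip through vr scratch regs (a sketch,
+// assuming vr2/vr3 are free; vrN aliases vs(N+32), xxlor used as a plain move):
+//   xxlor vs34,vs2,vs2   # vr2 <- cached value held in vs2
+//   xxlor vs35,vs3,vs3   # vr3 <- cached value held in vs3
+//   vaddubm v2,v2,v3     # VMX-only byte add (VADDUBM below)
+//   xxlor vs2,vs34,vs34  # move the result back into the vs0-vs31 cache slot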
+// However, VMX integer SIMD instructions (VADDUBM, VCMPEQUB, etc.) only work +// on VMX regs (vr0-vr31 = vs32-vs63). To use them with our vs0-vs31 cache, +// we need XXLOR to move data to/from vr scratch regs, or use the VSX equivalents +// where available. +// +// For POWER9+, many packed integer ops are available via VSX instructions +// that can operate on the full vs0-vs63 range. We prefer those. +// +// VX-form generator (all use primary opcode 4): +#define VX_form_gen(vrt, vra, vrb, xo) \ + ((uint32_t)(4) << 26 | ((vrt) & 0x1F) << 21 | ((vra) & 0x1F) << 16 | ((vrb) & 0x1F) << 11 | ((xo) & 0x7FF)) + +// --- Integer Add --- +// VADDUBM — vector add unsigned byte modulo: VRT = VRA + VRB (byte) +#define VADDUBM(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 0)) +// VADDUHM — vector add unsigned halfword modulo: VRT = VRA + VRB (halfword) +#define VADDUHM(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 64)) +// VADDUWM — vector add unsigned word modulo: VRT = VRA + VRB (word) +#define VADDUWM(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 128)) +// VADDUDM — vector add unsigned doubleword modulo: VRT = VRA + VRB (dword) +#define VADDUDM(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 192)) + +// --- Integer Add Saturate --- +// VADDSBS — vector add signed byte saturate +#define VADDSBS(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 768)) +// VADDSHS — vector add signed halfword saturate +#define VADDSHS(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 832)) +// VADDSWS — vector add signed word saturate +#define VADDSWS(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 896)) +// VADDUBS — vector add unsigned byte saturate +#define VADDUBS(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 512)) +// VADDUHS — vector add unsigned halfword saturate +#define VADDUHS(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 576)) + +// --- Integer Subtract --- +// VSUBUBM — vector subtract unsigned byte modulo +#define VSUBUBM(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1024)) +// VSUBUHM — vector subtract unsigned halfword modulo +#define VSUBUHM(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1088)) +// VSUBUWM — vector subtract unsigned word modulo +#define VSUBUWM(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1152)) +// VSUBUDM — vector subtract unsigned doubleword modulo +#define VSUBUDM(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1216)) + +// --- Integer Subtract Saturate --- +// VSUBSBS — vector subtract signed byte saturate +#define VSUBSBS(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1792)) +// VSUBSHS — vector subtract signed halfword saturate +#define VSUBSHS(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1856)) +// VSUBUBS — vector subtract unsigned byte saturate +#define VSUBUBS(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1536)) +// VSUBUHS — vector subtract unsigned halfword saturate +#define VSUBUHS(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1600)) + +// --- Integer Compare Equal --- +// VCMPEQUB — vector compare equal unsigned byte (result is all-1s or all-0s per element) +#define VCMPEQUB(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 6)) +// VCMPEQUH — vector compare equal unsigned halfword +#define VCMPEQUH(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 70)) +// VCMPEQUW — vector compare equal unsigned word +#define VCMPEQUW(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 134)) +// VCMPEQUD — vector compare equal unsigned doubleword (POWER8) +#define VCMPEQUD(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 199)) + +// --- Integer Compare Greater Than (signed) --- +// VCMPGTSB — 
vector compare greater than signed byte +#define VCMPGTSB(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 774)) +// VCMPGTSH — vector compare greater than signed halfword +#define VCMPGTSH(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 838)) +// VCMPGTSW — vector compare greater than signed word +#define VCMPGTSW(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 902)) +// VCMPGTSD — vector compare greater than signed doubleword (POWER8) +#define VCMPGTSD(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 967)) + +// --- Integer Compare Greater Than (unsigned) --- +// VCMPGTUB — vector compare greater than unsigned byte +#define VCMPGTUB(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 518)) +// VCMPGTUH — vector compare greater than unsigned halfword +#define VCMPGTUH(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 582)) +// VCMPGTUW — vector compare greater than unsigned word +#define VCMPGTUW(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 646)) +// VCMPGTUD — vector compare greater than unsigned doubleword +#define VCMPGTUD(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 711)) + +// --- Integer Min/Max --- +// VMINUB — vector minimum unsigned byte +#define VMINUB(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 514)) +// VMINUH — vector minimum unsigned halfword +#define VMINUH(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 578)) +// VMINSB — vector minimum signed byte +#define VMINSB(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 770)) +// VMINSH — vector minimum signed halfword +#define VMINSH(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 834)) +// VMINSW — vector minimum signed word +#define VMINSW(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 898)) +// VMINUW — vector minimum unsigned word +#define VMINUW(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 642)) +// VMINSD — vector minimum signed doubleword (POWER8) +#define VMINSD(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 962)) +// VMINUD — vector minimum unsigned doubleword (POWER8) +#define VMINUD(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 706)) +// VMAXUB — vector maximum unsigned byte +#define VMAXUB(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 2)) +// VMAXUH — vector maximum unsigned halfword +#define VMAXUH(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 66)) +// VMAXSB — vector maximum signed byte +#define VMAXSB(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 258)) +// VMAXSH — vector maximum signed halfword +#define VMAXSH(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 322)) +// VMAXSW — vector maximum signed word +#define VMAXSW(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 386)) +// VMAXUW — vector maximum unsigned word +#define VMAXUW(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 130)) +// VMAXSD — vector maximum signed doubleword (POWER8) +#define VMAXSD(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 450)) +// VMAXUD — vector maximum unsigned doubleword (POWER8) +#define VMAXUD(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 194)) + +// --- Integer Multiply --- +// VMULESB — vector multiply even signed byte (produces halfwords) +#define VMULESB(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 776)) +// VMULEUB — vector multiply even unsigned byte (produces halfwords) +#define VMULEUB(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 520)) +// VMULOSB — vector multiply odd signed byte (produces halfwords) +#define VMULOSB(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 264)) +// VMULOUB — vector multiply odd unsigned byte (produces halfwords) +#define VMULOUB(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 8)) +// VMULESH — vector 
multiply even signed halfword (produces words) +#define VMULESH(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 840)) +// VMULEUH — vector multiply even unsigned halfword (produces words) +#define VMULEUH(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 584)) +// VMULOSH — vector multiply odd signed halfword (produces words) +#define VMULOSH(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 328)) +// VMULOUH — vector multiply odd unsigned halfword (produces words) +#define VMULOUH(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 72)) +// VMULESW — vector multiply even signed word (produces doublewords, POWER8) +#define VMULESW(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 904)) +// VMULEUW — vector multiply even unsigned word (produces doublewords, POWER8) +#define VMULEUW(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 648)) +// VMULOSW — vector multiply odd signed word (produces doublewords, POWER8) +#define VMULOSW(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 392)) +// VMULOUW — vector multiply odd unsigned word (produces doublewords, POWER8) +#define VMULOUW(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 136)) +// VMULUWM — vector multiply unsigned word modulo (POWER8) +#define VMULUWM(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 137)) + +// --- Pack --- +// VPKUHUM — vector pack unsigned halfword unsigned modulo (halfwords->bytes) +#define VPKUHUM(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 14)) +// VPKUWUM — vector pack unsigned word unsigned modulo (words->halfwords) +#define VPKUWUM(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 78)) +// VPKSHSS — vector pack signed halfword signed saturate (halfwords->bytes) +#define VPKSHSS(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 398)) +// VPKSWSS — vector pack signed word signed saturate (words->halfwords) +#define VPKSWSS(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 462)) +// VPKSHUS — vector pack signed halfword unsigned saturate +#define VPKSHUS(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 270)) +// VPKSWUS — vector pack signed word unsigned saturate +#define VPKSWUS(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 334)) +// VPKSDSS — vector pack signed doubleword signed saturate (POWER8) +#define VPKSDSS(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1486)) +// VPKSDUS — vector pack signed doubleword unsigned saturate (POWER8) +#define VPKSDUS(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1358)) +// VPKUDUS — vector pack unsigned doubleword unsigned saturate (POWER8) +#define VPKUDUS(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1230)) +// VPKUDUM — vector pack unsigned doubleword unsigned modulo (POWER8) +#define VPKUDUM(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1102)) + +// --- Unpack / Extend --- +// VUPKHSB — vector unpack high signed byte (bytes->halfwords, upper 8 bytes) +#define VUPKHSB(Vrt, Vrb) EMIT(VX_form_gen(Vrt, 0, Vrb, 526)) +// VUPKHSH — vector unpack high signed halfword (halfwords->words) +#define VUPKHSH(Vrt, Vrb) EMIT(VX_form_gen(Vrt, 0, Vrb, 590)) +// VUPKHSW — vector unpack high signed word (words->doublewords, POWER8) +#define VUPKHSW(Vrt, Vrb) EMIT(VX_form_gen(Vrt, 0, Vrb, 1614)) +// VUPKLSB — vector unpack low signed byte (bytes->halfwords, lower 8 bytes) +#define VUPKLSB(Vrt, Vrb) EMIT(VX_form_gen(Vrt, 0, Vrb, 654)) +// VUPKLSH — vector unpack low signed halfword (halfwords->words) +#define VUPKLSH(Vrt, Vrb) EMIT(VX_form_gen(Vrt, 0, Vrb, 718)) +// VUPKLSW — vector unpack low signed word (words->doublewords, POWER8) +#define VUPKLSW(Vrt, Vrb) EMIT(VX_form_gen(Vrt, 0, Vrb, 1742)) +// VEXTSB2W — vector 
extend sign byte to word (POWER9)
+#define VEXTSB2W(Vrt, Vrb) EMIT(VX_form_gen(Vrt, 16, Vrb, 1538))
+// VEXTSB2D — vector extend sign byte to doubleword (POWER9)
+#define VEXTSB2D(Vrt, Vrb) EMIT(VX_form_gen(Vrt, 24, Vrb, 1538))
+// VEXTSH2W — vector extend sign halfword to word (POWER9)
+#define VEXTSH2W(Vrt, Vrb) EMIT(VX_form_gen(Vrt, 17, Vrb, 1538))
+// VEXTSH2D — vector extend sign halfword to doubleword (POWER9)
+#define VEXTSH2D(Vrt, Vrb) EMIT(VX_form_gen(Vrt, 25, Vrb, 1538))
+// VEXTSW2D — vector extend sign word to doubleword (POWER9)
+#define VEXTSW2D(Vrt, Vrb) EMIT(VX_form_gen(Vrt, 26, Vrb, 1538))
+
+// --- Merge (Interleave) ---
+// VMRGHB — vector merge high byte (interleave upper 8 bytes)
+#define VMRGHB(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 12))
+// VMRGHH — vector merge high halfword (interleave upper 4 halfwords)
+#define VMRGHH(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 76))
+// VMRGHW — vector merge high word (interleave upper 2 words)
+#define VMRGHW(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 140))
+// VMRGLB — vector merge low byte (interleave lower 8 bytes)
+#define VMRGLB(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 268))
+// VMRGLH — vector merge low halfword (interleave lower 4 halfwords)
+#define VMRGLH(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 332))
+// VMRGLW — vector merge low word (interleave lower 2 words)
+#define VMRGLW(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 396))
+
+// --- Shift ---
+// VSLB — vector shift left byte
+#define VSLB(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 260))
+// VSLH — vector shift left halfword
+#define VSLH(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 324))
+// VSLW — vector shift left word
+#define VSLW(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 388))
+// VSLD — vector shift left doubleword (POWER8)
+#define VSLD(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1476))
+// VSRB — vector shift right byte
+#define VSRB(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 516))
+// VSRH — vector shift right halfword
+#define VSRH(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 580))
+// VSRW — vector shift right word
+#define VSRW(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 644))
+// VSRD — vector shift right doubleword (POWER8)
+#define VSRD(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1732))
+// VSRAB — vector shift right algebraic byte
+#define VSRAB(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 772))
+// VSRAH — vector shift right algebraic halfword
+#define VSRAH(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 836))
+// VSRAW — vector shift right algebraic word
+#define VSRAW(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 900))
+// VSRAD — vector shift right algebraic doubleword (POWER8)
+#define VSRAD(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 964))
+
+// VSL — vector shift left (128-bit, bit count in VRB bits 125:127; all bytes of VRB must hold the same count)
+#define VSL(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 452))
+// VSR — vector shift right (128-bit, bit count in VRB bits 125:127; all bytes of VRB must hold the same count)
+#define VSR(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 708))
+// VSLO — vector shift left by octet (byte count in VRB bits 121:124)
+#define VSLO(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1036))
+// VSRO — vector shift right by octet (byte count in VRB bits 121:124)
+#define VSRO(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1100))
+
+// --- Splat ---
+// VSPLTB — vector splat byte: replicate byte VRB[UIMM] to all 16 bytes
+#define VSPLTB(Vrt, Vrb, uimm) EMIT(VX_form_gen(Vrt, (uimm) & 0xF, Vrb, 524))
+// VSPLTH — vector splat halfword: replicate halfword VRB[UIMM] to all 8 halfwords
+#define VSPLTH(Vrt, Vrb, uimm) EMIT(VX_form_gen(Vrt, (uimm) & 0x7, Vrb, 588))
+// VSPLTW — vector splat word: replicate word VRB[UIMM] to all 4 words
+#define VSPLTW(Vrt, Vrb, uimm) EMIT(VX_form_gen(Vrt, (uimm) & 0x3, Vrb, 652))
+// VSPLTISB — vector splat immediate signed byte: all 16 bytes = sign-extend(SIMM)
+#define VSPLTISB(Vrt, simm) EMIT(VX_form_gen(Vrt, (simm) & 0x1F, 0, 780))
+// VSPLTISH — vector splat immediate signed halfword
+#define VSPLTISH(Vrt, simm) EMIT(VX_form_gen(Vrt, (simm) & 0x1F, 0, 844))
+// VSPLTISW — vector splat immediate signed word
+#define VSPLTISW(Vrt, simm) EMIT(VX_form_gen(Vrt, (simm) & 0x1F, 0, 908))
+
+// --- Permute (VA-form, opcode 4) ---
+// VPERM — vector permute: VRT[i] = (VRA||VRB)[VRC[i] & 0x1F]
+#define VPERM(Vrt, Vra, Vrb, Vrc) EMIT(VA_form_gen(4, Vrt, Vra, Vrb, Vrc, 43))
+
+// VSLDOI — vector shift left double by octet immediate (VA-form)
+// VRT = (VRA || VRB) << (SHB*8), take upper 128 bits
+#define VSLDOI(Vrt, Vra, Vrb, shb) EMIT(VA_form_gen(4, Vrt, Vra, Vrb, (shb) & 0xF, 44))
+
+// VBPERMQ — vector bit permute quadword (16-bit result in VRT dw0 bits 48:63, all other bits cleared)
+#define VBPERMQ(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1356))
+
+// --- Logical (VMX — use XXLAND/XXLOR/XXLXOR for VSX range instead) ---
+// VAND — vector AND
+#define VAND(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1028))
+// VOR — vector OR
+#define VOR(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1156))
+// VXOR — vector XOR
+#define VXOR(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1220))
+// VNOR — vector NOR
+#define VNOR(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1284))
+// VANDC — vector AND with complement: VRT = VRA & ~VRB
+#define VANDC(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1092))
+// VORC — vector OR with complement (POWER8): VRT = VRA | ~VRB
+#define VORC(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1348))
+// VNAND — vector NAND (POWER8)
+#define VNAND(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1412))
+// VEQV — vector equivalence / XNOR (POWER8)
+#define VEQV(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1668))
+
+// --- Absolute Difference ---
+// VABSDUB — vector absolute difference unsigned byte (POWER9)
+#define VABSDUB(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1027))
+// VABSDUH — vector absolute difference unsigned halfword (POWER9)
+#define VABSDUH(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1091))
+// VABSDUW — vector absolute difference unsigned word (POWER9)
+#define VABSDUW(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1155))
+
+// --- Average ---
+// VAVGUB — vector average unsigned byte
+#define VAVGUB(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1026))
+// VAVGUH — vector average unsigned halfword
+#define VAVGUH(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1090))
+// VAVGSB — vector average signed byte
+#define VAVGSB(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1282))
+// VAVGSH — vector average signed halfword
+#define VAVGSH(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1346))
+
+// --- Sum Across ---
+// VSUM4UBS — vector sum across partial (unsigned byte)
+#define VSUM4UBS(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1672))
+// VSUM4SBS — vector sum across partial (signed byte)
+#define VSUM4SBS(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1800))
+// VSUM4SHS — vector sum across partial (signed halfword)
+#define VSUM4SHS(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 1608))
+
+// --- Multiply-Add/Sum (VA-form) ---
+// VMSUMUBM — vector multiply-sum unsigned byte modulo
+#define VMSUMUBM(Vrt, Vra, Vrb, Vrc) EMIT(VA_form_gen(4, Vrt, Vra, Vrb, Vrc, 36))
+// VMSUMUHM — vector multiply-sum unsigned halfword modulo
+#define VMSUMUHM(Vrt, Vra, Vrb, Vrc) EMIT(VA_form_gen(4, Vrt, Vra, Vrb, Vrc, 38))
+// VMSUMSHS — vector multiply-sum signed halfword saturate
+#define VMSUMSHS(Vrt, Vra, Vrb, Vrc) EMIT(VA_form_gen(4, Vrt, Vra, Vrb, Vrc, 41))
+// VMSUMSHM — vector multiply-sum signed halfword modulo
+#define VMSUMSHM(Vrt, Vra, Vrb, Vrc) EMIT(VA_form_gen(4, Vrt, Vra, Vrb, Vrc, 40))
+// VMSUMMBM — vector multiply-sum mixed-sign byte modulo
+#define VMSUMMBM(Vrt, Vra, Vrb, Vrc) EMIT(VA_form_gen(4, Vrt, Vra, Vrb, Vrc, 37))
+// VMHADDSHS — vector multiply-high-add signed halfword saturate
+#define VMHADDSHS(Vrt, Vra, Vrb, Vrc) EMIT(VA_form_gen(4, Vrt, Vra, Vrb, Vrc, 32))
+// VMHRADDSHS — vector multiply-high-round-add signed halfword saturate
+#define VMHRADDSHS(Vrt, Vra, Vrb, Vrc) EMIT(VA_form_gen(4, Vrt, Vra, Vrb, Vrc, 33))
+// VMLADDUHM — vector multiply-low-add unsigned halfword modulo
+#define VMLADDUHM(Vrt, Vra, Vrb, Vrc) EMIT(VA_form_gen(4, Vrt, Vra, Vrb, Vrc, 34))
+
+// --- Bit Select (VA-form) ---
+// VSEL — vector select: VRT = (VRA & ~VRC) | (VRB & VRC)
+#define VSEL(Vrt, Vra, Vrb, Vrc) EMIT(VA_form_gen(4, Vrt, Vra, Vrb, Vrc, 42))
+
+// --- Count Leading Zeros ---
+// VCLZB — vector count leading zeros byte (POWER8)
+#define VCLZB(Vrt, Vrb) EMIT(VX_form_gen(Vrt, 0, Vrb, 1794))
+// VCLZH — vector count leading zeros halfword (POWER8)
+#define VCLZH(Vrt, Vrb) EMIT(VX_form_gen(Vrt, 0, Vrb, 1858))
+// VCLZW — vector count leading zeros word (POWER8)
+#define VCLZW(Vrt, Vrb) EMIT(VX_form_gen(Vrt, 0, Vrb, 1922))
+// VCLZD — vector count leading zeros doubleword (POWER8)
+#define VCLZD(Vrt, Vrb) EMIT(VX_form_gen(Vrt, 0, Vrb, 1986))
+
+// --- Population Count ---
+// VPOPCNTB — vector population count byte (POWER8)
+#define VPOPCNTB(Vrt, Vrb) EMIT(VX_form_gen(Vrt, 0, Vrb, 1795))
+// VPOPCNTH — vector population count halfword (POWER8)
+#define VPOPCNTH(Vrt, Vrb) EMIT(VX_form_gen(Vrt, 0, Vrb, 1859))
+// VPOPCNTW — vector population count word (POWER8)
+#define VPOPCNTW(Vrt, Vrb) EMIT(VX_form_gen(Vrt, 0, Vrb, 1923))
+// VPOPCNTD — vector population count doubleword (POWER8)
+#define VPOPCNTD(Vrt, Vrb) EMIT(VX_form_gen(Vrt, 0, Vrb, 1987))
+
+// --- Rotate ---
+// VRLB — vector rotate left byte
+#define VRLB(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 4))
+// VRLH — vector rotate left halfword
+#define VRLH(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 68))
+// VRLW — vector rotate left word
+#define VRLW(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 132))
+// VRLD — vector rotate left doubleword (POWER8)
+#define VRLD(Vrt, Vra, Vrb) EMIT(VX_form_gen(Vrt, Vra, Vrb, 196))
+
+// --- Negate (POWER9) ---
+// VNEGW — vector negate word (POWER9)
+#define VNEGW(Vrt, Vrb) EMIT(VX_form_gen(Vrt, 6, Vrb, 1538))
+// VNEGD — vector negate doubleword (POWER9)
+#define VNEGD(Vrt, Vrb) EMIT(VX_form_gen(Vrt, 7, Vrb, 1538))
+
+// --- Byte Reverse ---
+// All four xxbr* variants share XX2 XO=475; the element size is selected by a
+// 5-bit constant in the (otherwise unused) second register field, bits 11:15
+// (7=halfword, 15=word, 23=doubleword, 31=quadword). XX2_form_gen leaves that
+// field at 0, so the constant is OR'd in here (same idiom as XXSPLTW below).
+// XXBRW — VSX vector byte-reverse word (POWER9, XX2-form)
+#define XXBRW(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 475) | (15 << 16))
+// XXBRD — VSX vector byte-reverse doubleword (POWER9, XX2-form)
+#define XXBRD(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 475) | (23 << 16))
+// XXBRQ — VSX vector byte-reverse quadword (POWER9, XX2-form)
+#define XXBRQ(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 475) | (31 << 16))
+// XXBRH — VSX vector byte-reverse halfword (POWER9, XX2-form)
+#define XXBRH(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 475) | (7 << 16))
+
+// --- VSX Vector FP Arithmetic
(XX3-form, opcode 60) --- +// XVADDDP — vector add double-precision +#define XVADDDP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 96)) +// XVSUBDP — vector subtract double-precision +#define XVSUBDP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 104)) +// XVMULDP — vector multiply double-precision +#define XVMULDP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 112)) +// XVDIVDP — vector divide double-precision +#define XVDIVDP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 120)) +// XVSQRTDP — vector square root double-precision (XX2-form) +#define XVSQRTDP(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 203)) +// XVMINDP — vector minimum double-precision +#define XVMINDP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 232)) +// XVMAXDP — vector maximum double-precision +#define XVMAXDP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 224)) + +// XVADDSP — vector add single-precision +#define XVADDSP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 64)) +// XVSUBSP — vector subtract single-precision +#define XVSUBSP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 72)) +// XVMULSP — vector multiply single-precision +#define XVMULSP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 80)) +// XVDIVSP — vector divide single-precision +#define XVDIVSP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 88)) +// XVSQRTSP — vector square root single-precision (XX2-form) +#define XVSQRTSP(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 139)) +// XVRESP — vector reciprocal estimate single-precision (XX2-form) +#define XVRESP(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 154)) +// XVRSQRTESP — vector reciprocal square root estimate single-precision (XX2-form) +#define XVRSQRTESP(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 138)) +// XVMAXSP — vector maximum single-precision (XX3-form) +#define XVMAXSP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 192)) +// XVMINSP — vector minimum single-precision (XX3-form) +#define XVMINSP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 200)) +// XVCVSPUXWS — VSX vector convert 4 singles to 4 unsigned int32 +#define XVCVSPUXWS(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 136)) +// XVABSSP — VSX vector absolute single +#define XVABSSP(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 409)) +// XVNEGSP — VSX vector negate single +#define XVNEGSP(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 441)) + +// --- VSX Vector Fused Multiply-Add (XX3-form, opcode 60) --- +// A-form: XT = XA * XT + XB (multiply target by operand, add other) +// M-form: XT = XA * XB + XT (multiply two operands, add to target) +// Single-precision (SP): +#define XVMADDASP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 65)) // XT = XA*XT + XB +#define XVMADDMSP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 73)) // XT = XA*XB + XT +#define XVMSUBASP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 81)) // XT = XA*XT - XB +#define XVMSUBMSP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 89)) // XT = XA*XB - XT +#define XVNMADDASP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 193)) // XT = -(XA*XT + XB) +#define XVNMADDMSP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 201)) // XT = -(XA*XB + XT) +#define XVNMSUBASP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 209)) // XT = -(XA*XT - XB) +#define XVNMSUBMSP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 217)) // XT = -(XA*XB - XT) +// Double-precision (DP): +#define XVMADDADP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 97)) // XT = XA*XT + XB +#define XVMADDMDP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 105)) // XT = XA*XB + XT +#define XVMSUBADP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 113)) // XT = XA*XT - XB +#define 
XVMSUBMDP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 121)) // XT = XA*XB - XT +#define XVNMADDADP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 225)) // XT = -(XA*XT + XB) +#define XVNMADDMDP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 233)) // XT = -(XA*XB + XT) +#define XVNMSUBADP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 241)) // XT = -(XA*XT - XB) +#define XVNMSUBMDP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 249)) // XT = -(XA*XB - XT) + +// --- VSX Scalar Fused Multiply-Add (XX3-form, opcode 60) --- +// Scalar FMA (double precision scalar in VSX dw0): +#define XSMADDADP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 33)) // XT = XA*XT + XB +#define XSMADDMDP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 41)) // XT = XA*XB + XT +#define XSMSUBADP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 49)) // XT = XA*XT - XB +#define XSMSUBMDP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 57)) // XT = XA*XB - XT +#define XSNMADDADP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 161)) // XT = -(XA*XT + XB) +#define XSNMADDMDP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 169)) // XT = -(XA*XB + XT) +#define XSNMSUBADP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 177)) // XT = -(XA*XT - XB) +#define XSNMSUBMDP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 185)) // XT = -(XA*XB - XT) + +// --- VSX Scalar Fused Multiply-Add, Single-Precision (XX3-form, opcode 60, ISA 2.07/POWER8+) --- +// These perform FMA with single-precision rounding (one round to SP), avoiding double-rounding. +// Operands are in DP format in VSX dw0 (use XSCVSPDPN to widen SP inputs first). +#define XSMADDASP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 1)) // XT = XA*XT + XB +#define XSMADDMSP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 9)) // XT = XA*XB + XT +#define XSMSUBASP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 17)) // XT = XA*XT - XB +#define XSMSUBMSP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 25)) // XT = XA*XB - XT +#define XSNMADDASP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 129)) // XT = -(XA*XT + XB) +#define XSNMADDMSP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 137)) // XT = -(XA*XB + XT) +#define XSNMSUBASP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 145)) // XT = -(XA*XT - XB) +#define XSNMSUBMSP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 153)) // XT = -(XA*XB - XT) + +// --- VSX Scalar FP Arithmetic (XX3-form, opcode 60) --- +// Single-precision (ISA 2.07/POWER8+, XX3-form, full vs0-vs63 range) +// XSADDSP — scalar add single-precision +#define XSADDSP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 0)) +// XSSUBSP — scalar subtract single-precision +#define XSSUBSP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 8)) +// XSMULSP — scalar multiply single-precision +#define XSMULSP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 16)) +// XSDIVSP — scalar divide single-precision +#define XSDIVSP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 24)) +// XSSQRTSP — scalar square root single-precision (XX2-form) +#define XSSQRTSP(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 11)) + +// Double-precision (XX3-form, full vs0-vs63 range) +// XSADDDP — scalar add double-precision +#define XSADDDP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 32)) +// XSSUBDP — scalar subtract double-precision +#define XSSUBDP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 40)) +// XSMULDP — scalar multiply double-precision +#define XSMULDP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 48)) +// XSDIVDP — scalar divide double-precision +#define XSDIVDP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 56)) +// XSSQRTDP — 
scalar square root double-precision (XX2-form) +#define XSSQRTDP(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 75)) +// XSMINDP — scalar minimum double-precision +#define XSMINDP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 168)) +// XSMAXDP — scalar maximum double-precision +#define XSMAXDP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 160)) + +// --- VSX Vector FP Compare (XX3-form) --- +// XVCMPEQDP — vector compare equal double-precision +#define XVCMPEQDP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 99)) +// XVCMPGEDP — vector compare greater-or-equal double-precision +#define XVCMPGEDP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 115)) +// XVCMPGTDP — vector compare greater-than double-precision +#define XVCMPGTDP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 107)) +// XVCMPEQSP — vector compare equal single-precision +#define XVCMPEQSP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 67)) +// XVCMPGESP — vector compare greater-or-equal single-precision +#define XVCMPGESP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 83)) +// XVCMPGTSP — vector compare greater-than single-precision +#define XVCMPGTSP(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 75)) + +// --- Move to/from VSR (POWER8, X-form with TX/SX bit, opcode 31) --- +// XX1-form: like X-form but bit 0 is TX (bit 5 of 6-bit VSX register) instead of Rc +#define XX1_form_gen(opcd, rt6, ra, rb, xo) \ + ((uint32_t)(opcd) << 26 | (((rt6) & 0x1F)) << 21 | ((ra) & 0x1F) << 16 | ((rb) & 0x1F) << 11 | ((xo) & 0x3FF) << 1 | (((rt6) >> 5) & 1)) +// MFVSRD — move from VSR doubleword to GPR (supports vs0-vs63) +#define MFVSRD(Ra, Xs) EMIT(XX1_form_gen(31, Xs, Ra, 0, 51)) +// MTVSRD — move to VSR doubleword from GPR (supports vs0-vs63) +#define MTVSRD(Xt, Ra) EMIT(XX1_form_gen(31, Xt, Ra, 0, 179)) +// MFVSRWZ — move from VSR word (zero-extended) to GPR (supports vs0-vs63) +#define MFVSRWZ(Ra, Xs) EMIT(XX1_form_gen(31, Xs, Ra, 0, 115)) +// MTVSRWZ — move to VSR word (zero-extended) from GPR (supports vs0-vs63) +#define MTVSRWZ(Xt, Ra) EMIT(XX1_form_gen(31, Xt, Ra, 0, 243)) +// MTVSRDD — move to VSR double doubleword from 2 GPRs (POWER9, supports vs0-vs63) +// Xt = (RA || RB), 128 bits +#define MTVSRDD(Xt, Ra, Rb) EMIT(XX1_form_gen(31, Xt, Ra, Rb, 435)) +// MFVSRLD — move from VSR lower doubleword to GPR (POWER9, supports vs0-vs63) +#define MFVSRLD(Ra, Xs) EMIT(XX1_form_gen(31, Xs, Ra, 0, 307)) + +// --- VSX Select (XX4-form) --- +// XX4-form: OPCD(6) | T(5) | A(5) | B(5) | C(5) | XO(2) | CX(1) | AX(1) | BX(1) | TX(1) +#define XX4_form_gen(opcd, t, a, b, c, xo) \ + ((uint32_t)(opcd) << 26 | (((t) & 0x1F)) << 21 | (((a) & 0x1F)) << 16 | (((b) & 0x1F)) << 11 | (((c) & 0x1F)) << 6 | ((xo) & 0x3) << 4 | ((((c) >> 5) & 1)) << 3 | ((((a) >> 5) & 1)) << 2 | ((((b) >> 5) & 1)) << 1 | ((((t) >> 5) & 1))) +// XXSEL — VSX select: XT = (XA & ~XC) | (XB & XC) +#define XXSEL(Xt, Xa, Xb, Xc) EMIT(XX4_form_gen(60, Xt, Xa, Xb, Xc, 3)) + +// --- Vector Extract / Insert (POWER9, VX-form) --- +// VEXTRACTUB — vector extract unsigned byte to VSR +#define VEXTRACTUB(Vrt, Vrb, uimm) EMIT(VX_form_gen(Vrt, (uimm) & 0xF, Vrb, 525)) +// VEXTRACTUH — vector extract unsigned halfword to VSR +#define VEXTRACTUH(Vrt, Vrb, uimm) EMIT(VX_form_gen(Vrt, (uimm) & 0xF, Vrb, 589)) +// VEXTRACTUW — vector extract unsigned word to VSR +#define VEXTRACTUW(Vrt, Vrb, uimm) EMIT(VX_form_gen(Vrt, (uimm) & 0xF, Vrb, 653)) +// VEXTRACTD — vector extract doubleword to VSR +#define VEXTRACTD(Vrt, Vrb, uimm) EMIT(VX_form_gen(Vrt, (uimm) & 0xF, Vrb, 717)) +// VINSERTB — vector insert byte from VSR 
+#define VINSERTB(Vrt, Vrb, uimm) EMIT(VX_form_gen(Vrt, (uimm) & 0xF, Vrb, 781)) +// VINSERTH — vector insert halfword from VSR +#define VINSERTH(Vrt, Vrb, uimm) EMIT(VX_form_gen(Vrt, (uimm) & 0xF, Vrb, 845)) +// VINSERTW — vector insert word from VSR +#define VINSERTW(Vrt, Vrb, uimm) EMIT(VX_form_gen(Vrt, (uimm) & 0xF, Vrb, 909)) +// VINSERTD — vector insert doubleword from VSR +#define VINSERTD(Vrt, Vrb, uimm) EMIT(VX_form_gen(Vrt, (uimm) & 0xF, Vrb, 973)) + +// --- Vector Convert (VX-form) --- +// VCFSX — vector convert from signed fixed-point word (to SP float) +#define VCFSX(Vrt, Vrb, uimm) EMIT(VX_form_gen(Vrt, (uimm) & 0x1F, Vrb, 842)) +// VCFUX — vector convert from unsigned fixed-point word (to SP float) +#define VCFUX(Vrt, Vrb, uimm) EMIT(VX_form_gen(Vrt, (uimm) & 0x1F, Vrb, 778)) +// VCTSXS — vector convert to signed fixed-point word saturate (from SP float) +#define VCTSXS(Vrt, Vrb, uimm) EMIT(VX_form_gen(Vrt, (uimm) & 0x1F, Vrb, 970)) +// VCTUXS — vector convert to unsigned fixed-point word saturate (from SP float) +#define VCTUXS(Vrt, Vrb, uimm) EMIT(VX_form_gen(Vrt, (uimm) & 0x1F, Vrb, 906)) + +// --- MOVD/MOVQ helpers --- +// XXSPLTW — VSX splat word: replicate word XB[UIM2] to all 4 words (XX2-form) +// UIM2 is encoded in bits 16-17 (C-shift), XO = 164 +#define XXSPLTW(Xt, Xb, uim2) \ + EMIT(XX2_form_gen(60, Xt, Xb, 164) | (((uim2) & 0x3) << 16)) + +// XXSPLTIB — VSX splat immediate byte: replicate IMM8 to all 16 bytes (POWER9, X_RD6_IMM8-form) +// Encoding: [60 | XT[0:4] | 00 | IMM8 | 360 | XT[5]] +#define XXSPLTIB(Xt, imm8) \ + EMIT((uint32_t)(60) << 26 | (((Xt) & 0x1F) << 21) | (((imm8) & 0xFF) << 11) | (360 << 1) | (((Xt) >> 5) & 1)) + +// =========================================================================== +// Trap / debug +// =========================================================================== +// TWI — trap word immediate (D-form, opcode 3) +#define TWI(to, Ra, si) EMIT(D_form_gen(3, to, Ra, (si) & 0xFFFF)) +// TDI — trap doubleword immediate (D-form, opcode 2) +#define TDI(to, Ra, si) EMIT(D_form_gen(2, to, Ra, (si) & 0xFFFF)) +// TW — trap word (X-form, opcode 31, xo=4) +#define TW(to, Ra, Rb) EMIT(X_form_gen(31, to, Ra, Rb, 4, 0)) +// TD — trap doubleword (X-form, opcode 31, xo=68) +#define TD(to, Ra, Rb) EMIT(X_form_gen(31, to, Ra, Rb, 68, 0)) +// TRAP — unconditional trap (pseudo: tw 31, 0, 0) +#define TRAP() TW(31, 0, 0) + +// UDF — undefined instruction (for marking dead code, same as trap) +#define UDF(...) 
TRAP() + +// =========================================================================== +// Floating-point arithmetic (A-form) +// =========================================================================== +// A-form: OPCD(6) | FRT(5) | FRA(5) | FRB(5) | FRC(5) | XO(5) | Rc(1) +#define A_form_gen(opcd, frt, fra, frb, frc, xo, rc) \ + ((uint32_t)(opcd) << 26 | ((frt) & 0x1F) << 21 | ((fra) & 0x1F) << 16 | ((frb) & 0x1F) << 11 | ((frc) & 0x1F) << 6 | ((xo) & 0x1F) << 1 | ((rc) & 1)) + +// FADD — FRT = FRA + FRB (double) +#define FADD(Frt, Fra, Frb) EMIT(A_form_gen(63, Frt, Fra, Frb, 0, 21, 0)) +// FADDS — FRT = FRA + FRB (single) +#define FADDS(Frt, Fra, Frb) EMIT(A_form_gen(59, Frt, Fra, Frb, 0, 21, 0)) +// FSUB — FRT = FRA - FRB (double) +#define FSUB(Frt, Fra, Frb) EMIT(A_form_gen(63, Frt, Fra, Frb, 0, 20, 0)) +// FSUBS — FRT = FRA - FRB (single) +#define FSUBS(Frt, Fra, Frb) EMIT(A_form_gen(59, Frt, Fra, Frb, 0, 20, 0)) +// FMUL — FRT = FRA * FRC (double) NOTE: FRC is in the FRC field, FRB=0 +#define FMUL(Frt, Fra, Frc) EMIT(A_form_gen(63, Frt, Fra, 0, Frc, 25, 0)) +// FMULS — FRT = FRA * FRC (single) +#define FMULS(Frt, Fra, Frc) EMIT(A_form_gen(59, Frt, Fra, 0, Frc, 25, 0)) +// FDIV — FRT = FRA / FRB (double) +#define FDIV(Frt, Fra, Frb) EMIT(A_form_gen(63, Frt, Fra, Frb, 0, 18, 0)) +// FDIVS — FRT = FRA / FRB (single) +#define FDIVS(Frt, Fra, Frb) EMIT(A_form_gen(59, Frt, Fra, Frb, 0, 18, 0)) +// FSQRT — FRT = sqrt(FRB) (double) +#define FSQRT(Frt, Frb) EMIT(A_form_gen(63, Frt, 0, Frb, 0, 22, 0)) +// FSQRTS — FRT = sqrt(FRB) (single) +#define FSQRTS(Frt, Frb) EMIT(A_form_gen(59, Frt, 0, Frb, 0, 22, 0)) +// FMADD — FRT = FRA * FRC + FRB (double) +#define FMADD(Frt, Fra, Frc, Frb) EMIT(A_form_gen(63, Frt, Fra, Frb, Frc, 29, 0)) +// FMSUB — FRT = FRA * FRC - FRB (double) +#define FMSUB(Frt, Fra, Frc, Frb) EMIT(A_form_gen(63, Frt, Fra, Frb, Frc, 28, 0)) +// FNMADD — FRT = -(FRA * FRC + FRB) (double) +#define FNMADD(Frt, Fra, Frc, Frb) EMIT(A_form_gen(63, Frt, Fra, Frb, Frc, 31, 0)) +// FNMSUB — FRT = -(FRA * FRC - FRB) (double) +#define FNMSUB(Frt, Fra, Frc, Frb) EMIT(A_form_gen(63, Frt, Fra, Frb, Frc, 30, 0)) +// FSEL — FRT = (FRA >= 0) ? 
FRC : FRB (double) +#define FSEL(Frt, Fra, Frc, Frb) EMIT(A_form_gen(63, Frt, Fra, Frb, Frc, 23, 0)) + +// =========================================================================== +// Floating-point conversion / rounding (X-form, opcode 63) +// =========================================================================== +// FRSP — round to single precision: FRT = (float)FRB +#define FRSP(Frt, Frb) EMIT(X_form_gen(63, Frt, 0, Frb, 12, 0)) +// FCTID — convert to integer doubleword (current rounding mode) +#define FCTID(Frt, Frb) EMIT(X_form_gen(63, Frt, 0, Frb, 814, 0)) +// FCTIDZ — convert to integer doubleword with truncation (round toward zero) +#define FCTIDZ(Frt, Frb) EMIT(X_form_gen(63, Frt, 0, Frb, 815, 0)) +// FCTIW — convert to integer word (current rounding mode) +#define FCTIW(Frt, Frb) EMIT(X_form_gen(63, Frt, 0, Frb, 14, 0)) +// FCTIWZ — convert to integer word with truncation (round toward zero) +#define FCTIWZ(Frt, Frb) EMIT(X_form_gen(63, Frt, 0, Frb, 15, 0)) +// FCFID — convert from integer doubleword to double: FRT = (double)(int64_t)FRB +#define FCFID(Frt, Frb) EMIT(X_form_gen(63, Frt, 0, Frb, 846, 0)) +// FCFIDS — convert from integer doubleword to single: FRT = (float)(int64_t)FRB +#define FCFIDS(Frt, Frb) EMIT(X_form_gen(59, Frt, 0, Frb, 846, 0)) +// FCFIDU — convert from unsigned integer doubleword to double +#define FCFIDU(Frt, Frb) EMIT(X_form_gen(63, Frt, 0, Frb, 974, 0)) +// FCTIDU — convert to unsigned integer doubleword (current rounding) +#define FCTIDU(Frt, Frb) EMIT(X_form_gen(63, Frt, 0, Frb, 942, 0)) +// FCTIDUZ — convert to unsigned integer doubleword with truncation +#define FCTIDUZ(Frt, Frb) EMIT(X_form_gen(63, Frt, 0, Frb, 943, 0)) + +// FABS — FRT = |FRB| +#define FABS(Frt, Frb) EMIT(X_form_gen(63, Frt, 0, Frb, 264, 0)) +// FNABS — FRT = -|FRB| +#define FNABS(Frt, Frb) EMIT(X_form_gen(63, Frt, 0, Frb, 136, 0)) +// FNEG — FRT = -FRB +#define FNEG(Frt, Frb) EMIT(X_form_gen(63, Frt, 0, Frb, 40, 0)) +// FMR — FRT = FRB (move register) +#define FMR(Frt, Frb) EMIT(X_form_gen(63, Frt, 0, Frb, 72, 0)) +// FCPSGN — FRT = copysign(FRB, FRA) (POWER6+) +#define FCPSGN(Frt, Fra, Frb) EMIT(X_form_gen(63, Frt, Fra, Frb, 8, 0)) +// FRIZ — round to integer toward zero (POWER5+) +#define FRIZ(Frt, Frb) EMIT(X_form_gen(63, Frt, 0, Frb, 424, 0)) +// FRIN — round to integer toward nearest (POWER5+) +#define FRIN(Frt, Frb) EMIT(X_form_gen(63, Frt, 0, Frb, 392, 0)) +// FRIP — round to integer toward +infinity (POWER5+) +#define FRIP(Frt, Frb) EMIT(X_form_gen(63, Frt, 0, Frb, 456, 0)) +// FRIM — round to integer toward -infinity (POWER5+) +#define FRIM(Frt, Frb) EMIT(X_form_gen(63, Frt, 0, Frb, 488, 0)) + +// =========================================================================== +// Floating-point compare (X-form, opcode 63) +// =========================================================================== +// FCMPU — compare unordered, result in CR field +// X-form: 63 | BF(3)||00 | FRA(5) | FRB(5) | XO=0 | 0 +#define FCMPU(bf, Fra, Frb) EMIT(X_form_gen(63, ((bf) & 0x7) << 2, Fra, Frb, 0, 0)) +// FCMPO — compare ordered, result in CR field +#define FCMPO(bf, Fra, Frb) EMIT(X_form_gen(63, ((bf) & 0x7) << 2, Fra, Frb, 32, 0)) + +// =========================================================================== +// Floating-point status and control register (FPSCR) +// =========================================================================== +// MFFS — move from FPSCR: FRT gets the full 64-bit FPSCR value +// X-form: 63 | FRT(5) | 00000 | 00000 | 583 | 0 +#define MFFS(Frt) 
EMIT(X_form_gen(63, Frt, 0, 0, 583, 0))
+// MFFSd — MFFS and set CR1 (mffs.)
+#define MFFSd(Frt) EMIT(X_form_gen(63, Frt, 0, 0, 583, 1))
+
+// MTFSF — move to FPSCR fields: set FPSCR fields selected by FLM from FRB
+// XFL-form: 63 | 0 | FLM(8) | 0 | FRB(5) | 711 | Rc
+// FLM is an 8-bit mask, each bit selects one of 8 4-bit fields of FPSCR
+#define MTFSF_gen(flm, frb, rc) \
+    ((uint32_t)(63) << 26 | ((flm) & 0xFF) << 17 | ((frb) & 0x1F) << 11 | (711) << 1 | ((rc) & 1))
+#define MTFSF(flm, Frb) EMIT(MTFSF_gen(flm, Frb, 0))
+#define MTFSFd(flm, Frb) EMIT(MTFSF_gen(flm, Frb, 1))
+
+// MTFSFI — move to FPSCR field immediate
+// X-form variant: 63 | BF(3)||0||W(1) | 0000 | U(4)||0 | 134 | Rc
+#define MTFSFI_gen(bf, u, rc) \
+    ((uint32_t)(63) << 26 | ((bf) & 0x7) << 23 | (((u) & 0xF) << 1) << 11 | (134) << 1 | ((rc) & 1))
+#define MTFSFI(bf, u) EMIT(MTFSFI_gen(bf, u, 0))
+// MTFSB0 — clear FPSCR bit
+#define MTFSB0(bt) EMIT(X_form_gen(63, bt, 0, 0, 70, 0))
+// MTFSB1 — set FPSCR bit
+#define MTFSB1(bt) EMIT(X_form_gen(63, bt, 0, 0, 38, 0))
+
+// ===========================================================================
+// VSX instructions (XX3-form, XX2-form)
+// ===========================================================================
+// XX3-form: OPCD(6) | T(5) | A(5) | B(5) | XO(8) | AX(1) | BX(1) | TX(1)
+// T/A/B are low 5 bits of 6-bit VSR index; TX/AX/BX are the high bits
+#define XX3_form_gen(opcd, t, a, b, xo) \
+    ((uint32_t)(opcd) << 26 | (((t) & 0x1F)) << 21 | (((a) & 0x1F)) << 16 | (((b) & 0x1F)) << 11 | ((xo) & 0xFF) << 3 | ((((a) >> 5) & 1)) << 2 | ((((b) >> 5) & 1)) << 1 | ((((t) >> 5) & 1)))
+
+// XXLOR — VSX logical OR: XT = XA | XB (128-bit)
+// Also used as register move: XXLOR(xt, xa, xa)
+#define XXLOR(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 146))
+// XXLXOR — VSX logical XOR: XT = XA ^ XB (128-bit)
+// Also used to zero a register: XXLXOR(xt, xt, xt)
+#define XXLXOR(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 154))
+// XXLAND — VSX logical AND: XT = XA & XB (128-bit)
+#define XXLAND(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 130))
+// XXLANDC — VSX logical AND with complement: XT = XA & ~XB
+#define XXLANDC(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 138))
+// XXLNOR — VSX logical NOR: XT = ~(XA | XB)
+#define XXLNOR(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 162))
+// XXLEQV — VSX logical equivalence: XT = ~(XA ^ XB)
+#define XXLEQV(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 186))
+// XXLORC — VSX logical OR with complement: XT = XA | ~XB
+#define XXLORC(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 170))
+// XXLNAND — VSX logical NAND: XT = ~(XA & XB)
+#define XXLNAND(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 178))
+
+// XXMRGHD — VSX merge high doubleword (= xxpermdi with DM=0)
+#define XXMRGHD(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, 10))
+// XXMRGLD — VSX merge low doubleword (= xxpermdi with DM=3)
+#define XXMRGLD(Xt, Xa, Xb) EMIT(XX3_form_gen(60, Xt, Xa, Xb, (10 | (3 << 5))))
+// XXPERMDI — VSX permute doubleword immediate
+// XX3-form with DM(2) at bits 5:6 (counting from 0) of the 8-bit XO field
+#define XXPERMDI(Xt, Xa, Xb, dm) \
+    EMIT(XX3_form_gen(60, Xt, Xa, Xb, (10 | (((dm) & 0x3) << 5))))
+
+// XX2-form: OPCD(6) | T(5) | 00000 | B(5) | XO(9) | BX(1) | TX(1)
+#define XX2_form_gen(opcd, t, b, xo) \
+    ((uint32_t)(opcd) << 26 | (((t) & 0x1F)) << 21 | (((b) & 0x1F)) << 11 | ((xo) & 0x1FF) << 2 | ((((b) >> 5) & 1)) << 1 | ((((t) >> 5) & 1)))
+
+// XSCVDPSP — VSX scalar convert double to single (result in word 0)
+#define XSCVDPSP(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 265))
+// XSCVDPSPN — VSX scalar convert double
to single non-signaling (result in word 0, no exceptions) +#define XSCVDPSPN(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 267)) +// XSCVSPDP — VSX scalar convert single (word 0) to double +#define XSCVSPDP(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 329)) +// XSCVSPDPN — VSX scalar convert single (word 0) to double non-signaling (no exceptions) +#define XSCVSPDPN(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 331)) +// XSCVSXDDP — VSX scalar convert signed int64 to double +#define XSCVSXDDP(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 376)) +// XSCVSXDSP — VSX scalar convert signed int64 to single +#define XSCVSXDSP(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 312)) +// XSCVDPSXDS — VSX scalar convert double to signed int64 (truncation) +#define XSCVDPSXDS(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 344)) +// XSCVDPSXWS — VSX scalar convert double to signed int32 (truncation) +#define XSCVDPSXWS(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 88)) +// XSCVDPUXDS — VSX scalar convert double to unsigned int64 (truncation) +#define XSCVDPUXDS(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 328)) +// XSCVUXDDP — VSX scalar convert unsigned int64 to double +#define XSCVUXDDP(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 360)) + +// XVCVDPSP — VSX vector convert 2 doubles to 2 singles (packed) +#define XVCVDPSP(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 393)) +// XVCVSPDP — VSX vector convert 2 singles to 2 doubles +#define XVCVSPDP(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 457)) +// XVCVSXDDP — VSX vector convert 2 signed int64 to 2 doubles +#define XVCVSXDDP(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 504)) +// XVCVDPSXDS — VSX vector convert 2 doubles to 2 signed int64 +#define XVCVDPSXDS(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 472)) +// XVCVDPSXWS — VSX vector convert 2 doubles to 2 signed int32 (results in odd words) +#define XVCVDPSXWS(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 216)) +// XVCVSXWDP — VSX vector convert 2 signed int32 (odd words) to 2 doubles +#define XVCVSXWDP(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 248)) +// XVCVSPSXWS — VSX vector convert 4 singles to 4 signed int32 +#define XVCVSPSXWS(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 152)) +// XVCVSXWSP — VSX vector convert 4 signed int32 to 4 singles +#define XVCVSXWSP(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 184)) +// XVRDPIZ — VSX vector round double to integer toward zero +#define XVRDPIZ(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 217)) +// XVRDPI — VSX vector round double to integer nearest (ties to even) +#define XVRDPI(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 201)) +// XVRDPIC — VSX vector round double to integer using current rounding mode +#define XVRDPIC(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 235)) +// XVRDPIM — VSX vector round double to integer toward -infinity (floor) +#define XVRDPIM(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 249)) +// XVRDPIP — VSX vector round double to integer toward +infinity (ceil) +#define XVRDPIP(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 233)) +// XVRSPIZ — VSX vector round single to integer toward zero +#define XVRSPIZ(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 153)) +// XVRSPI — VSX vector round single to integer nearest (ties to even) +#define XVRSPI(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 137)) +// XVRSPIC — VSX vector round single to integer using current rounding mode +#define XVRSPIC(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 171)) +// XVRSPIM — VSX vector round single to integer toward -infinity (floor) +#define XVRSPIM(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 185)) +// XVRSPIP — VSX vector round single to integer toward +infinity (ceil) +#define XVRSPIP(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 169)) +// XSRDPI — VSX 
scalar round double to integer nearest (ties to even) +#define XSRDPI(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 73)) +// XSRDPIC — VSX scalar round double to integer using current rounding mode +#define XSRDPIC(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 107)) +// XSRDPIM — VSX scalar round double to integer toward -infinity (floor) +#define XSRDPIM(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 121)) +// XSRDPIP — VSX scalar round double to integer toward +infinity (ceil) +#define XSRDPIP(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 105)) +// XSRDPIZ — VSX scalar round double to integer toward zero (truncation) +#define XSRDPIZ(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 89)) + +// XVABSDP — VSX vector absolute double +#define XVABSDP(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 473)) +// XVNEGDP — VSX vector negate double +#define XVNEGDP(Xt, Xb) EMIT(XX2_form_gen(60, Xt, Xb, 505)) + +// =========================================================================== +// VSX scalar FP compare (XX3-form, opcode 60) +// =========================================================================== +// XSCMPUDP — VSX scalar compare unordered double, result in CR field BF +#define XSCMPUDP(bf, Xa, Xb) EMIT(XX3_form_gen(60, ((bf) & 0x7) << 2, Xa, Xb, 35)) +// XSCMPODP — VSX scalar compare ordered double, result in CR field BF +#define XSCMPODP(bf, Xa, Xb) EMIT(XX3_form_gen(60, ((bf) & 0x7) << 2, Xa, Xb, 43)) + +// =========================================================================== +// ISEL — Integer Select (X-form, opcode 31) +// =========================================================================== +// ISEL(RT, RA, RB, BC): if CR bit BC is set, RT=RA; else RT=RB +// If RA=0, the value 0 is used instead of GPR[0] +#define ISEL(Rt, Ra, Rb, BC) EMIT(X_form_gen(31, Rt, Ra, Rb, (((BC) & 0x1F) << 5) | 15, 0)) + +// =========================================================================== +// Convenience macros for the dynarec framework +// =========================================================================== + +// Move register (already defined as MR/MV above) +// MOVx_REG — alias for compatibility with ARM64 macro names +#define MOVx_REG(Rd, Rs) MR(Rd, Rs) + +// =========================================================================== +// PUSH / POP x86-64 emulation stack macros +// =========================================================================== +// POP1 — pop 64-bit value from x86 RSP +#define POP1(reg) \ + do { \ + LD(reg, 0, xRSP); \ + if (reg != xRSP) ADDI(xRSP, xRSP, 8); \ + } while (0) + +// PUSH1 — push 64-bit value to x86 RSP +#define PUSH1(reg) \ + do { \ + STD(reg, -8, xRSP); \ + ADDI(xRSP, xRSP, -8); \ + } while (0) + +// POP1_32 — pop 32-bit value from x86 RSP +#define POP1_32(reg) \ + do { \ + LWZ(reg, 0, xRSP); \ + if (reg != xRSP) ADDI(xRSP, xRSP, 4); \ + } while (0) + +// PUSH1_32 — push 32-bit value to x86 RSP +#define PUSH1_32(reg) \ + do { \ + STW(reg, -4, xRSP); \ + ADDI(xRSP, xRSP, -4); \ + } while (0) + +// POP1_16 — pop 16-bit value from x86 RSP +#define POP1_16(reg) \ + do { \ + LHZ(reg, 0, xRSP); \ + if (reg != xRSP) ADDI(xRSP, xRSP, 2); \ + } while (0) + +// PUSH1_16 — push 16-bit value to x86 RSP +#define PUSH1_16(reg) \ + do { \ + STH(reg, -2, xRSP); \ + ADDI(xRSP, xRSP, -2); \ + } while (0) + +// POP1z — pop 32 or 64 bits depending on rex.is32bits +#define POP1z(reg) \ + do { \ + if (rex.is32bits) { \ + POP1_32(reg); \ + } else { \ + POP1(reg); \ + } \ + } while (0) + +// PUSH1z — push 32 or 64 bits depending on rex.is32bits +#define PUSH1z(reg) \ + do { \ + if (rex.is32bits) { \ 
+ PUSH1_32(reg); \ + } else { \ + PUSH1(reg); \ + } \ + } while (0) + +// =========================================================================== +// Branch convenience macros +// =========================================================================== +// BNEZ_safe — branch if register is non-zero, with near/far fallback +// PPC64LE conditional branches have ±32KB range, unconditional ±32MB. +// imm is byte offset from the start of this macro sequence to the target. +// Always emits exactly 3 instructions (CMPDI + cond_branch + NOP/B). +// Near case: CMPDI(+0) BNE(+4) NOP(+8) +// Far case: CMPDI(+0) BEQ_skip(+4) B(+8) +#define BNEZ_safe(rj, imm) \ + do { \ + CMPDI(rj, 0); \ + if ((imm) > -0x7000 && (imm) < 0x7000) { \ + BNE((imm) - 4); \ + NOP(); \ + } else { \ + BEQ(2 * 4); \ + B((imm) - 8); \ + } \ + } while (0) + +// BEZ_safe — branch if register is zero, with near/far fallback +#define BEZ_safe(rj, imm) \ + do { \ + CMPDI(rj, 0); \ + if ((imm) > -0x7000 && (imm) < 0x7000) { \ + BEQ((imm) - 4); \ + NOP(); \ + } else { \ + BNE(2 * 4); \ + B((imm) - 8); \ + } \ + } while (0) + +// Aliases for GOCOND macro compatibility (B##EQZ##_safe, B##NEZ##_safe) +#define BEQZ_safe(rj, imm) BEZ_safe(rj, imm) + +// IMARK / GETIP / MARKLOCK etc. will be defined in the pass headers + +#endif // __PPC64LE_EMITTER_H__ diff --git a/src/dynarec/ppc64le/ppc64le_epilog.S b/src/dynarec/ppc64le/ppc64le_epilog.S new file mode 100644 index 0000000000..90dd2938cc --- /dev/null +++ b/src/dynarec/ppc64le/ppc64le_epilog.S @@ -0,0 +1,98 @@ +// PPC64LE epilog for dynarec +// Store x86 register state back to emu struct, +// restore callee-saved registers, return to caller. +// Called with: +// Emu (r31) = pointer to x64emu_t +// +// Must store back all x86 registers including RIP. 
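+// Layout assumed by the stores below (a sketch implied by the offsets used in
+// this file, not the authoritative declaration — that is x64emu_t in the emu
+// headers): the 16 x86-64 GPRs at 8*0..8*15 (RAX first), the packed flags
+// word at 8*16, and RIP at 8*17.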
+ +#define ASM_MAPPING +#include "ppc64le_mapping.h" + +.text +.align 4 + +.global ppc64le_epilog +.type ppc64le_epilog, @function +ppc64le_epilog: + // Store x86 registers back to emu struct + // Emu (r31) points to x64emu_t + std RAX, (8 * 0)(Emu) // r14 + std RCX, (8 * 1)(Emu) // r15 + std RDX, (8 * 2)(Emu) // r16 + std RBX, (8 * 3)(Emu) // r17 + std RSP, (8 * 4)(Emu) // r18 + std RBP, (8 * 5)(Emu) // r19 + std RSI, (8 * 6)(Emu) // r20 + std RDI, (8 * 7)(Emu) // r21 + std R8, (8 * 8)(Emu) // r22 + std R9, (8 * 9)(Emu) // r23 + std R10, (8 * 10)(Emu) // r24 + std R11, (8 * 11)(Emu) // r25 + std R12, (8 * 12)(Emu) // r26 + std R13, (8 * 13)(Emu) // r27 + std R14, (8 * 14)(Emu) // r28 + std R15, (8 * 15)(Emu) // r29 + + // Store flags + std Flags, (8 * 16)(Emu) // r30 + + // Store RIP (r9 = xRIP, must contain current IP) + std RIP, (8 * 17)(Emu) // r9 + + // Restore stack pointer from emu->xSPSave + // xSPSave stores the frame base SP saved by the prolog + ld 1, 808(Emu) + + // Restore old xSPSave for nested dynarec calls + ld 0, 192(1) + std 0, 808(Emu) + + // Restore TOC + ld 2, 24(1) + + // Restore CR + lwz 0, 8(1) + mtcrf 0xFF, 0 + + // Restore callee-saved VMX registers + // vr20 (= vs52): scratch register index 28 + lxv 52, 208(1) + // vr24-vr31 (= vs56-vs63): MMX registers 0-7 + lxv 56, 224(1) + lxv 57, 240(1) + lxv 58, 256(1) + lxv 59, 272(1) + lxv 60, 288(1) + lxv 61, 304(1) + lxv 62, 320(1) + lxv 63, 336(1) + + // Restore callee-saved GPRs + ld 14, 32(1) + ld 15, 40(1) + ld 16, 48(1) + ld 17, 56(1) + ld 18, 64(1) + ld 19, 72(1) + ld 20, 80(1) + ld 21, 88(1) + ld 22, 96(1) + ld 23, 104(1) + ld 24, 112(1) + ld 25, 120(1) + ld 26, 128(1) + ld 27, 136(1) + ld 28, 144(1) + ld 29, 152(1) + ld 30, 160(1) + ld 31, 168(1) + + // Deallocate frame + addi 1, 1, 352 + + // Restore LR and return + ld 0, 16(1) + mtlr 0 + blr +.size ppc64le_epilog, .-ppc64le_epilog diff --git a/src/dynarec/ppc64le/ppc64le_lock.S b/src/dynarec/ppc64le/ppc64le_lock.S new file mode 100644 index 0000000000..0e1c2b3ab3 --- /dev/null +++ b/src/dynarec/ppc64le/ppc64le_lock.S @@ -0,0 +1,757 @@ +// PPC64LE lock helper functions +// Atomic read/write operations using Load-and-Reserve / Store-Conditional +// PPC64LE uses: lbarx/stbcx. (byte), lharx/sthcx. (halfword), +// lwarx/stwcx. (word), ldarx/stdcx. (doubleword) +// All with lwsync/sync barriers for memory ordering. 
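+// For reference, each helper below hand-codes what a compiler builtin would
+// emit; e.g. ppc64le_lock_xchg_dd behaves like this C sketch (illustrative
+// only, not used by the build):
+//
+//     uint64_t xchg_dd(uint64_t* p, uint64_t v) {
+//         return __atomic_exchange_n(p, v, __ATOMIC_ACQ_REL);
+//     }
+//
+// i.e. an lwsync barrier, a larx/stcx. reservation loop that retries until
+// the store-conditional succeeds, then a trailing lwsync.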
+ +.text +.align 4 + +.global ppc64le_lock_read_b +.global ppc64le_lock_write_b +.global ppc64le_lock_read_h +.global ppc64le_lock_write_h +.global ppc64le_lock_read_d +.global ppc64le_lock_write_d +.global ppc64le_lock_read_dd +.global ppc64le_lock_write_dd +.global ppc64le_lock_xchg_dd +.global ppc64le_lock_xchg_d +.global ppc64le_lock_xchg_h +.global ppc64le_lock_xchg_b +.global ppc64le_lock_storeifnull +.global ppc64le_lock_storeifnull_d +.global ppc64le_lock_storeifref +.global ppc64le_lock_storeifref2 +.global ppc64le_lock_storeifref_d +.global ppc64le_lock_storeifref2_d +.global ppc64le_lock_decifnot0b +.global ppc64le_lock_storeb +.global ppc64le_lock_incif0 +.global ppc64le_lock_decifnot0 +.global ppc64le_lock_store +.global ppc64le_lock_store_dd +.global ppc64le_lock_get_b +.global ppc64le_lock_get_d +.global ppc64le_lock_get_dd +.global ppc64le_crc + +// ============================================================================ +// Byte operations +// ============================================================================ +.type ppc64le_lock_read_b, @function +ppc64le_lock_read_b: + // address is r3, return is r3 + lwsync + lbarx 3, 0, 3 + blr +.size ppc64le_lock_read_b, .-ppc64le_lock_read_b + +.type ppc64le_lock_write_b, @function +ppc64le_lock_write_b: + // address is r3, value is r4, return is r3 (0=success, 1=fail) + mr 5, 3 + stbcx. 4, 0, 5 + mfcr 3 + rlwinm 3, 3, 3, 31, 31 // extract CR0[EQ]: 1 if success + xori 3, 3, 1 // invert: 0 = success, 1 = fail + lwsync + blr +.size ppc64le_lock_write_b, .-ppc64le_lock_write_b + +// ============================================================================ +// Halfword operations +// ============================================================================ +.type ppc64le_lock_read_h, @function +ppc64le_lock_read_h: + // address is r3, return is r3 + lwsync + lharx 3, 0, 3 + blr +.size ppc64le_lock_read_h, .-ppc64le_lock_read_h + +.type ppc64le_lock_write_h, @function +ppc64le_lock_write_h: + // address is r3, value is r4, return is r3 + mr 5, 3 + sthcx. 4, 0, 5 + mfcr 3 + rlwinm 3, 3, 3, 31, 31 + xori 3, 3, 1 + lwsync + blr +.size ppc64le_lock_write_h, .-ppc64le_lock_write_h + +// ============================================================================ +// Word (32-bit) operations +// ============================================================================ +.type ppc64le_lock_read_d, @function +ppc64le_lock_read_d: + // address is r3, return is r3 + lwsync + lwarx 3, 0, 3 + blr +.size ppc64le_lock_read_d, .-ppc64le_lock_read_d + +.type ppc64le_lock_write_d, @function +ppc64le_lock_write_d: + // address is r3, value is r4, return is r3 + mr 5, 3 + stwcx. 4, 0, 5 + mfcr 3 + rlwinm 3, 3, 3, 31, 31 + xori 3, 3, 1 + lwsync + blr +.size ppc64le_lock_write_d, .-ppc64le_lock_write_d + +// ============================================================================ +// Doubleword (64-bit) operations +// ============================================================================ +.type ppc64le_lock_read_dd, @function +ppc64le_lock_read_dd: + // address is r3, return is r3 + lwsync + ldarx 3, 0, 3 + blr +.size ppc64le_lock_read_dd, .-ppc64le_lock_read_dd + +.type ppc64le_lock_write_dd, @function +ppc64le_lock_write_dd: + // address is r3, value is r4, return is r3 + mr 5, 3 + stdcx. 
4, 0, 5 + mfcr 3 + rlwinm 3, 3, 3, 31, 31 + xori 3, 3, 1 + lwsync + blr +.size ppc64le_lock_write_dd, .-ppc64le_lock_write_dd + +// ============================================================================ +// Exchange operations (atomic swap) +// ============================================================================ +.type ppc64le_lock_xchg_dd, @function +ppc64le_lock_xchg_dd: + // address is r3, value is r4, return old value in r3 + lwsync +1: + ldarx 5, 0, 3 + stdcx. 4, 0, 3 + bne 1b + lwsync + mr 3, 5 + blr +.size ppc64le_lock_xchg_dd, .-ppc64le_lock_xchg_dd + +.type ppc64le_lock_xchg_d, @function +ppc64le_lock_xchg_d: + // address is r3, value is r4, return old value in r3 + lwsync +1: + lwarx 5, 0, 3 + stwcx. 4, 0, 3 + bne 1b + lwsync + mr 3, 5 + blr +.size ppc64le_lock_xchg_d, .-ppc64le_lock_xchg_d + +.type ppc64le_lock_xchg_h, @function +ppc64le_lock_xchg_h: + // address is r3, value is r4, return old value in r3 + lwsync +1: + lharx 5, 0, 3 + sthcx. 4, 0, 3 + bne 1b + lwsync + mr 3, 5 + blr +.size ppc64le_lock_xchg_h, .-ppc64le_lock_xchg_h + +.type ppc64le_lock_xchg_b, @function +ppc64le_lock_xchg_b: + // address is r3, value is r4, return old value in r3 + lwsync +1: + lbarx 5, 0, 3 + stbcx. 4, 0, 3 + bne 1b + lwsync + mr 3, 5 + blr +.size ppc64le_lock_xchg_b, .-ppc64le_lock_xchg_b + +// ============================================================================ +// Conditional store operations +// ============================================================================ +.type ppc64le_lock_storeifnull, @function +ppc64le_lock_storeifnull: + // address is r3, value is r4 + // store r4 to [r3] only if [r3] is 0. return old [r3] value + lwsync +1: + ldarx 5, 0, 3 + cmpdi 5, 0 + bne 2f + stdcx. 4, 0, 3 + bne 1b +2: + lwsync + mr 3, 5 + blr +.size ppc64le_lock_storeifnull, .-ppc64le_lock_storeifnull + +.type ppc64le_lock_storeifnull_d, @function +ppc64le_lock_storeifnull_d: + // address is r3, value is r4 + // store r4 (word) to [r3] only if [r3] is 0. return old [r3] value + lwsync +1: + lwarx 5, 0, 3 + cmpwi 5, 0 + bne 2f + stwcx. 4, 0, 3 + bne 1b +2: + lwsync + mr 3, 5 + blr +.size ppc64le_lock_storeifnull_d, .-ppc64le_lock_storeifnull_d + +.type ppc64le_lock_storeifref, @function +ppc64le_lock_storeifref: + // address is r3, value is r4, ref is r5 + // store r4 to [r3] only if [r3] == r5. return new [r3] value (r4 or old) + lwsync +1: + ldarx 6, 0, 3 + cmpd 5, 6 + bne 2f + stdcx. 4, 0, 3 + bne 1b + mr 3, 4 + blr +2: + mr 3, 6 + blr +.size ppc64le_lock_storeifref, .-ppc64le_lock_storeifref + +.type ppc64le_lock_storeifref2, @function +ppc64le_lock_storeifref2: + // address is r3, value is r4, ref is r5 + // store r4 to [r3] only if [r3] == r5. return old [r3] value + lwsync +1: + ldarx 6, 0, 3 + cmpd 5, 6 + bne 2f + stdcx. 4, 0, 3 + bne 1b +2: + mr 3, 6 + blr +.size ppc64le_lock_storeifref2, .-ppc64le_lock_storeifref2 + +.type ppc64le_lock_storeifref_d, @function +ppc64le_lock_storeifref_d: + // address is r3, value is r4 (word), ref is r5 (word) + // store r4 to [r3] only if [r3] == r5. return new [r3] value + lwsync +1: + lwarx 6, 0, 3 + cmpw 5, 6 + bne 2f + stwcx. 4, 0, 3 + bne 1b + mr 3, 4 + blr +2: + mr 3, 6 + blr +.size ppc64le_lock_storeifref_d, .-ppc64le_lock_storeifref_d + +.type ppc64le_lock_storeifref2_d, @function +ppc64le_lock_storeifref2_d: + // address is r3, value is r4 (word), ref is r5 (word) + // store r4 to [r3] only if [r3] == r5. return old [r3] value + lwsync +1: + lwarx 6, 0, 3 + cmpw 5, 6 + bne 2f + stwcx. 
4, 0, 3 + bne 1b +2: + mr 3, 6 + blr +.size ppc64le_lock_storeifref2_d, .-ppc64le_lock_storeifref2_d + +// ============================================================================ +// Misc lock operations +// ============================================================================ +.type ppc64le_lock_decifnot0b, @function +ppc64le_lock_decifnot0b: + // address is r3, decrement byte at [r3] if not 0 + lwsync +1: + lbarx 4, 0, 3 + cmpwi 4, 0 + beq 2f + addi 4, 4, -1 + stbcx. 4, 0, 3 + bne 1b +2: + blr +.size ppc64le_lock_decifnot0b, .-ppc64le_lock_decifnot0b + +.type ppc64le_lock_storeb, @function +ppc64le_lock_storeb: + // address is r3, value is r4 + stb 4, 0(3) + lwsync + blr +.size ppc64le_lock_storeb, .-ppc64le_lock_storeb + +.type ppc64le_lock_decifnot0, @function +ppc64le_lock_decifnot0: + // address is r3, return old value in r3 + lwsync +1: + lwarx 4, 0, 3 + cmpwi 4, 0 + beq 2f + addi 5, 4, -1 + stwcx. 5, 0, 3 + bne 1b +2: + mr 3, 4 + blr +.size ppc64le_lock_decifnot0, .-ppc64le_lock_decifnot0 + +.type ppc64le_lock_incif0, @function +ppc64le_lock_incif0: + // address is r3, increment word at [r3] if 0. return old value in r3 + lwsync +1: + lwarx 4, 0, 3 + cmpwi 4, 0 + bne 2f + addi 5, 4, 1 + stwcx. 5, 0, 3 + bne 1b +2: + mr 3, 4 + blr +.size ppc64le_lock_incif0, .-ppc64le_lock_incif0 + +.type ppc64le_lock_store, @function +ppc64le_lock_store: + // address is r3, value is r4 (word) + stw 4, 0(3) + lwsync + blr +.size ppc64le_lock_store, .-ppc64le_lock_store + +.type ppc64le_lock_store_dd, @function +ppc64le_lock_store_dd: + // address is r3, value is r4 (doubleword) + std 4, 0(3) + lwsync + blr +.size ppc64le_lock_store_dd, .-ppc64le_lock_store_dd + +.type ppc64le_lock_get_b, @function +ppc64le_lock_get_b: + // address is r3, return byte in r3 + lbarx 3, 0, 3 + blr +.size ppc64le_lock_get_b, .-ppc64le_lock_get_b + +.type ppc64le_lock_get_d, @function +ppc64le_lock_get_d: + // address is r3, return word in r3 + lwarx 3, 0, 3 + blr +.size ppc64le_lock_get_d, .-ppc64le_lock_get_d + +.type ppc64le_lock_get_dd, @function +ppc64le_lock_get_dd: + // address is r3, return doubleword in r3 + ldarx 3, 0, 3 + blr +.size ppc64le_lock_get_dd, .-ppc64le_lock_get_dd + +// ============================================================================ +// CRC32 — slicing-by-4 software implementation +// PPC64LE has no hardware CRC32 instruction, so we use 4 lookup tables (4KB). +// Uses polynomial 0x1EDC6F41 (CRC-32C / Castagnoli), bit-reversed as 0x82F63B78. +// Processes 4 bytes per iteration with 4 independent table lookups (good ILP), +// with byte-at-a-time tail using tab0. 
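+//
+// For reference, the loop below is a transcription of this C model (an
+// illustrative sketch; tab0..tab3 are the .Lcrc32c_tab* tables at the end
+// of this file):
+//     uint32_t crc = 0;
+//     while (len >= 4) {
+//         uint32_t w = crc ^ *(const uint32_t*)p;   // little-endian load
+//         crc = tab3[w & 0xFF] ^ tab2[(w >> 8) & 0xFF]
+//             ^ tab1[(w >> 16) & 0xFF] ^ tab0[w >> 24];
+//         p += 4; len -= 4;
+//     }
+//     while (len--)
+//         crc = (crc >> 8) ^ tab0[(crc ^ *p++) & 0xFF];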
+// ============================================================================ +.type ppc64le_crc, @function +ppc64le_crc: + // r3 = address, r4 = length (bytes) + // return CRC in r3 + // Registers: + // r3 = crc (return value) + // r4 = remaining length + // r5 = address pointer + // r6 = tab0 base + // r7 = tab1 base (tab0 + 1024) + // r8 = tab2 base (tab0 + 2048) + // r9 = tab3 base (tab0 + 3072) + // r10 = temp (loaded word) + // r11 = temp (table lookup result) + // r12 = temp (table lookup result) + // r0 = temp (table lookup result) + mr 5, 3 // address in r5 + li 3, 0 // crc = 0 + cmpwi 4, 0 + beqlr // return 0 if length == 0 + + // Load table base address (PC-relative via bcl/mflr trick) + mflr 10 // save LR in r10 + bcl 20, 31, .+4 // branch-and-link to next insn (sets LR = here+4) +.Lcrc_here: + mflr 6 // r6 = address of .Lcrc_here + mtlr 10 // restore LR + addis 6, 6, (.Lcrc32c_tab0 - .Lcrc_here)@ha + addi 6, 6, (.Lcrc32c_tab0 - .Lcrc_here)@l + + // Set up table base pointers: tab1 = tab0+1024, tab2 = tab0+2048, tab3 = tab0+3072 + addi 7, 6, 1024 // r7 = tab1 + addi 8, 6, 2048 // r8 = tab2 + addi 9, 6, 3072 // r9 = tab3 + + // Main loop: slicing-by-4, process 4 bytes per iteration + cmplwi 4, 4 + blt .Lcrc_tail + +.Lcrc_4byte_loop: + // Load 4 bytes as a little-endian word and XOR with crc + lwz 10, 0(5) // r10 = *(uint32_t*)addr (LE on LE machine) + xor 10, 3, 10 // r10 = crc ^ word + + // 4 independent table lookups — good ILP on superscalar POWER9 + // tab3[(r10 >> 0) & 0xFF] + clrldi 11, 10, 56 // r11 = r10 & 0xFF (byte 0) + sldi 11, 11, 2 // r11 *= 4 + lwzx 11, 9, 11 // r11 = tab3[byte0] + + // tab2[(r10 >> 8) & 0xFF] + srwi 12, 10, 8 + clrldi 12, 12, 56 // r12 = (r10 >> 8) & 0xFF (byte 1) + sldi 12, 12, 2 // r12 *= 4 + lwzx 12, 8, 12 // r12 = tab2[byte1] + + // tab1[(r10 >> 16) & 0xFF] + srwi 0, 10, 16 + clrldi 0, 0, 56 // r0 = (r10 >> 16) & 0xFF (byte 2) + sldi 0, 0, 2 // r0 *= 4 + lwzx 0, 7, 0 // r0 = tab1[byte2] + + // tab0[(r10 >> 24) & 0xFF] + srwi 10, 10, 24 // r10 = (r10 >> 24) & 0xFF (byte 3) + sldi 10, 10, 2 // r10 *= 4 + lwzx 10, 6, 10 // r10 = tab0[byte3] + + // XOR all 4 results together + xor 3, 11, 12 // crc = tab3[b0] ^ tab2[b1] + xor 3, 3, 0 // crc ^= tab1[b2] + xor 3, 3, 10 // crc ^= tab0[b3] + + addi 5, 5, 4 + addi 4, 4, -4 + cmplwi 4, 4 + bge .Lcrc_4byte_loop + +.Lcrc_tail: + // Process remaining 0-3 bytes one at a time using tab0 + cmpwi 4, 0 + beqlr + +.Lcrc_tail_loop: + lbz 10, 0(5) + xor 11, 3, 10 + clrldi 11, 11, 56 + sldi 11, 11, 2 + lwzx 11, 6, 11 + srwi 3, 3, 8 + xor 3, 3, 11 + addi 5, 5, 1 + addi 4, 4, -1 + cmpwi 4, 0 + bne .Lcrc_tail_loop + + blr +.size ppc64le_crc, .-ppc64le_crc + +// CRC32C slicing-by-4 lookup tables (polynomial 0x82F63B78) +// 4 tables x 256 entries x 4 bytes = 4KB total +.section .rodata +.align 4 +.Lcrc32c_tab0: + .long 0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4 + .long 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB + .long 0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B + .long 0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24 + .long 0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B + .long 0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384 + .long 0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54 + .long 0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B + .long 0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A + .long 0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35 + .long 0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5 + .long 0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA + .long 0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45 + 
.long 0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A + .long 0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A + .long 0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595 + .long 0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48 + .long 0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957 + .long 0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687 + .long 0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198 + .long 0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927 + .long 0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38 + .long 0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8 + .long 0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7 + .long 0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096 + .long 0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789 + .long 0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859 + .long 0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46 + .long 0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9 + .long 0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6 + .long 0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36 + .long 0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829 + .long 0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C + .long 0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93 + .long 0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043 + .long 0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C + .long 0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3 + .long 0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC + .long 0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C + .long 0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033 + .long 0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652 + .long 0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D + .long 0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D + .long 0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982 + .long 0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D + .long 0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622 + .long 0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2 + .long 0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED + .long 0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530 + .long 0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F + .long 0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF + .long 0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0 + .long 0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F + .long 0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540 + .long 0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90 + .long 0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F + .long 0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE + .long 0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1 + .long 0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321 + .long 0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E + .long 0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81 + .long 0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E + .long 0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E + .long 0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351 + +.Lcrc32c_tab1: + .long 0x00000000, 0x13A29877, 0x274530EE, 0x34E7A899 + .long 0x4E8A61DC, 0x5D28F9AB, 0x69CF5132, 0x7A6DC945 + .long 0x9D14C3B8, 0x8EB65BCF, 0xBA51F356, 0xA9F36B21 + .long 0xD39EA264, 0xC03C3A13, 0xF4DB928A, 0xE7790AFD + .long 0x3FC5F181, 0x2C6769F6, 0x1880C16F, 0x0B225918 + .long 0x714F905D, 0x62ED082A, 0x560AA0B3, 0x45A838C4 + .long 0xA2D13239, 0xB173AA4E, 0x859402D7, 0x96369AA0 + .long 0xEC5B53E5, 0xFFF9CB92, 0xCB1E630B, 0xD8BCFB7C + .long 0x7F8BE302, 0x6C297B75, 0x58CED3EC, 0x4B6C4B9B + .long 0x310182DE, 0x22A31AA9, 0x1644B230, 0x05E62A47 + .long 0xE29F20BA, 0xF13DB8CD, 0xC5DA1054, 0xD6788823 + .long 0xAC154166, 0xBFB7D911, 0x8B507188, 0x98F2E9FF + .long 0x404E1283, 0x53EC8AF4, 0x670B226D, 0x74A9BA1A + .long 
0x0EC4735F, 0x1D66EB28, 0x298143B1, 0x3A23DBC6 + .long 0xDD5AD13B, 0xCEF8494C, 0xFA1FE1D5, 0xE9BD79A2 + .long 0x93D0B0E7, 0x80722890, 0xB4958009, 0xA737187E + .long 0xFF17C604, 0xECB55E73, 0xD852F6EA, 0xCBF06E9D + .long 0xB19DA7D8, 0xA23F3FAF, 0x96D89736, 0x857A0F41 + .long 0x620305BC, 0x71A19DCB, 0x45463552, 0x56E4AD25 + .long 0x2C896460, 0x3F2BFC17, 0x0BCC548E, 0x186ECCF9 + .long 0xC0D23785, 0xD370AFF2, 0xE797076B, 0xF4359F1C + .long 0x8E585659, 0x9DFACE2E, 0xA91D66B7, 0xBABFFEC0 + .long 0x5DC6F43D, 0x4E646C4A, 0x7A83C4D3, 0x69215CA4 + .long 0x134C95E1, 0x00EE0D96, 0x3409A50F, 0x27AB3D78 + .long 0x809C2506, 0x933EBD71, 0xA7D915E8, 0xB47B8D9F + .long 0xCE1644DA, 0xDDB4DCAD, 0xE9537434, 0xFAF1EC43 + .long 0x1D88E6BE, 0x0E2A7EC9, 0x3ACDD650, 0x296F4E27 + .long 0x53028762, 0x40A01F15, 0x7447B78C, 0x67E52FFB + .long 0xBF59D487, 0xACFB4CF0, 0x981CE469, 0x8BBE7C1E + .long 0xF1D3B55B, 0xE2712D2C, 0xD69685B5, 0xC5341DC2 + .long 0x224D173F, 0x31EF8F48, 0x050827D1, 0x16AABFA6 + .long 0x6CC776E3, 0x7F65EE94, 0x4B82460D, 0x5820DE7A + .long 0xFBC3FAF9, 0xE861628E, 0xDC86CA17, 0xCF245260 + .long 0xB5499B25, 0xA6EB0352, 0x920CABCB, 0x81AE33BC + .long 0x66D73941, 0x7575A136, 0x419209AF, 0x523091D8 + .long 0x285D589D, 0x3BFFC0EA, 0x0F186873, 0x1CBAF004 + .long 0xC4060B78, 0xD7A4930F, 0xE3433B96, 0xF0E1A3E1 + .long 0x8A8C6AA4, 0x992EF2D3, 0xADC95A4A, 0xBE6BC23D + .long 0x5912C8C0, 0x4AB050B7, 0x7E57F82E, 0x6DF56059 + .long 0x1798A91C, 0x043A316B, 0x30DD99F2, 0x237F0185 + .long 0x844819FB, 0x97EA818C, 0xA30D2915, 0xB0AFB162 + .long 0xCAC27827, 0xD960E050, 0xED8748C9, 0xFE25D0BE + .long 0x195CDA43, 0x0AFE4234, 0x3E19EAAD, 0x2DBB72DA + .long 0x57D6BB9F, 0x447423E8, 0x70938B71, 0x63311306 + .long 0xBB8DE87A, 0xA82F700D, 0x9CC8D894, 0x8F6A40E3 + .long 0xF50789A6, 0xE6A511D1, 0xD242B948, 0xC1E0213F + .long 0x26992BC2, 0x353BB3B5, 0x01DC1B2C, 0x127E835B + .long 0x68134A1E, 0x7BB1D269, 0x4F567AF0, 0x5CF4E287 + .long 0x04D43CFD, 0x1776A48A, 0x23910C13, 0x30339464 + .long 0x4A5E5D21, 0x59FCC556, 0x6D1B6DCF, 0x7EB9F5B8 + .long 0x99C0FF45, 0x8A626732, 0xBE85CFAB, 0xAD2757DC + .long 0xD74A9E99, 0xC4E806EE, 0xF00FAE77, 0xE3AD3600 + .long 0x3B11CD7C, 0x28B3550B, 0x1C54FD92, 0x0FF665E5 + .long 0x759BACA0, 0x663934D7, 0x52DE9C4E, 0x417C0439 + .long 0xA6050EC4, 0xB5A796B3, 0x81403E2A, 0x92E2A65D + .long 0xE88F6F18, 0xFB2DF76F, 0xCFCA5FF6, 0xDC68C781 + .long 0x7B5FDFFF, 0x68FD4788, 0x5C1AEF11, 0x4FB87766 + .long 0x35D5BE23, 0x26772654, 0x12908ECD, 0x013216BA + .long 0xE64B1C47, 0xF5E98430, 0xC10E2CA9, 0xD2ACB4DE + .long 0xA8C17D9B, 0xBB63E5EC, 0x8F844D75, 0x9C26D502 + .long 0x449A2E7E, 0x5738B609, 0x63DF1E90, 0x707D86E7 + .long 0x0A104FA2, 0x19B2D7D5, 0x2D557F4C, 0x3EF7E73B + .long 0xD98EEDC6, 0xCA2C75B1, 0xFECBDD28, 0xED69455F + .long 0x97048C1A, 0x84A6146D, 0xB041BCF4, 0xA3E32483 + +.Lcrc32c_tab2: + .long 0x00000000, 0xA541927E, 0x4F6F520D, 0xEA2EC073 + .long 0x9EDEA41A, 0x3B9F3664, 0xD1B1F617, 0x74F06469 + .long 0x38513EC5, 0x9D10ACBB, 0x773E6CC8, 0xD27FFEB6 + .long 0xA68F9ADF, 0x03CE08A1, 0xE9E0C8D2, 0x4CA15AAC + .long 0x70A27D8A, 0xD5E3EFF4, 0x3FCD2F87, 0x9A8CBDF9 + .long 0xEE7CD990, 0x4B3D4BEE, 0xA1138B9D, 0x045219E3 + .long 0x48F3434F, 0xEDB2D131, 0x079C1142, 0xA2DD833C + .long 0xD62DE755, 0x736C752B, 0x9942B558, 0x3C032726 + .long 0xE144FB14, 0x4405696A, 0xAE2BA919, 0x0B6A3B67 + .long 0x7F9A5F0E, 0xDADBCD70, 0x30F50D03, 0x95B49F7D + .long 0xD915C5D1, 0x7C5457AF, 0x967A97DC, 0x333B05A2 + .long 0x47CB61CB, 0xE28AF3B5, 0x08A433C6, 0xADE5A1B8 + .long 0x91E6869E, 0x34A714E0, 0xDE89D493, 0x7BC846ED + .long 0x0F382284, 
0xAA79B0FA, 0x40577089, 0xE516E2F7 + .long 0xA9B7B85B, 0x0CF62A25, 0xE6D8EA56, 0x43997828 + .long 0x37691C41, 0x92288E3F, 0x78064E4C, 0xDD47DC32 + .long 0xC76580D9, 0x622412A7, 0x880AD2D4, 0x2D4B40AA + .long 0x59BB24C3, 0xFCFAB6BD, 0x16D476CE, 0xB395E4B0 + .long 0xFF34BE1C, 0x5A752C62, 0xB05BEC11, 0x151A7E6F + .long 0x61EA1A06, 0xC4AB8878, 0x2E85480B, 0x8BC4DA75 + .long 0xB7C7FD53, 0x12866F2D, 0xF8A8AF5E, 0x5DE93D20 + .long 0x29195949, 0x8C58CB37, 0x66760B44, 0xC337993A + .long 0x8F96C396, 0x2AD751E8, 0xC0F9919B, 0x65B803E5 + .long 0x1148678C, 0xB409F5F2, 0x5E273581, 0xFB66A7FF + .long 0x26217BCD, 0x8360E9B3, 0x694E29C0, 0xCC0FBBBE + .long 0xB8FFDFD7, 0x1DBE4DA9, 0xF7908DDA, 0x52D11FA4 + .long 0x1E704508, 0xBB31D776, 0x511F1705, 0xF45E857B + .long 0x80AEE112, 0x25EF736C, 0xCFC1B31F, 0x6A802161 + .long 0x56830647, 0xF3C29439, 0x19EC544A, 0xBCADC634 + .long 0xC85DA25D, 0x6D1C3023, 0x8732F050, 0x2273622E + .long 0x6ED23882, 0xCB93AAFC, 0x21BD6A8F, 0x84FCF8F1 + .long 0xF00C9C98, 0x554D0EE6, 0xBF63CE95, 0x1A225CEB + .long 0x8B277743, 0x2E66E53D, 0xC448254E, 0x6109B730 + .long 0x15F9D359, 0xB0B84127, 0x5A968154, 0xFFD7132A + .long 0xB3764986, 0x1637DBF8, 0xFC191B8B, 0x595889F5 + .long 0x2DA8ED9C, 0x88E97FE2, 0x62C7BF91, 0xC7862DEF + .long 0xFB850AC9, 0x5EC498B7, 0xB4EA58C4, 0x11ABCABA + .long 0x655BAED3, 0xC01A3CAD, 0x2A34FCDE, 0x8F756EA0 + .long 0xC3D4340C, 0x6695A672, 0x8CBB6601, 0x29FAF47F + .long 0x5D0A9016, 0xF84B0268, 0x1265C21B, 0xB7245065 + .long 0x6A638C57, 0xCF221E29, 0x250CDE5A, 0x804D4C24 + .long 0xF4BD284D, 0x51FCBA33, 0xBBD27A40, 0x1E93E83E + .long 0x5232B292, 0xF77320EC, 0x1D5DE09F, 0xB81C72E1 + .long 0xCCEC1688, 0x69AD84F6, 0x83834485, 0x26C2D6FB + .long 0x1AC1F1DD, 0xBF8063A3, 0x55AEA3D0, 0xF0EF31AE + .long 0x841F55C7, 0x215EC7B9, 0xCB7007CA, 0x6E3195B4 + .long 0x2290CF18, 0x87D15D66, 0x6DFF9D15, 0xC8BE0F6B + .long 0xBC4E6B02, 0x190FF97C, 0xF321390F, 0x5660AB71 + .long 0x4C42F79A, 0xE90365E4, 0x032DA597, 0xA66C37E9 + .long 0xD29C5380, 0x77DDC1FE, 0x9DF3018D, 0x38B293F3 + .long 0x7413C95F, 0xD1525B21, 0x3B7C9B52, 0x9E3D092C + .long 0xEACD6D45, 0x4F8CFF3B, 0xA5A23F48, 0x00E3AD36 + .long 0x3CE08A10, 0x99A1186E, 0x738FD81D, 0xD6CE4A63 + .long 0xA23E2E0A, 0x077FBC74, 0xED517C07, 0x4810EE79 + .long 0x04B1B4D5, 0xA1F026AB, 0x4BDEE6D8, 0xEE9F74A6 + .long 0x9A6F10CF, 0x3F2E82B1, 0xD50042C2, 0x7041D0BC + .long 0xAD060C8E, 0x08479EF0, 0xE2695E83, 0x4728CCFD + .long 0x33D8A894, 0x96993AEA, 0x7CB7FA99, 0xD9F668E7 + .long 0x9557324B, 0x3016A035, 0xDA386046, 0x7F79F238 + .long 0x0B899651, 0xAEC8042F, 0x44E6C45C, 0xE1A75622 + .long 0xDDA47104, 0x78E5E37A, 0x92CB2309, 0x378AB177 + .long 0x437AD51E, 0xE63B4760, 0x0C158713, 0xA954156D + .long 0xE5F54FC1, 0x40B4DDBF, 0xAA9A1DCC, 0x0FDB8FB2 + .long 0x7B2BEBDB, 0xDE6A79A5, 0x3444B9D6, 0x91052BA8 + +.Lcrc32c_tab3: + .long 0x00000000, 0xDD45AAB8, 0xBF672381, 0x62228939 + .long 0x7B2231F3, 0xA6679B4B, 0xC4451272, 0x1900B8CA + .long 0xF64463E6, 0x2B01C95E, 0x49234067, 0x9466EADF + .long 0x8D665215, 0x5023F8AD, 0x32017194, 0xEF44DB2C + .long 0xE964B13D, 0x34211B85, 0x560392BC, 0x8B463804 + .long 0x924680CE, 0x4F032A76, 0x2D21A34F, 0xF06409F7 + .long 0x1F20D2DB, 0xC2657863, 0xA047F15A, 0x7D025BE2 + .long 0x6402E328, 0xB9474990, 0xDB65C0A9, 0x06206A11 + .long 0xD725148B, 0x0A60BE33, 0x6842370A, 0xB5079DB2 + .long 0xAC072578, 0x71428FC0, 0x136006F9, 0xCE25AC41 + .long 0x2161776D, 0xFC24DDD5, 0x9E0654EC, 0x4343FE54 + .long 0x5A43469E, 0x8706EC26, 0xE524651F, 0x3861CFA7 + .long 0x3E41A5B6, 0xE3040F0E, 0x81268637, 0x5C632C8F + .long 0x45639445, 0x98263EFD, 
0xFA04B7C4, 0x27411D7C
+    .long 0xC805C650, 0x15406CE8, 0x7762E5D1, 0xAA274F69
+    .long 0xB327F7A3, 0x6E625D1B, 0x0C40D422, 0xD1057E9A
+    .long 0xABA65FE7, 0x76E3F55F, 0x14C17C66, 0xC984D6DE
+    .long 0xD0846E14, 0x0DC1C4AC, 0x6FE34D95, 0xB2A6E72D
+    .long 0x5DE23C01, 0x80A796B9, 0xE2851F80, 0x3FC0B538
+    .long 0x26C00DF2, 0xFB85A74A, 0x99A72E73, 0x44E284CB
+    .long 0x42C2EEDA, 0x9F874462, 0xFDA5CD5B, 0x20E067E3
+    .long 0x39E0DF29, 0xE4A57591, 0x8687FCA8, 0x5BC25610
+    .long 0xB4868D3C, 0x69C32784, 0x0BE1AEBD, 0xD6A40405
+    .long 0xCFA4BCCF, 0x12E11677, 0x70C39F4E, 0xAD8635F6
+    .long 0x7C834B6C, 0xA1C6E1D4, 0xC3E468ED, 0x1EA1C255
+    .long 0x07A17A9F, 0xDAE4D027, 0xB8C6591E, 0x6583F3A6
+    .long 0x8AC7288A, 0x57828232, 0x35A00B0B, 0xE8E5A1B3
+    .long 0xF1E51979, 0x2CA0B3C1, 0x4E823AF8, 0x93C79040
+    .long 0x95E7FA51, 0x48A250E9, 0x2A80D9D0, 0xF7C57368
+    .long 0xEEC5CBA2, 0x3380611A, 0x51A2E823, 0x8CE7429B
+    .long 0x63A399B7, 0xBEE6330F, 0xDCC4BA36, 0x0181108E
+    .long 0x1881A844, 0xC5C402FC, 0xA7E68BC5, 0x7AA3217D
+    .long 0x52A0C93F, 0x8FE56387, 0xEDC7EABE, 0x30824006
+    .long 0x2982F8CC, 0xF4C75274, 0x96E5DB4D, 0x4BA071F5
+    .long 0xA4E4AAD9, 0x79A10061, 0x1B838958, 0xC6C623E0
+    .long 0xDFC69B2A, 0x02833192, 0x60A1B8AB, 0xBDE41213
+    .long 0xBBC47802, 0x6681D2BA, 0x04A35B83, 0xD9E6F13B
+    .long 0xC0E649F1, 0x1DA3E349, 0x7F816A70, 0xA2C4C0C8
+    .long 0x4D801BE4, 0x90C5B15C, 0xF2E73865, 0x2FA292DD
+    .long 0x36A22A17, 0xEBE780AF, 0x89C50996, 0x5480A32E
+    .long 0x8585DDB4, 0x58C0770C, 0x3AE2FE35, 0xE7A7548D
+    .long 0xFEA7EC47, 0x23E246FF, 0x41C0CFC6, 0x9C85657E
+    .long 0x73C1BE52, 0xAE8414EA, 0xCCA69DD3, 0x11E3376B
+    .long 0x08E38FA1, 0xD5A62519, 0xB784AC20, 0x6AC10698
+    .long 0x6CE16C89, 0xB1A4C631, 0xD3864F08, 0x0EC3E5B0
+    .long 0x17C35D7A, 0xCA86F7C2, 0xA8A47EFB, 0x75E1D443
+    .long 0x9AA50F6F, 0x47E0A5D7, 0x25C22CEE, 0xF8878656
+    .long 0xE1873E9C, 0x3CC29424, 0x5EE01D1D, 0x83A5B7A5
+    .long 0xF90696D8, 0x24433C60, 0x4661B559, 0x9B241FE1
+    .long 0x8224A72B, 0x5F610D93, 0x3D4384AA, 0xE0062E12
+    .long 0x0F42F53E, 0xD2075F86, 0xB025D6BF, 0x6D607C07
+    .long 0x7460C4CD, 0xA9256E75, 0xCB07E74C, 0x16424DF4
+    .long 0x106227E5, 0xCD278D5D, 0xAF050464, 0x7240AEDC
+    .long 0x6B401616, 0xB605BCAE, 0xD4273597, 0x09629F2F
+    .long 0xE6264403, 0x3B63EEBB, 0x59416782, 0x8404CD3A
+    .long 0x9D0475F0, 0x4041DF48, 0x22635671, 0xFF26FCC9
+    .long 0x2E238253, 0xF36628EB, 0x9144A1D2, 0x4C010B6A
+    .long 0x5501B3A0, 0x88441918, 0xEA669021, 0x37233A99
+    .long 0xD867E1B5, 0x05224B0D, 0x6700C234, 0xBA45688C
+    .long 0xA345D046, 0x7E007AFE, 0x1C22F3C7, 0xC167597F
+    .long 0xC747336E, 0x1A0299D6, 0x782010EF, 0xA565BA57
+    .long 0xBC65029D, 0x6120A825, 0x0302211C, 0xDE478BA4
+    .long 0x31035088, 0xEC46FA30, 0x8E647309, 0x5321D9B1
+    .long 0x4A21617B, 0x9764CBC3, 0xF54642FA, 0x2803E842
diff --git a/src/dynarec/ppc64le/ppc64le_lock.h b/src/dynarec/ppc64le/ppc64le_lock.h
new file mode 100644
index 0000000000..e0daf88e73
--- /dev/null
+++ b/src/dynarec/ppc64le/ppc64le_lock.h
@@ -0,0 +1,82 @@
+#ifndef __PPC64LE_LOCK__H__
+#define __PPC64LE_LOCK__H__
+#include <stdint.h>
+
+// LBARX of ADDR
+extern uint8_t ppc64le_lock_read_b(void* addr);
+// STBCX. of ADDR, return 0 if ok, 1 if not
+extern int ppc64le_lock_write_b(void* addr, uint8_t val);
+
+// LHARX of ADDR
+extern uint16_t ppc64le_lock_read_h(void* addr);
+// STHCX. of ADDR, return 0 if ok, 1 if not
+extern int ppc64le_lock_write_h(void* addr, uint16_t val);
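+
+// Naming note (clarification, following the convention of the other box64
+// backends): the _b/_h/_d/_dd suffixes mean 8/16/32/64-bit accesses, so the
+// _d helpers below operate on 32-bit words and _dd on 64-bit doublewords.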
+
+// LWARX of ADDR
+extern uint32_t ppc64le_lock_read_d(void* addr);
+// STWCX. of ADDR, return 0 if ok, 1 if not
+extern int ppc64le_lock_write_d(void* addr, uint32_t val);
+
+// LDARX of ADDR
+extern uint64_t ppc64le_lock_read_dd(void* addr);
+// STDCX. of ADDR, return 0 if ok, 1 if not
+extern int ppc64le_lock_write_dd(void* addr, uint64_t val);
+
+// Atomically exchange value at [p] with val, return old [p]
+extern uintptr_t ppc64le_lock_xchg_dd(void* p, uintptr_t val);
+
+// Atomically exchange value at [p] with val, return old [p]
+extern uint32_t ppc64le_lock_xchg_d(void* p, uint32_t val);
+
+// Atomically exchange value at [p] with val, return old [p]
+extern uint32_t ppc64le_lock_xchg_h(void* p, uint32_t val);
+
+// Atomically exchange value at [p] with val, return old [p]
+extern uint32_t ppc64le_lock_xchg_b(void* p, uint32_t val);
+
+// Atomically store value to [p] only if [p] is NULL. Return old [p] value
+extern uint32_t ppc64le_lock_storeifnull_d(void*p, uint32_t val);
+
+// Atomically store value to [p] only if [p] is NULL. Return old [p] value
+extern void* ppc64le_lock_storeifnull(void*p, void* val);
+
+// Atomically store value to [p] only if [p] is ref. Return new [p] value (so val or old)
+extern void* ppc64le_lock_storeifref(void*p, void* val, void* ref);
+
+// Atomically store value to [p] only if [p] is ref. Return old [p] value (so ref or old)
+extern void* ppc64le_lock_storeifref2(void*p, void* val, void* ref);
+
+// Atomically store value to [p] only if [p] is ref. Return new [p] value (so val or old)
+extern uint32_t ppc64le_lock_storeifref_d(void*p, uint32_t val, uint32_t ref);
+
+// Atomically store value to [p] only if [p] is ref. Return old [p] value (so ref or old)
+extern uint32_t ppc64le_lock_storeifref2_d(void*p, uint32_t val, uint32_t ref);
+
+// decrement atomically the byte at [p] (but only if [p] is not 0)
+extern void ppc64le_lock_decifnot0b(void*p);
+
+// atomic store (with memory barrier)
+extern void ppc64le_lock_storeb(void*p, uint8_t b);
+
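+// Usage sketch (illustrative, not part of this header): the read/write pairs
+// above form an LL/SC retry loop, e.g. to emulate a LOCK ADD of a value v on
+// a 64-bit word at p:
+//     uint64_t old;
+//     do {
+//         old = ppc64le_lock_read_dd(p);           // ldarx: load + reserve
+//     } while (ppc64le_lock_write_dd(p, old + v)); // stdcx.: 1 = retry
+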
+// increment atomically the int at [p] only if it was 0. Return the old value of [p]
+extern int ppc64le_lock_incif0(void*p);
+
+// decrement atomically the int at [p] (but only if [p] is not 0)
+extern int ppc64le_lock_decifnot0(void*p);
+
+// atomic store (with memory barrier)
+extern void ppc64le_lock_store(void*p, uint32_t v);
+
+// atomic store (with memory barrier)
+extern void ppc64le_lock_store_dd(void*p, uint64_t v);
+
+// atomic get (with memory barrier)
+extern uint8_t ppc64le_lock_get_b(void*p);
+
+// atomic get (with memory barrier)
+extern uint32_t ppc64le_lock_get_d(void*p);
+
+// atomic get (with memory barrier)
+extern void* ppc64le_lock_get_dd(void*p);
+
+#endif //__PPC64LE_LOCK__H__
diff --git a/src/dynarec/ppc64le/ppc64le_mapping.h b/src/dynarec/ppc64le/ppc64le_mapping.h
new file mode 100644
index 0000000000..cff50a1b07
--- /dev/null
+++ b/src/dynarec/ppc64le/ppc64le_mapping.h
@@ -0,0 +1,172 @@
+#ifndef __PPC64LE_MAPPING_H__
+#define __PPC64LE_MAPPING_H__
+
+// PPC64LE Register Mapping Scheme (ELFv2 ABI)
+/*****************************************************************************************
+reg   name  mapping     native description              Box64 description      saver
+******************************************************************************************
+r0    -     -           Special (literal 0 in D-form)   N/A                    -
+r1    sp    native sp   Stack pointer                   N/A                    Callee
+r2    toc   native toc  TOC pointer (reserved)          N/A                    -
+r3    a0    x1          Argument/return value           Scratch                Caller
+r4    a1    x2          Argument                        Scratch                Caller
+r5    a2    x3          Argument                        Scratch                Caller
+r6    a3    x4          Argument                        Scratch                Caller
+r7    a4    x5          Argument                        Scratch                Caller
+r8    a5    x6          Argument                        Scratch                Caller
+r9    a6    xRIP        Argument                        x86 RIP                Caller
+r10   a7    x7          Argument                        Scratch                Caller
+r11   -     -           Scratch (env ptr / plt)         PLT scratch            Caller
+r12   -     -           Scratch (func entry ptr)        scratch (caller-saved) Caller
+r13   -     -           TLS pointer (reserved)          N/A                    -
+r14   s0    RAX         Saved register                  -                      Callee
+r15   s1    RCX         Saved register                  -                      Callee
+r16   s2    RDX         Saved register                  -                      Callee
+r17   s3    RBX         Saved register                  -                      Callee
+r18   s4    RSP         Saved register                  -                      Callee
+r19   s5    RBP         Saved register                  -                      Callee
+r20   s6    RSI         Saved register                  -                      Callee
+r21   s7    RDI         Saved register                  -                      Callee
+r22   s8    R8          Saved register                  -                      Callee
+r23   s9    R9          Saved register                  -                      Callee
+r24   s10   R10         Saved register                  -                      Callee
+r25   s11   R11         Saved register                  -                      Callee
+r26   s12   R12         Saved register                  -                      Callee
+r27   s13   R13         Saved register                  -                      Callee
+r28   s14   R14         Saved register                  -                      Callee
+r29   s15   R15         Saved register                  -                      Callee
+r30   s16   FLAGS       Saved register                  x86 Flags              Callee
+r31   s17   xEmu        Saved register                  The Emu struct         Callee
+
+SIMD / VMX / VSX:
+vr0-vr19   Volatile (caller-saved)     - scratch/temp
+vr20-vr31  Non-volatile (callee-saved) - persistent XMM cache
+vs0-vs31   Overlap with FPR f0-f31 (upper halves)
+vs32-vs63  Overlap with VMX vr0-vr31
+
+Note: In little-endian mode, SIMD element ordering matches x86 — minimal swizzle needed.
+******************************************************************************************/
+
+#ifndef ASM_MAPPING
+
+#include <stdint.h>
+
+// x86 Register mapping (to PPC64LE GPRs)
+// Using callee-saved r14-r29 for x86 registers
+#define xRAX 14
+#define xRCX 15
+#define xRDX 16
+#define xRBX 17
+#define xRSP 18
+#define xRBP 19
+#define xRSI 20
+#define xRDI 21
+#define xR8 22
+#define xR9 23
+#define xR10 24
+#define xR11 25
+#define xR12 26
+#define xR13 27
+#define xR14 28
+#define xR15 29
+#define xFlags 30
+#define xRIP 9 // use r9 (caller-saved, loaded at block entry)
+// xSavedSP is stored in emu->xSPSave (offset 808), not in a register.
+// r12 is caller-saved and gets clobbered by native calls, so it cannot
+// hold xSavedSP across BCTRL. See RV64 for the same approach.
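+// (Illustration: an ELFv2 indirect call is "mtctr r12; bctrl" with r12 holding
+// the callee's entry address, so whatever r12 held before the call is lost.)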
+
+// convert a x86 register to native according to the register mapping
+#define TO_NAT(A) (((uint8_t[]) { 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 })[(A)])
+#define IS_GPR(A) (((uint8_t[]) { \
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \
+    0, 0 \
+})[(A)])
+
+// scratch registers (caller-saved GPRs)
+#define x1 3
+#define x2 4
+#define x3 5
+#define x4 6
+#define x5 7
+#define x6 8
+#define x7 10
+
+#define x87pc 11 // for x87 precision control, can be scratch otherwise
+
+// emu is r31
+#define xEmu 31
+// PPC64LE LR is in SPR (not a GPR), but we use r0 for BLR patterns
+#define xLR 0 // r0 is special: used as literal 0 in D-form, but holds LR after mflr
+// PPC64LE SP is r1
+#define xSP 1
+// PPC64LE has no zero register; r0 acts as 0 only in specific instruction forms
+// We define xZR as 0 for compatibility, but it ONLY works in D-form load/store base
+#define xZR 0
+
+// PPC64LE ABI registers
+#define A0 3
+#define A1 4
+#define A2 5
+#define A3 6
+#define A4 7
+#define A5 8
+#define A6 9
+#define A7 10
+
+// PPC64LE Condition Register fields (CR0-CR7)
+// Each CR field has 4 bits: LT, GT, EQ, SO
+#define CR_LT 0 // bit 0 of a CR field: Less Than / FP Less Than
+#define CR_GT 1 // bit 1 of a CR field: Greater Than / FP Greater Than
+#define CR_EQ 2 // bit 2 of a CR field: Equal / FP Equal
+#define CR_SO 3 // bit 3 of a CR field: Summary Overflow / FP Unordered
+
+// CR field numbers (for use with condition branches)
+#define CR0 0
+#define CR1 1
+#define CR2 2
+#define CR3 3
+#define CR4 4
+#define CR5 5
+#define CR6 6
+#define CR7 7
+
+// Branch condition encodings (BO field, 5 bits)
+// Simplified forms for conditional branches
+#define BO_TRUE 12 // 01100 - branch if condition bit is true
+#define BO_FALSE 4 // 00100 - branch if condition bit is false
+#define BO_ALWAYS 20 // 10100 - branch always (unconditional)
+
+// BI field helpers: CR field * 4 + bit within field
+#define BI(cr, bit) ((cr) * 4 + (bit))
+
+#else
+
+// Assembly-time register mapping
+// x86 Register mapping for .S files
+// PPC64LE GAS accepts bare numeric register operands (no 'r' prefix in
+// instructions), so we use plain numbers here for assembler portability
+#define RAX 14
+#define RCX 15
+#define RDX 16
+#define RBX 17
+#define RSP 18
+#define RBP 19
+#define RSI 20
+#define RDI 21
+#define R8 22
+#define R9 23
+#define R10 24
+#define R11 25
+#define R12 26
+#define R13 27
+#define R14 28
+#define R15 29
+#define Flags 30
+#define RIP 9
+#define Emu 31
+// SavedSP is no longer a register — stored in emu->xSPSave (offset 808)
+
+#endif // ASM_MAPPING
+
+#endif // __PPC64LE_MAPPING_H__
diff --git a/src/dynarec/ppc64le/ppc64le_next.S b/src/dynarec/ppc64le/ppc64le_next.S
new file mode 100644
index 0000000000..fcecb06505
--- /dev/null
+++ b/src/dynarec/ppc64le/ppc64le_next.S
@@ -0,0 +1,76 @@
+// PPC64LE next linker for dynarec
+// Called when a dynarec block needs to jump to the next block.
+// Saves volatile state, calls LinkNext to resolve the target,
+// then branches to the resolved address.
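+// C-side contract (a sketch, assuming the LinkNext signature used by the other
+// box64 backends):
+//     void* LinkNext(x64emu_t* emu, uintptr_t addr, void* from, uintptr_t* pc);
+// It returns the native address of the block to jump to and may update *pc.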
+// +// Called with: +// Emu (r31) = pointer to x64emu_t +// RIP (r9) = target x86 IP +// LR = "from" address (for linking) + +#define ASM_MAPPING +#include "ppc64le_mapping.h" + +.text +.align 4 + +.extern LinkNext + +.global ppc64le_next +.type ppc64le_next, @function + + // NULL pointer before ppc64le_next, for getDB + .8byte 0 +ppc64le_next: + // Save volatile registers that the dynarec uses + // We need to preserve: r3-r10 (args/scratch used by dynarec) + // and RIP (r9), plus the LR (return "from" address) + mflr 0 + + // Allocate save area on stack + stdu 1, -128(1) + + // Save registers + std 0, 16(1) // LR (the "from" address) + std 3, 32(1) // x1 / A0 + std 4, 40(1) // x2 / A1 + std 5, 48(1) // x3 / A2 + std 6, 56(1) // x4 / A3 + std 7, 64(1) // x5 / A4 + std 8, 72(1) // x6 / A5 + std 9, 80(1) // xRIP - also save to allow change in LinkNext + std 10, 88(1) // x7 / A7 + + // Call LinkNext(emu, ip, from, &rip_on_stack) + mr 3, Emu // arg0 = emu (r31) + mr 4, RIP // arg1 = xRIP (r9) + ld 5, 16(1) // arg2 = "from" (saved LR) + addi 6, 1, 80 // arg3 = address of saved RIP on stack + + // Restore TOC for calling C code + // We need to load the TOC for LinkNext - use the saved TOC + // Note: In ELFv2, r12 must point to the function entry for local calls + // For external calls via PLT, the linker handles it + bl LinkNext + nop // TOC restore slot (linker fills if needed) + + // Preserve return value (jump target) in r12 + mr 12, 3 + + // Restore registers + ld 3, 32(1) + ld 4, 40(1) + ld 5, 48(1) + ld 6, 56(1) + ld 7, 64(1) + ld 8, 72(1) + ld 9, 80(1) // RIP may have been modified by LinkNext + ld 10, 88(1) + + // Deallocate save area + addi 1, 1, 128 + + // Jump to resolved target + mtctr 12 + bctr +.size ppc64le_next, .-ppc64le_next diff --git a/src/dynarec/ppc64le/ppc64le_printer.c b/src/dynarec/ppc64le/ppc64le_printer.c new file mode 100644 index 0000000000..40fcdc0a94 --- /dev/null +++ b/src/dynarec/ppc64le/ppc64le_printer.c @@ -0,0 +1,1362 @@ +#include +#include +#include + +#include "ppc64le_printer.h" + +// PPC64LE instruction disassembler for box64 dynarec debugging. +// Uses switch-based decoding on primary opcode (bits 26-31) then extended opcode. +// Register names use box64 symbolic names (xRAX, xEmu, etc.) for readability. + +static char buff[200]; + +// ---- Field extraction macros ---- +// PPC64LE stores instructions as little-endian uint32_t on LE hosts. +// The opcode passed in has already been read as a native uint32_t, +// so bit numbering is: bit 31 = MSB (PPC ISA bit 0), bit 0 = LSB (PPC ISA bit 31). 
+// Primary opcode: bits 26-31
+#define OPCD(op) (((op) >> 26) & 0x3F)
+// D-form / DS-form fields
+#define RT(op) (((op) >> 21) & 0x1F)
+#define RS(op) RT(op)
+#define RA(op) (((op) >> 16) & 0x1F)
+#define RB(op) (((op) >> 11) & 0x1F)
+#define SI(op) ((int16_t)((op) & 0xFFFF))
+#define UI(op) ((op) & 0xFFFF)
+#define DS(op) ((int16_t)((op) & 0xFFFC))
+#define DS_XO(op) ((op) & 0x3)
+// I-form
+#define LI(op) ((int32_t)(((op) & 0x03FFFFFC) << 6) >> 6)
+#define AA(op) (((op) >> 1) & 1)
+#define LK(op) ((op) & 1)
+// B-form
+#define BO(op) RT(op)
+#define BI_F(op) RA(op)
+#define BD(op) ((int16_t)((op) & 0xFFFC))
+// X-form
+#define XO_X(op) (((op) >> 1) & 0x3FF)
+#define RC(op) ((op) & 1)
+// XO-form
+#define XO_XO(op) (((op) >> 1) & 0x1FF)
+#define OE(op) (((op) >> 10) & 1)
+// XL-form
+#define XO_XL(op) XO_X(op)
+// M-form
+#define SH_M(op) RB(op)
+#define MB_M(op) (((op) >> 6) & 0x1F)
+#define ME_M(op) (((op) >> 1) & 0x1F)
+// MD-form: sh = sh[0:4] || sh[5], mb/me = mbe[0:4] || mbe[5]
+#define SH_MD(op) ((RB(op)) | (((op) >> 1) & 1) << 5)
+#define MB_MD(op) ((((op) >> 6) & 0x1F) | (((op) >> 5) & 1) << 5)
+#define ME_MD(op) MB_MD(op)
+#define XO_MD(op) (((op) >> 2) & 0x7)
+// MDS-form
+#define XO_MDS(op) (((op) >> 1) & 0xF)
+// XS-form: sh = sh[0:4] || sh[5]
+#define SH_XS(op) SH_MD(op)
+#define XO_XS(op) (((op) >> 2) & 0x1FF)
+// XFX-form: spr = spr[5:9] || spr[0:4] (swapped halves)
+#define SPR(op) ((((op) >> 16) & 0x1F) | (((op) >> 6) & 0x3E0))
+#define FXM(op) (((op) >> 12) & 0xFF)
+// XFL-form
+#define FLM(op) (((op) >> 17) & 0xFF)
+// A-form
+#define FRA(op) RA(op)
+#define FRB(op) RB(op)
+#define FRC(op) (((op) >> 6) & 0x1F)
+#define XO_A(op) (((op) >> 1) & 0x1F)
+// VA-form
+#define VRA(op) RA(op)
+#define VRB(op) RB(op)
+#define VRC(op) (((op) >> 6) & 0x1F)
+#define XO_VA(op) ((op) & 0x3F)
+// VX-form
+#define XO_VX(op) ((op) & 0x7FF)
+// DQ-form: T = T[0:4] || TX (ISA bit 28 = LSB bit 3), dq = bits 4-15 << 4
+#define DQ(op) ((int16_t)(((op) & 0xFFF0)))
+#define DQ_TX(op) (((op) >> 3) & 1)
+#define DQ_T(op) ((RT(op) << 1) | DQ_TX(op))
+#define DQ_XO(op) ((op) & 0x7) // XO is bits 0-2 (bit 3 is TX)
+// XX1-form: T = T[0:4](bits 25-21) || TX(bit 0)
+#define XX1_TX(op) ((op) & 1)
+#define XX1_T(op) ((RT(op) << 1) | XX1_TX(op))
+#define XX1_XO(op) (((op) >> 1) & 0x3FF)
+// (AX/BX don't apply to XX1: XX1 only has TX)
+// XX2-form: T[0:4]=bits25-21, B[0:4]=bits15-11, TX=bit0, BX=bit1
+#define XX2_BX(op) (((op) >> 1) & 1)
+#define XX2_TX(op) ((op) & 1)
+#define XX2_T(op) ((RT(op) << 1) | XX2_TX(op))
+#define XX2_B(op) ((RB(op) << 1) | XX2_BX(op))
+#define XX2_XO(op) (((op) >> 2) & 0x1FF)
+#define XX2_UIM(op) (((op) >> 16) & 0x3) // for XXSPLTW
+// XX3-form: T[0:4]=bits25-21, A[0:4]=bits20-16, B[0:4]=bits15-11, TX=bit0, AX=bit2, BX=bit1
+#define XX3_AX(op) (((op) >> 2) & 1)
+#define XX3_BX(op) (((op) >> 1) & 1)
+#define XX3_TX(op) ((op) & 1)
+#define XX3_T(op) ((RT(op) << 1) | XX3_TX(op))
+#define XX3_A(op) ((RA(op) << 1) | XX3_AX(op))
+#define XX3_B(op) ((RB(op) << 1) | XX3_BX(op))
+#define XX3_XO(op) (((op) >> 3) & 0xFF)
+// XX4-form: opcd(6)|T(5)|A(5)|B(5)|C(5)|XO(2)|CX(1)|AX(1)|BX(1)|TX(1)
+// (i.e. CX=bit3, AX=bit2, BX=bit1, TX=bit0)
+#define XX4_TX(op) ((op) & 1)
+#define XX4_BX(op) (((op) >> 1) & 1)
+#define XX4_AX(op) (((op) >> 2) & 1)
+#define XX4_CX(op) (((op) >> 3) & 1)
+#define XX4_T(op) ((RT(op) << 1) | XX4_TX(op))
+#define XX4_A(op) ((RA(op) << 1) | XX4_AX(op))
+#define XX4_B(op) ((RB(op) << 1) | XX4_BX(op))
+#define XX4_C(op) (((((op) >> 6) & 0x1F) << 1) | XX4_CX(op))
+#define XX4_XO(op) (((op) >> 4) & 0x3)
+
+// Helpers
+#define BF(op) (((op) >> 23) & 0x7)
+#define L_CMP(op) (((op) >> 21) & 1)
+
+static int signExtend(uint32_t val, int bits) {
+    int shift = 32 - bits;
+    return ((int32_t)(val << shift)) >> shift;
+}
+
+// ---- Register name tables ----
+// GPR names with box64 symbolic aliases
+static const char* Rt[32] = {
+    "r0",   "sp",   "toc",  "x1",   "x2",   "x3",   "x4",   "x5",   // r0-r7
+    "x6",   "xRIP", "x7",   "r11",  "r12",  "r13",  "xRAX", "xRCX", // r8-r15
+    "xRDX", "xRBX", "xRSP", "xRBP", "xRSI", "xRDI", "xR8",  "xR9",  // r16-r23
+    "xR10", "xR11", "xR12", "xR13", "xR14", "xR15", "xFlags","xEmu" // r24-r31
+};
+
+// FPR names
+static const char* Ft[32] = {
+    "f0",  "f1",  "f2",  "f3",  "f4",  "f5",  "f6",  "f7",
+    "f8",  "f9",  "f10", "f11", "f12", "f13", "f14", "f15",
+    "f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23",
+    "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31"
+};
+
+// VMX/VR names
+static const char* Vt[32] = {
+    "vr0",  "vr1",  "vr2",  "vr3",  "vr4",  "vr5",  "vr6",  "vr7",
+    "vr8",  "vr9",  "vr10", "vr11", "vr12", "vr13", "vr14", "vr15",
+    "vr16", "vr17", "vr18", "vr19", "vr20", "vr21", "vr22", "vr23",
+    "vr24", "vr25", "vr26", "vr27", "vr28", "vr29", "vr30", "vr31"
+};
+
+// VSX register names (vs0-vs63)
+static const char* VSXname(int idx) {
+    static char vbuf[8];
+    snprintf(vbuf, sizeof(vbuf), "vs%d", idx);
+    return vbuf;
+}
+
+// CR field names
+static const char* CRnames[8] = {
+    "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7"
+};
+
+// SPR names
+static const char* SPRname(int spr) {
+    switch(spr) {
+        case 1: return "XER";
+        case 8: return "LR";
+        case 9: return "CTR";
+        case 268: return "TB";
+        default: {
+            static char sbuf[16];
+            snprintf(sbuf, sizeof(sbuf), "SPR%d", spr);
+            return sbuf;
+        }
+    }
+}
+
+// ---- Condition branch helpers ----
+static const char* condName(int bo, int bi) {
+    int cr = bi / 4;
+    int bit = bi % 4;
+    static char cbuf[16];
+    const char* bitsuf[] = {"lt", "gt", "eq", "so"};
+    if (bo == 12) { // branch if true
+        if (cr == 0)
+            snprintf(cbuf, sizeof(cbuf), "b%s", bitsuf[bit]);
+        else
+            snprintf(cbuf, sizeof(cbuf), "b%s %s", bitsuf[bit], CRnames[cr]);
+    } else if (bo == 4) { // branch if false
+        const char* negbitsuf[] = {"ge", "le", "ne", "nso"};
+        if (cr == 0)
+            snprintf(cbuf, sizeof(cbuf), "b%s", negbitsuf[bit]);
+        else
+            snprintf(cbuf, sizeof(cbuf), "b%s %s", negbitsuf[bit], CRnames[cr]);
+    } else if (bo == 20) {
+        snprintf(cbuf, sizeof(cbuf), "b");
+    } else {
+        snprintf(cbuf, sizeof(cbuf), "bc %d,%d", bo, bi);
+    }
+    return cbuf;
+}
+
+// ---- Main disassembler ----
+const char* ppc64le_print(uint32_t opcode, uint64_t addr)
+{
+    int op = OPCD(opcode);
+    int rt = RT(opcode);
+    int rs = rt;
+    int ra = RA(opcode);
+    int rb = RB(opcode);
+    int rc_bit = RC(opcode);
+    const char* dot = rc_bit ? "."
: ""; + + switch(op) { + // ---- opcode 2: TDI ---- + case 2: + snprintf(buff, sizeof(buff), "%-15s %d, %s, %d", "tdi", rt, Rt[ra], SI(opcode)); + return buff; + + // ---- opcode 3: TWI ---- + case 3: + snprintf(buff, sizeof(buff), "%-15s %d, %s, %d", "twi", rt, Rt[ra], SI(opcode)); + return buff; + + // ---- opcode 4: VMX/Altivec ---- + case 4: { + // Check VA-form first (bits 0-5) + int va_xo = XO_VA(opcode); + switch(va_xo) { + case 32: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s, %s", "vmhaddshs", Vt[rt], Vt[ra], Vt[rb], Vt[VRC(opcode)]); return buff; + case 33: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s, %s", "vmhraddshs", Vt[rt], Vt[ra], Vt[rb], Vt[VRC(opcode)]); return buff; + case 34: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s, %s", "vmladduhm", Vt[rt], Vt[ra], Vt[rb], Vt[VRC(opcode)]); return buff; + case 36: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s, %s", "vmsumubm", Vt[rt], Vt[ra], Vt[rb], Vt[VRC(opcode)]); return buff; + case 37: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s, %s", "vmsummbm", Vt[rt], Vt[ra], Vt[rb], Vt[VRC(opcode)]); return buff; + case 38: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s, %s", "vmsumuhm", Vt[rt], Vt[ra], Vt[rb], Vt[VRC(opcode)]); return buff; + case 40: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s, %s", "vmsumshm", Vt[rt], Vt[ra], Vt[rb], Vt[VRC(opcode)]); return buff; + case 41: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s, %s", "vmsumshs", Vt[rt], Vt[ra], Vt[rb], Vt[VRC(opcode)]); return buff; + case 42: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s, %s", "vsel", Vt[rt], Vt[ra], Vt[rb], Vt[VRC(opcode)]); return buff; + case 43: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s, %s", "vperm", Vt[rt], Vt[ra], Vt[rb], Vt[VRC(opcode)]); return buff; + case 44: { + int shb = VRC(opcode) >> 1; // vsldoi shb is in bits 6-9 + shb = (opcode >> 6) & 0xF; + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s, %d", "vsldoi", Vt[rt], Vt[ra], Vt[rb], shb); + return buff; + } + case 51: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s, %s", "maddld", Rt[rt], Rt[ra], Rt[rb], Rt[VRC(opcode)]); return buff; + } + // VX-form (bits 0-10) + int vx_xo = XO_VX(opcode); + switch(vx_xo) { + // Integer add + case 0: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vaddubm", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 64: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vadduhm", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 128: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vadduwm", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 192: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vaddudm", Vt[rt], Vt[ra], Vt[rb]); return buff; + // Integer add saturate + case 512: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vaddubs", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 576: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vadduhs", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 768: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vaddsbs", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 832: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vaddshs", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 896: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vaddsws", Vt[rt], Vt[ra], Vt[rb]); return buff; + // Integer subtract + case 1024: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vsububm", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 1088: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vsubuhm", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 1152: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vsubuwm", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 
1216: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vsubudm", Vt[rt], Vt[ra], Vt[rb]); return buff; + // Integer subtract saturate + case 1536: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vsububs", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 1600: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vsubuhs", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 1792: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vsubsbs", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 1856: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vsubshs", Vt[rt], Vt[ra], Vt[rb]); return buff; + // Compare equal + case 6: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vcmpequb", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 70: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vcmpequh", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 134: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vcmpequw", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 199: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vcmpequd", Vt[rt], Vt[ra], Vt[rb]); return buff; + // Compare greater than signed + case 774: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vcmpgtsb", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 838: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vcmpgtsh", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 902: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vcmpgtsw", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 967: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vcmpgtsd", Vt[rt], Vt[ra], Vt[rb]); return buff; + // Compare greater than unsigned + case 518: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vcmpgtub", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 582: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vcmpgtuh", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 646: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vcmpgtuw", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 711: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vcmpgtud", Vt[rt], Vt[ra], Vt[rb]); return buff; + // Min/Max unsigned + case 514: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vminub", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 578: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vminuh", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 642: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vminuw", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 706: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vminud", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 2: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vmaxub", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 66: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vmaxuh", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 130: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vmaxuw", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 194: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vmaxud", Vt[rt], Vt[ra], Vt[rb]); return buff; + // Min/Max signed + case 770: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vminsb", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 834: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vminsh", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 898: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vminsw", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 962: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vminsd", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 258: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vmaxsb", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 322: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vmaxsh", Vt[rt], Vt[ra], Vt[rb]); return 
buff; + case 386: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vmaxsw", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 450: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vmaxsd", Vt[rt], Vt[ra], Vt[rb]); return buff; + // Multiply even/odd + case 520: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vmuleub", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 584: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vmuleuh", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 648: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vmuleuw", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 776: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vmulesb", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 840: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vmulesh", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 904: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vmulesw", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 8: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vmuloub", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 72: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vmulouh", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 136: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vmulouw", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 264: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vmulosb", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 328: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vmulosh", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 392: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vmulosw", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 137: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vmuluwm", Vt[rt], Vt[ra], Vt[rb]); return buff; + // Pack + case 14: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vpkuhum", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 78: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vpkuwum", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 398: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vpkshss", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 462: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vpkswss", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 270: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vpkshus", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 334: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vpkswus", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 1486: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vpksdss", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 1358: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vpksdus", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 1102: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vpkudum", Vt[rt], Vt[ra], Vt[rb]); return buff; + // Unpack + case 526: snprintf(buff, sizeof(buff), "%-15s %s, %s", "vupkhsb", Vt[rt], Vt[rb]); return buff; + case 590: snprintf(buff, sizeof(buff), "%-15s %s, %s", "vupkhsh", Vt[rt], Vt[rb]); return buff; + case 1614: snprintf(buff, sizeof(buff), "%-15s %s, %s", "vupkhsw", Vt[rt], Vt[rb]); return buff; + case 654: snprintf(buff, sizeof(buff), "%-15s %s, %s", "vupklsb", Vt[rt], Vt[rb]); return buff; + case 718: snprintf(buff, sizeof(buff), "%-15s %s, %s", "vupklsh", Vt[rt], Vt[rb]); return buff; + case 1742: snprintf(buff, sizeof(buff), "%-15s %s, %s", "vupklsw", Vt[rt], Vt[rb]); return buff; + // Extend sign (P9) — share XO 1538, distinguished by VRA field + case 1538: + switch(ra) { + case 16: snprintf(buff, sizeof(buff), "%-15s %s, %s", "vextsb2w", Vt[rt], Vt[rb]); return buff; + case 17: snprintf(buff, sizeof(buff), "%-15s %s, %s", "vextsh2w", Vt[rt], Vt[rb]); 
return buff; + case 24: snprintf(buff, sizeof(buff), "%-15s %s, %s", "vextsb2d", Vt[rt], Vt[rb]); return buff; + case 25: snprintf(buff, sizeof(buff), "%-15s %s, %s", "vextsh2d", Vt[rt], Vt[rb]); return buff; + case 26: snprintf(buff, sizeof(buff), "%-15s %s, %s", "vextsw2d", Vt[rt], Vt[rb]); return buff; + case 6: snprintf(buff, sizeof(buff), "%-15s %s, %s", "vnegw", Vt[rt], Vt[rb]); return buff; + case 7: snprintf(buff, sizeof(buff), "%-15s %s, %s", "vnegd", Vt[rt], Vt[rb]); return buff; + default: snprintf(buff, sizeof(buff), "%-15s %s, %d, %s", "vx_1538", Vt[rt], ra, Vt[rb]); return buff; + } + // Merge + case 12: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vmrghb", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 76: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vmrghh", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 140: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vmrghw", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 268: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vmrglb", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 332: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vmrglh", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 396: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vmrglw", Vt[rt], Vt[ra], Vt[rb]); return buff; + // Shift + case 260: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vslb", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 324: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vslh", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 388: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vslw", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 1476: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vsld", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 516: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vsrb", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 580: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vsrh", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 644: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vsrw", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 1732: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vsrd", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 772: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vsrab", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 836: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vsrah", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 900: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vsraw", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 964: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vsrad", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 452: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vsl", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 708: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vsr", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 1036: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vslo", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 1100: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vsro", Vt[rt], Vt[ra], Vt[rb]); return buff; + // Rotate + case 4: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vrlb", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 68: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vrlh", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 132: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vrlw", Vt[rt], Vt[ra], Vt[rb]); return buff; + case 196: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vrld", Vt[rt], Vt[ra], Vt[rb]); return buff; + // Splat + case 524: snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", "vspltb", Vt[rt], Vt[rb], ra); return buff; + 
case 588: snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", "vsplth", Vt[rt], Vt[rb], ra); return buff;
+        case 652: snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", "vspltw", Vt[rt], Vt[rb], ra); return buff;
+        case 780: snprintf(buff, sizeof(buff), "%-15s %s, %d", "vspltisb", Vt[rt], signExtend(ra, 5)); return buff;
+        case 844: snprintf(buff, sizeof(buff), "%-15s %s, %d", "vspltish", Vt[rt], signExtend(ra, 5)); return buff;
+        case 908: snprintf(buff, sizeof(buff), "%-15s %s, %d", "vspltisw", Vt[rt], signExtend(ra, 5)); return buff;
+        // Logical
+        case 1028: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vand", Vt[rt], Vt[ra], Vt[rb]); return buff;
+        case 1092: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vandc", Vt[rt], Vt[ra], Vt[rb]); return buff;
+        case 1156:
+            if (ra == rb)
+                snprintf(buff, sizeof(buff), "%-15s %s, %s", "vmr", Vt[rt], Vt[ra]);
+            else
+                snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vor", Vt[rt], Vt[ra], Vt[rb]);
+            return buff;
+        case 1220: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vxor", Vt[rt], Vt[ra], Vt[rb]); return buff;
+        case 1284: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vnor", Vt[rt], Vt[ra], Vt[rb]); return buff;
+        case 1348: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vorc", Vt[rt], Vt[ra], Vt[rb]); return buff;
+        case 1412: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vnand", Vt[rt], Vt[ra], Vt[rb]); return buff;
+        case 1668: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "veqv", Vt[rt], Vt[ra], Vt[rb]); return buff;
+        // Average
+        case 1026: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vavgub", Vt[rt], Vt[ra], Vt[rb]); return buff;
+        case 1090: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vavguh", Vt[rt], Vt[ra], Vt[rb]); return buff;
+        case 1282: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vavgsb", Vt[rt], Vt[ra], Vt[rb]); return buff;
+        case 1346: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vavgsh", Vt[rt], Vt[ra], Vt[rb]); return buff;
+        // Absolute difference (P9)
+        case 1027: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vabsdub", Vt[rt], Vt[ra], Vt[rb]); return buff;
+        case 1091: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vabsduh", Vt[rt], Vt[ra], Vt[rb]); return buff;
+        case 1155: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vabsduw", Vt[rt], Vt[ra], Vt[rb]); return buff;
+        // Sum across (note: vsum4ubs is XO 1544; 1672 would be vsum2sws)
+        case 1544: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vsum4ubs", Vt[rt], Vt[ra], Vt[rb]); return buff;
+        case 1800: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vsum4sbs", Vt[rt], Vt[ra], Vt[rb]); return buff;
+        case 1608: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vsum4shs", Vt[rt], Vt[ra], Vt[rb]); return buff;
+        // Count leading zeros
+        case 1794: snprintf(buff, sizeof(buff), "%-15s %s, %s", "vclzb", Vt[rt], Vt[rb]); return buff;
+        case 1858: snprintf(buff, sizeof(buff), "%-15s %s, %s", "vclzh", Vt[rt], Vt[rb]); return buff;
+        case 1922: snprintf(buff, sizeof(buff), "%-15s %s, %s", "vclzw", Vt[rt], Vt[rb]); return buff;
+        case 1986: snprintf(buff, sizeof(buff), "%-15s %s, %s", "vclzd", Vt[rt], Vt[rb]); return buff;
+        // Population count
+        case 1795: snprintf(buff, sizeof(buff), "%-15s %s, %s", "vpopcntb", Vt[rt], Vt[rb]); return buff;
+        case 1859: snprintf(buff, sizeof(buff), "%-15s %s, %s", "vpopcnth", Vt[rt], Vt[rb]); return buff;
+        case 1923: snprintf(buff, sizeof(buff), "%-15s %s, %s", "vpopcntw", Vt[rt], Vt[rb]); return buff;
+        case 1987: snprintf(buff, sizeof(buff), "%-15s %s, %s", "vpopcntd", Vt[rt], Vt[rb]); return buff;
+        // Bit permute
+        case 1356:
snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "vbpermq", Vt[rt], Vt[ra], Vt[rb]); return buff; + // Extract/Insert (P9) - use ra as immediate + case 525: snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", "vextractub", Vt[rt], Vt[rb], ra); return buff; + case 589: snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", "vextractuh", Vt[rt], Vt[rb], ra); return buff; + case 653: snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", "vextractuw", Vt[rt], Vt[rb], ra); return buff; + case 717: snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", "vextractd", Vt[rt], Vt[rb], ra); return buff; + case 781: snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", "vinsertb", Vt[rt], Vt[rb], ra); return buff; + case 845: snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", "vinserth", Vt[rt], Vt[rb], ra); return buff; + case 909: snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", "vinsertw", Vt[rt], Vt[rb], ra); return buff; + case 973: snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", "vinsertd", Vt[rt], Vt[rb], ra); return buff; + // Convert + case 778: snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", "vcfux", Vt[rt], Vt[rb], ra); return buff; + case 842: snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", "vcfsx", Vt[rt], Vt[rb], ra); return buff; + case 906: snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", "vctuxs", Vt[rt], Vt[rb], ra); return buff; + case 970: snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", "vctsxs", Vt[rt], Vt[rb], ra); return buff; + } + snprintf(buff, sizeof(buff), "%08X %-9s (vx_xo=%d)", opcode, "vmx???", vx_xo); + return buff; + } + + // ---- opcode 7: MULLI ---- + case 7: + snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", "mulli", Rt[rt], Rt[ra], SI(opcode)); + return buff; + + // ---- opcode 8: SUBFIC ---- + case 8: + snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", "subfic", Rt[rt], Rt[ra], SI(opcode)); + return buff; + + // ---- opcode 10: CMPLI ---- + case 10: { + int bf = BF(opcode); + int l = L_CMP(opcode); + const char* mn = l ? "cmpldi" : "cmplwi"; + if (bf == 0) + snprintf(buff, sizeof(buff), "%-15s %s, %u", mn, Rt[ra], UI(opcode)); + else + snprintf(buff, sizeof(buff), "%-15s %s, %s, %u", mn, CRnames[bf], Rt[ra], UI(opcode)); + return buff; + } + + // ---- opcode 11: CMPI ---- + case 11: { + int bf = BF(opcode); + int l = L_CMP(opcode); + const char* mn = l ? "cmpdi" : "cmpwi"; + if (bf == 0) + snprintf(buff, sizeof(buff), "%-15s %s, %d", mn, Rt[ra], SI(opcode)); + else + snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", mn, CRnames[bf], Rt[ra], SI(opcode)); + return buff; + } + + // ---- opcode 12: ADDIC ---- + case 12: + snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", "addic", Rt[rt], Rt[ra], SI(opcode)); + return buff; + + // ---- opcode 13: ADDIC. 
---- + case 13: + snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", "addic.", Rt[rt], Rt[ra], SI(opcode)); + return buff; + + // ---- opcode 14: ADDI / LI ---- + case 14: + if (ra == 0) + snprintf(buff, sizeof(buff), "%-15s %s, %d", "li", Rt[rt], SI(opcode)); + else + snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", "addi", Rt[rt], Rt[ra], SI(opcode)); + return buff; + + // ---- opcode 15: ADDIS / LIS ---- + case 15: + if (ra == 0) + snprintf(buff, sizeof(buff), "%-15s %s, 0x%X", "lis", Rt[rt], UI(opcode)); + else + snprintf(buff, sizeof(buff), "%-15s %s, %s, 0x%X", "addis", Rt[rt], Rt[ra], UI(opcode)); + return buff; + + // ---- opcode 16: BC (conditional branch) ---- + case 16: { + int bo = BO(opcode); + int bi = BI_F(opcode); + int bd = BD(opcode); + int lk = LK(opcode); + int aa = AA(opcode); + const char* cn = condName(bo, bi); + if (aa) + snprintf(buff, sizeof(buff), "%-15s 0x%X%s", cn, (unsigned)(bd & 0xFFFF), lk ? "l" : ""); + else + snprintf(buff, sizeof(buff), "%-15s #%+d\t; %p%s", cn, bd, (void*)(addr + bd), lk ? " (link)" : ""); + return buff; + } + + // ---- opcode 17: SC ---- + case 17: + snprintf(buff, sizeof(buff), "%-15s", "sc"); + return buff; + + // ---- opcode 18: B/BL/BA/BLA ---- + case 18: { + int li = LI(opcode); + int lk = LK(opcode); + int aa = AA(opcode); + if (aa) + snprintf(buff, sizeof(buff), "%-15s 0x%X", lk ? "bla" : "ba", (unsigned)(li & 0x03FFFFFF)); + else + snprintf(buff, sizeof(buff), "%-15s #%+d\t; %p", lk ? "bl" : "b", li, (void*)(addr + li)); + return buff; + } + + // ---- opcode 19: XL-form (Branch to LR/CTR, CR logical, isync) ---- + case 19: { + int xo = XO_XL(opcode); + int bo = BO(opcode); + int bi = BI_F(opcode); + int lk = LK(opcode); + switch(xo) { + case 16: // BCLR + if (bo == 20 && bi == 0) + snprintf(buff, sizeof(buff), "%-15s", lk ? "blrl" : "blr"); + else { + const char* cn = condName(bo, bi); + snprintf(buff, sizeof(buff), "%slr%s", cn, lk ? "l" : ""); + } + return buff; + case 528: // BCCTR + if (bo == 20 && bi == 0) + snprintf(buff, sizeof(buff), "%-15s", lk ? "bctrl" : "bctr"); + else { + const char* cn = condName(bo, bi); + snprintf(buff, sizeof(buff), "%sctr%s", cn, lk ? 
"l" : ""); + } + return buff; + case 150: + snprintf(buff, sizeof(buff), "%-15s", "isync"); + return buff; + // CR logical + case 257: snprintf(buff, sizeof(buff), "%-15s %d, %d, %d", "crand", rt, ra, rb); return buff; + case 129: snprintf(buff, sizeof(buff), "%-15s %d, %d, %d", "crandc", rt, ra, rb); return buff; + case 449: + if (ra == rb) + snprintf(buff, sizeof(buff), "%-15s %d, %d", "crmove", rt, ra); + else + snprintf(buff, sizeof(buff), "%-15s %d, %d, %d", "cror", rt, ra, rb); + return buff; + case 417: snprintf(buff, sizeof(buff), "%-15s %d, %d, %d", "crorc", rt, ra, rb); return buff; + case 193: + if (ra == rt && rb == rt) + snprintf(buff, sizeof(buff), "%-15s %d", "crclr", rt); + else + snprintf(buff, sizeof(buff), "%-15s %d, %d, %d", "crxor", rt, ra, rb); + return buff; + case 225: snprintf(buff, sizeof(buff), "%-15s %d, %d, %d", "crnand", rt, ra, rb); return buff; + case 33: + if (ra == rb) + snprintf(buff, sizeof(buff), "%-15s %d, %d", "crnot", rt, ra); + else + snprintf(buff, sizeof(buff), "%-15s %d, %d, %d", "crnor", rt, ra, rb); + return buff; + case 289: + if (ra == rt && rb == rt) + snprintf(buff, sizeof(buff), "%-15s %d", "crset", rt); + else + snprintf(buff, sizeof(buff), "%-15s %d, %d, %d", "creqv", rt, ra, rb); + return buff; + } + snprintf(buff, sizeof(buff), "%08X %-9s (xl_xo=%d)", opcode, "xl???", xo); + return buff; + } + + // ---- opcode 20: RLWIMI ---- + case 20: { + int sh = SH_M(opcode); + int mb = MB_M(opcode); + int me = ME_M(opcode); + snprintf(buff, sizeof(buff), "%-15s %s, %s, %d, %d, %d", rc_bit ? "rlwimi." : "rlwimi", Rt[ra], Rt[rs], sh, mb, me); + return buff; + } + + // ---- opcode 21: RLWINM ---- + case 21: { + int sh = SH_M(opcode); + int mb = MB_M(opcode); + int me = ME_M(opcode); + // Detect pseudo-ops + if (mb == 0 && me == (31 - sh)) + snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", rc_bit ? "slwi." : "slwi", Rt[ra], Rt[rs], sh); + else if (sh == (32 - mb) && me == 31) + snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", rc_bit ? "srwi." : "srwi", Rt[ra], Rt[rs], mb); + else if (sh == 0 && me == 31) + snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", rc_bit ? "clrlwi." : "clrlwi", Rt[ra], Rt[rs], mb); + else if (sh == 0 && mb == 0) + snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", rc_bit ? "clrrwi." : "clrrwi", Rt[ra], Rt[rs], 31 - me); + else + snprintf(buff, sizeof(buff), "%-15s %s, %s, %d, %d, %d", rc_bit ? "rlwinm." : "rlwinm", Rt[ra], Rt[rs], sh, mb, me); + return buff; + } + + // ---- opcode 23: RLWNM ---- + case 23: { + int mb = MB_M(opcode); + int me = ME_M(opcode); + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s, %d, %d", rc_bit ? "rlwnm." : "rlwnm", Rt[ra], Rt[rs], Rt[rb], mb, me); + return buff; + } + + // ---- opcode 24: ORI ---- + case 24: + if (rs == 0 && ra == 0 && UI(opcode) == 0) + snprintf(buff, sizeof(buff), "%-15s", "nop"); + else + snprintf(buff, sizeof(buff), "%-15s %s, %s, 0x%X", "ori", Rt[ra], Rt[rs], UI(opcode)); + return buff; + + // ---- opcode 25: ORIS ---- + case 25: + snprintf(buff, sizeof(buff), "%-15s %s, %s, 0x%X", "oris", Rt[ra], Rt[rs], UI(opcode)); + return buff; + + // ---- opcode 26: XORI ---- + case 26: + snprintf(buff, sizeof(buff), "%-15s %s, %s, 0x%X", "xori", Rt[ra], Rt[rs], UI(opcode)); + return buff; + + // ---- opcode 27: XORIS ---- + case 27: + snprintf(buff, sizeof(buff), "%-15s %s, %s, 0x%X", "xoris", Rt[ra], Rt[rs], UI(opcode)); + return buff; + + // ---- opcode 28: ANDI. 
---- + case 28: + snprintf(buff, sizeof(buff), "%-15s %s, %s, 0x%X", "andi.", Rt[ra], Rt[rs], UI(opcode)); + return buff; + + // ---- opcode 29: ANDIS. ---- + case 29: + snprintf(buff, sizeof(buff), "%-15s %s, %s, 0x%X", "andis.", Rt[ra], Rt[rs], UI(opcode)); + return buff; + + // ---- opcode 30: MD/MDS-form (64-bit rotate) ---- + case 30: { + int md_xo = XO_MD(opcode); + int sh = SH_MD(opcode); + int mbe = MB_MD(opcode); + switch(md_xo) { + case 0: // RLDICL + if (sh == 0) + snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", rc_bit ? "clrldi." : "clrldi", Rt[ra], Rt[rs], mbe); + else if (mbe == 0) + snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", rc_bit ? "rotldi." : "rotldi", Rt[ra], Rt[rs], sh); + else if (mbe == (64 - sh)) + snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", rc_bit ? "srdi." : "srdi", Rt[ra], Rt[rs], 64 - sh); + else + snprintf(buff, sizeof(buff), "%-15s %s, %s, %d, %d", rc_bit ? "rldicl." : "rldicl", Rt[ra], Rt[rs], sh, mbe); + return buff; + case 1: // RLDICR + if (mbe == (63 - sh)) + snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", rc_bit ? "sldi." : "sldi", Rt[ra], Rt[rs], sh); + else + snprintf(buff, sizeof(buff), "%-15s %s, %s, %d, %d", rc_bit ? "rldicr." : "rldicr", Rt[ra], Rt[rs], sh, mbe); + return buff; + case 2: // RLDIC + snprintf(buff, sizeof(buff), "%-15s %s, %s, %d, %d", rc_bit ? "rldic." : "rldic", Rt[ra], Rt[rs], sh, mbe); + return buff; + case 3: // RLDIMI + snprintf(buff, sizeof(buff), "%-15s %s, %s, %d, %d", rc_bit ? "rldimi." : "rldimi", Rt[ra], Rt[rs], sh, mbe); + return buff; + } + // MDS-form: xo is bits 1-4 + int mds_xo = XO_MDS(opcode); + switch(mds_xo) { + case 8: // RLDCL + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s, %d", rc_bit ? "rldcl." : "rldcl", Rt[ra], Rt[rs], Rt[rb], mbe); + return buff; + case 9: // RLDCR + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s, %d", rc_bit ? "rldcr." : "rldcr", Rt[ra], Rt[rs], Rt[rb], mbe); + return buff; + } + snprintf(buff, sizeof(buff), "%08X %-9s", opcode, "rld???"); + return buff; + } + + // ---- opcode 31: X/XO/XFX-form (the big one) ---- + case 31: { + int xo10 = XO_X(opcode); // 10-bit extended opcode + int xo9 = XO_XO(opcode); // 9-bit extended opcode (XO-form) + int oe = OE(opcode); + + // XO-form instructions (9-bit xo, OE bit) + // Check XO-form first by matching on the 9-bit xo + // ADD/SUB/MUL/DIV family + switch(xo9) { + case 266: // ADD + if (oe) + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", rc_bit ? "addo." : "addo", Rt[rt], Rt[ra], Rt[rb]); + else + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", rc_bit ? "add." : "add", Rt[rt], Rt[ra], Rt[rb]); + return buff; + case 10: // ADDC + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", rc_bit ? "addc." : "addc", Rt[rt], Rt[ra], Rt[rb]); + return buff; + case 138: // ADDE + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", rc_bit ? "adde." : "adde", Rt[rt], Rt[ra], Rt[rb]); + return buff; + case 202: // ADDZE + snprintf(buff, sizeof(buff), "%-15s %s, %s", rc_bit ? "addze." : "addze", Rt[rt], Rt[ra]); + return buff; + case 40: // SUBF + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", rc_bit ? "subf." : "subf", Rt[rt], Rt[ra], Rt[rb]); + return buff; + case 8: // SUBFC + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", rc_bit ? "subfc." : "subfc", Rt[rt], Rt[ra], Rt[rb]); + return buff; + case 136: // SUBFE + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", rc_bit ? "subfe." : "subfe", Rt[rt], Rt[ra], Rt[rb]); + return buff; + case 200: // SUBFZE + snprintf(buff, sizeof(buff), "%-15s %s, %s", rc_bit ? "subfze." 
: "subfze", Rt[rt], Rt[ra]); + return buff; + case 104: // NEG + snprintf(buff, sizeof(buff), "%-15s %s, %s", rc_bit ? "neg." : "neg", Rt[rt], Rt[ra]); + return buff; + case 235: // MULLW + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", rc_bit ? "mullw." : "mullw", Rt[rt], Rt[ra], Rt[rb]); + return buff; + case 75: // MULHW + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", rc_bit ? "mulhw." : "mulhw", Rt[rt], Rt[ra], Rt[rb]); + return buff; + case 11: // MULHWU + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", rc_bit ? "mulhwu." : "mulhwu", Rt[rt], Rt[ra], Rt[rb]); + return buff; + case 233: // MULLD + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", rc_bit ? "mulld." : "mulld", Rt[rt], Rt[ra], Rt[rb]); + return buff; + case 73: // MULHD + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "mulhd", Rt[rt], Rt[ra], Rt[rb]); + return buff; + case 9: // MULHDU + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "mulhdu", Rt[rt], Rt[ra], Rt[rb]); + return buff; + case 491: // DIVW + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "divw", Rt[rt], Rt[ra], Rt[rb]); + return buff; + case 459: // DIVWU + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "divwu", Rt[rt], Rt[ra], Rt[rb]); + return buff; + case 489: // DIVD + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "divd", Rt[rt], Rt[ra], Rt[rb]); + return buff; + case 457: // DIVDU + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "divdu", Rt[rt], Rt[ra], Rt[rb]); + return buff; + } + + // X-form instructions (10-bit xo) + switch(xo10) { + // Compare + case 0: { + int bf = BF(opcode); + int l = (rt >> 0) & 1; // L bit is bit 21 + l = L_CMP(opcode); + const char* mn = l ? "cmpd" : "cmpw"; + if (bf == 0) + snprintf(buff, sizeof(buff), "%-15s %s, %s", mn, Rt[ra], Rt[rb]); + else + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", mn, CRnames[bf], Rt[ra], Rt[rb]); + return buff; + } + case 32: { + int bf = BF(opcode); + int l = L_CMP(opcode); + const char* mn = l ? "cmpld" : "cmplw"; + if (bf == 0) + snprintf(buff, sizeof(buff), "%-15s %s, %s", mn, Rt[ra], Rt[rb]); + else + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", mn, CRnames[bf], Rt[ra], Rt[rb]); + return buff; + } + // TW/TD + case 4: snprintf(buff, sizeof(buff), "%-15s %d, %s, %s", "tw", rt, Rt[ra], Rt[rb]); return buff; + case 68: snprintf(buff, sizeof(buff), "%-15s %d, %s, %s", "td", rt, Rt[ra], Rt[rb]); return buff; + // MFCR + case 19: { + int fxm = FXM(opcode); + int one_bit = (opcode >> 20) & 1; // bit 20 distinguishes mfcr/mfocrf + if (one_bit) + snprintf(buff, sizeof(buff), "%-15s %s, 0x%02X", "mfocrf", Rt[rt], fxm); + else + snprintf(buff, sizeof(buff), "%-15s %s", "mfcr", Rt[rt]); + return buff; + } + // MTCRF / MTOCRF + case 144: { + int fxm = FXM(opcode); + int one_bit = (opcode >> 20) & 1; + if (one_bit) + snprintf(buff, sizeof(buff), "%-15s 0x%02X, %s", "mtocrf", fxm, Rt[rs]); + else if (fxm == 0xFF) + snprintf(buff, sizeof(buff), "%-15s %s", "mtcr", Rt[rs]); + else + snprintf(buff, sizeof(buff), "%-15s 0x%02X, %s", "mtcrf", fxm, Rt[rs]); + return buff; + } + // Logical + case 28: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", rc_bit ? "and." : "and", Rt[ra], Rt[rs], Rt[rb]); return buff; + case 60: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", rc_bit ? "andc." : "andc", Rt[ra], Rt[rs], Rt[rb]); return buff; + case 444: + if (rs == rb) { + if (rs == ra) + snprintf(buff, sizeof(buff), "%-15s", "nop"); // or r,r,r = nop + else + snprintf(buff, sizeof(buff), "%-15s %s, %s", rc_bit ? "mr." 
: "mr", Rt[ra], Rt[rs]); + } else + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", rc_bit ? "or." : "or", Rt[ra], Rt[rs], Rt[rb]); + return buff; + case 412: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "orc", Rt[ra], Rt[rs], Rt[rb]); return buff; + case 316: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", rc_bit ? "xor." : "xor", Rt[ra], Rt[rs], Rt[rb]); return buff; + case 476: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "nand", Rt[ra], Rt[rs], Rt[rb]); return buff; + case 124: + if (rs == rb) + snprintf(buff, sizeof(buff), "%-15s %s, %s", "not", Rt[ra], Rt[rs]); + else + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "nor", Rt[ra], Rt[rs], Rt[rb]); + return buff; + case 284: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "eqv", Rt[ra], Rt[rs], Rt[rb]); return buff; + // Extend/Count + case 954: snprintf(buff, sizeof(buff), "%-15s %s, %s", rc_bit ? "extsb." : "extsb", Rt[ra], Rt[rs]); return buff; + case 922: snprintf(buff, sizeof(buff), "%-15s %s, %s", rc_bit ? "extsh." : "extsh", Rt[ra], Rt[rs]); return buff; + case 986: snprintf(buff, sizeof(buff), "%-15s %s, %s", rc_bit ? "extsw." : "extsw", Rt[ra], Rt[rs]); return buff; + case 26: snprintf(buff, sizeof(buff), "%-15s %s, %s", "cntlzw", Rt[ra], Rt[rs]); return buff; + case 58: snprintf(buff, sizeof(buff), "%-15s %s, %s", "cntlzd", Rt[ra], Rt[rs]); return buff; + case 538: snprintf(buff, sizeof(buff), "%-15s %s, %s", "cnttzw", Rt[ra], Rt[rs]); return buff; + case 570: snprintf(buff, sizeof(buff), "%-15s %s, %s", "cnttzd", Rt[ra], Rt[rs]); return buff; + case 378: snprintf(buff, sizeof(buff), "%-15s %s, %s", "popcntw", Rt[ra], Rt[rs]); return buff; + case 506: snprintf(buff, sizeof(buff), "%-15s %s, %s", "popcntd", Rt[ra], Rt[rs]); return buff; + case 122: snprintf(buff, sizeof(buff), "%-15s %s, %s", "popcntb", Rt[ra], Rt[rs]); return buff; + // Shift + case 24: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "slw", Rt[ra], Rt[rs], Rt[rb]); return buff; + case 536: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "srw", Rt[ra], Rt[rs], Rt[rb]); return buff; + case 792: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", rc_bit ? "sraw." : "sraw", Rt[ra], Rt[rs], Rt[rb]); return buff; + case 824: snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", rc_bit ? "srawi." : "srawi", Rt[ra], Rt[rs], rb); return buff; + case 27: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "sld", Rt[ra], Rt[rs], Rt[rb]); return buff; + case 539: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "srd", Rt[ra], Rt[rs], Rt[rb]); return buff; + case 794: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", rc_bit ? "srad." : "srad", Rt[ra], Rt[rs], Rt[rb]); return buff; + // SRADI / EXTSWSLI (XS-form, but primary opcode is still 31) + // SRADI xo = 413, in XS encoding + // The 10-bit xo actually is 413<<1|sh5 for XS-form, but we already extract xo10 as bits 1-10 + // For SRADI: bits 2-10 = 413 (9-bit), bit 1 = sh[5] + // XS_XO = bits 2-10 = 9-bit + // Let's check by the 9-bit: (xo10 >> 1) for the top 9 bits + // Actually SRADI occupies xo10 = 826 or 827 (bit 0 = sh[5]) + case 826: case 827: { + int sh6 = SH_XS(opcode); + snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", rc_bit ? "sradi." : "sradi", Rt[ra], Rt[rs], sh6); + return buff; + } + // EXTSWSLI: xo = 445 in XS 9-bit → xo10 = 890 or 891 + case 890: case 891: { + int sh6 = SH_XS(opcode); + snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", rc_bit ? "extswsli." 
: "extswsli", Rt[ra], Rt[rs], sh6); + return buff; + } + // Modulo (P9) + case 267: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "moduw", Rt[rt], Rt[ra], Rt[rb]); return buff; + case 779: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "modsw", Rt[rt], Rt[ra], Rt[rb]); return buff; + case 265: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "modud", Rt[rt], Rt[ra], Rt[rb]); return buff; + case 777: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "modsd", Rt[rt], Rt[ra], Rt[rb]); return buff; + // Load indexed + case 87: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "lbzx", Rt[rt], Rt[ra], Rt[rb]); return buff; + case 279: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "lhzx", Rt[rt], Rt[ra], Rt[rb]); return buff; + case 343: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "lhax", Rt[rt], Rt[ra], Rt[rb]); return buff; + case 23: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "lwzx", Rt[rt], Rt[ra], Rt[rb]); return buff; + case 341: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "lwax", Rt[rt], Rt[ra], Rt[rb]); return buff; + case 21: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "ldx", Rt[rt], Rt[ra], Rt[rb]); return buff; + // Store indexed + case 215: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "stbx", Rt[rs], Rt[ra], Rt[rb]); return buff; + case 407: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "sthx", Rt[rs], Rt[ra], Rt[rb]); return buff; + case 151: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "stwx", Rt[rs], Rt[ra], Rt[rb]); return buff; + case 149: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "stdx", Rt[rs], Rt[ra], Rt[rb]); return buff; + // Byte-reverse load/store + case 790: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "lhbrx", Rt[rt], Rt[ra], Rt[rb]); return buff; + case 534: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "lwbrx", Rt[rt], Rt[ra], Rt[rb]); return buff; + case 532: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "ldbrx", Rt[rt], Rt[ra], Rt[rb]); return buff; + case 918: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "sthbrx", Rt[rs], Rt[ra], Rt[rb]); return buff; + case 662: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "stwbrx", Rt[rs], Rt[ra], Rt[rb]); return buff; + case 660: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "stdbrx", Rt[rs], Rt[ra], Rt[rb]); return buff; + // Load/Store with reservation (atomics) + case 52: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "lbarx", Rt[rt], Rt[ra], Rt[rb]); return buff; + case 116: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "lharx", Rt[rt], Rt[ra], Rt[rb]); return buff; + case 20: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "lwarx", Rt[rt], Rt[ra], Rt[rb]); return buff; + case 84: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "ldarx", Rt[rt], Rt[ra], Rt[rb]); return buff; + case 276: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "lqarx", Rt[rt], Rt[ra], Rt[rb]); return buff; + case 694: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "stbcx.", Rt[rs], Rt[ra], Rt[rb]); return buff; + case 726: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "sthcx.", Rt[rs], Rt[ra], Rt[rb]); return buff; + case 150: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "stwcx.", Rt[rs], Rt[ra], Rt[rb]); return buff; + case 214: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "stdcx.", Rt[rs], Rt[ra], Rt[rb]); return buff; + case 182: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "stqcx.", Rt[rs], Rt[ra], Rt[rb]); return buff; + // FP indexed load/store + case 535: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "lfsx", Ft[rt], Rt[ra], Rt[rb]); return buff; + case 
599: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "lfdx", Ft[rt], Rt[ra], Rt[rb]); return buff; + case 663: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "stfsx", Ft[rs], Rt[ra], Rt[rb]); return buff; + case 727: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "stfdx", Ft[rs], Rt[ra], Rt[rb]); return buff; + // VMX indexed load/store + case 103: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "lvx", Vt[rt], Rt[ra], Rt[rb]); return buff; + case 231: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "stvx", Vt[rs], Rt[ra], Rt[rb]); return buff; + // Sync / barriers + case 598: { + int l_field = (opcode >> 21) & 3; + if (l_field == 0) + snprintf(buff, sizeof(buff), "%-15s", "sync"); + else if (l_field == 1) + snprintf(buff, sizeof(buff), "%-15s", "lwsync"); + else + snprintf(buff, sizeof(buff), "%-15s %d", "sync", l_field); + return buff; + } + case 854: snprintf(buff, sizeof(buff), "%-15s", "eieio"); return buff; + // Cache management + case 54: snprintf(buff, sizeof(buff), "%-15s %s, %s", "dcbst", Rt[ra], Rt[rb]); return buff; + case 86: snprintf(buff, sizeof(buff), "%-15s %s, %s", "dcbf", Rt[ra], Rt[rb]); return buff; + case 982: snprintf(buff, sizeof(buff), "%-15s %s, %s", "icbi", Rt[ra], Rt[rb]); return buff; + // SPR move + case 339: { + int spr = SPR(opcode); + snprintf(buff, sizeof(buff), "%-15s %s, %s", "mfspr", Rt[rt], SPRname(spr)); + return buff; + } + case 467: { + int spr = SPR(opcode); + snprintf(buff, sizeof(buff), "%-15s %s, %s", "mtspr", SPRname(spr), Rt[rs]); + return buff; + } + // ISEL — special encoding: xo = BC<<5 | 15 (bits 1-5 = 01111, bits 6-10 = BC) + // The 10-bit xo for ISEL has bits 1-5 = 01111 = 15 + // So xo10 & 0x1F = 15 and BC = (xo10 >> 5) & 0x1F + default: + if ((xo10 & 0x1F) == 15) { + int bc = (xo10 >> 5) & 0x1F; + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s, %d", "isel", Rt[rt], Rt[ra], Rt[rb], bc); + return buff; + } + break; + } + // XX1-form instructions under opcode 31 + { + int xx1_xo = XX1_XO(opcode); + int xt = XX1_T(opcode); + switch(xx1_xo) { + case 51: snprintf(buff, sizeof(buff), "%-15s %s, %s", "mfvsrd", Rt[ra], VSXname(xt)); return buff; + case 179: snprintf(buff, sizeof(buff), "%-15s %s, %s", "mtvsrd", VSXname(xt), Rt[ra]); return buff; + case 115: snprintf(buff, sizeof(buff), "%-15s %s, %s", "mfvsrwz", Rt[ra], VSXname(xt)); return buff; + case 243: snprintf(buff, sizeof(buff), "%-15s %s, %s", "mtvsrwz", VSXname(xt), Rt[ra]); return buff; + case 435: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "mtvsrdd", VSXname(xt), Rt[ra], Rt[rb]); return buff; + case 307: snprintf(buff, sizeof(buff), "%-15s %s, %s", "mfvsrld", Rt[ra], VSXname(xt)); return buff; + case 268: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "lxvx", VSXname(xt), Rt[ra], Rt[rb]); return buff; + case 396: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "stxvx", VSXname(xt), Rt[ra], Rt[rb]); return buff; + } + } + snprintf(buff, sizeof(buff), "%08X %-9s (xo=%d)", opcode, "x???", xo10); + return buff; + } + + // ---- opcode 32: LWZ ---- + case 32: + snprintf(buff, sizeof(buff), "%-15s %s, %d(%s)", "lwz", Rt[rt], SI(opcode), Rt[ra]); + return buff; + + // ---- opcode 33: LWZU ---- + case 33: + snprintf(buff, sizeof(buff), "%-15s %s, %d(%s)", "lwzu", Rt[rt], SI(opcode), Rt[ra]); + return buff; + + // ---- opcode 34: LBZ ---- + case 34: + snprintf(buff, sizeof(buff), "%-15s %s, %d(%s)", "lbz", Rt[rt], SI(opcode), Rt[ra]); + return buff; + + // ---- opcode 35: LBZU ---- + case 35: + snprintf(buff, sizeof(buff), "%-15s %s, %d(%s)", "lbzu", Rt[rt], SI(opcode),
Rt[ra]); + return buff; + + // ---- opcode 36: STW ---- + case 36: + snprintf(buff, sizeof(buff), "%-15s %s, %d(%s)", "stw", Rt[rs], SI(opcode), Rt[ra]); + return buff; + + // ---- opcode 37: STWU ---- + case 37: + snprintf(buff, sizeof(buff), "%-15s %s, %d(%s)", "stwu", Rt[rs], SI(opcode), Rt[ra]); + return buff; + + // ---- opcode 38: STB ---- + case 38: + snprintf(buff, sizeof(buff), "%-15s %s, %d(%s)", "stb", Rt[rs], SI(opcode), Rt[ra]); + return buff; + + // ---- opcode 39: STBU ---- + case 39: + snprintf(buff, sizeof(buff), "%-15s %s, %d(%s)", "stbu", Rt[rs], SI(opcode), Rt[ra]); + return buff; + + // ---- opcode 40: LHZ ---- + case 40: + snprintf(buff, sizeof(buff), "%-15s %s, %d(%s)", "lhz", Rt[rt], SI(opcode), Rt[ra]); + return buff; + + // ---- opcode 42: LHA ---- + case 42: + snprintf(buff, sizeof(buff), "%-15s %s, %d(%s)", "lha", Rt[rt], SI(opcode), Rt[ra]); + return buff; + + // ---- opcode 44: STH ---- + case 44: + snprintf(buff, sizeof(buff), "%-15s %s, %d(%s)", "sth", Rt[rs], SI(opcode), Rt[ra]); + return buff; + + // ---- opcode 48: LFS ---- + case 48: + snprintf(buff, sizeof(buff), "%-15s %s, %d(%s)", "lfs", Ft[rt], SI(opcode), Rt[ra]); + return buff; + + // ---- opcode 50: LFD ---- + case 50: + snprintf(buff, sizeof(buff), "%-15s %s, %d(%s)", "lfd", Ft[rt], SI(opcode), Rt[ra]); + return buff; + + // ---- opcode 52: STFS ---- + case 52: + snprintf(buff, sizeof(buff), "%-15s %s, %d(%s)", "stfs", Ft[rs], SI(opcode), Rt[ra]); + return buff; + + // ---- opcode 54: STFD ---- + case 54: + snprintf(buff, sizeof(buff), "%-15s %s, %d(%s)", "stfd", Ft[rs], SI(opcode), Rt[ra]); + return buff; + + // ---- opcode 58: DS-form (LD, LDU, LWA) ---- + case 58: { + int ds_xo = DS_XO(opcode); + int offset = DS(opcode); + switch(ds_xo) { + case 0: snprintf(buff, sizeof(buff), "%-15s %s, %d(%s)", "ld", Rt[rt], offset, Rt[ra]); return buff; + case 1: snprintf(buff, sizeof(buff), "%-15s %s, %d(%s)", "ldu", Rt[rt], offset, Rt[ra]); return buff; + case 2: snprintf(buff, sizeof(buff), "%-15s %s, %d(%s)", "lwa", Rt[rt], offset, Rt[ra]); return buff; + } + snprintf(buff, sizeof(buff), "%08X %-9s", opcode, "ds58???"); + return buff; + } + + // ---- opcode 59: A-form single-precision FP ---- + case 59: { + int a_xo = XO_A(opcode); + int frt = rt, fra = ra, frb = rb, frc = FRC(opcode); + switch(a_xo) { + case 18: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", rc_bit ? "fdivs." : "fdivs", Ft[frt], Ft[fra], Ft[frb]); return buff; + case 20: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", rc_bit ? "fsubs." : "fsubs", Ft[frt], Ft[fra], Ft[frb]); return buff; + case 21: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", rc_bit ? "fadds." : "fadds", Ft[frt], Ft[fra], Ft[frb]); return buff; + case 22: snprintf(buff, sizeof(buff), "%-15s %s, %s", rc_bit ? "fsqrts." : "fsqrts", Ft[frt], Ft[frb]); return buff; + case 25: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", rc_bit ? "fmuls." 
: "fmuls", Ft[frt], Ft[fra], Ft[frc]); return buff; + // FCFIDS (opcode 59 xo 846) + // A-form xo is only 5 bits (bits 1-5), but FCFIDS uses X-form under opcode 59 + // Actually FCFIDS: opcode=59, xo=846 in X-form + } + // X-form under opcode 59 (10-bit xo) + int x59_xo = XO_X(opcode); + switch(x59_xo) { + case 846: snprintf(buff, sizeof(buff), "%-15s %s, %s", "fcfids", Ft[rt], Ft[rb]); return buff; + } + snprintf(buff, sizeof(buff), "%08X %-9s (a_xo=%d)", opcode, "fp59???", a_xo); + return buff; + } + + // ---- opcode 60: XX2/XX3/XX4-form (VSX operations) ---- + case 60: { + // Try XX3-form first (8-bit xo, bits 3-10) + int xx3_xo = XX3_XO(opcode); + int xt3 = XX3_T(opcode); + int xa3 = XX3_A(opcode); + int xb3 = XX3_B(opcode); + + switch(xx3_xo) { + // Logical + case 146: + if (xa3 == xb3) + snprintf(buff, sizeof(buff), "%-15s %s, %s", "xxmr", VSXname(xt3), VSXname(xa3)); + else + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xxlor", VSXname(xt3), VSXname(xa3), VSXname(xb3)); + return buff; + case 154: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xxlxor", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 130: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xxland", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 138: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xxlandc", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 162: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xxlnor", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 170: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xxlorc", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 178: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xxlnand", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 186: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xxleqv", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + // Permute/merge — note: dm is encoded in bits 8-9 of the XO, so different dm values + // give different xx3_xo. 
// xxpermdi base=10: xxmrghd = xxpermdi(dm=0) = 10, xxswapd = xxpermdi(dm=2) = 74, xxmrgld = xxpermdi(dm=3) = 106 + case 10: case 42: case 74: case 106: { + int dm = (xx3_xo >> 5) & 3; + if (dm == 0) + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xxmrghd", VSXname(xt3), VSXname(xa3), VSXname(xb3)); + else if (dm == 2 && xa3 == xb3) + snprintf(buff, sizeof(buff), "%-15s %s, %s", "xxswapd", VSXname(xt3), VSXname(xa3)); + else if (dm == 3) + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xxmrgld", VSXname(xt3), VSXname(xa3), VSXname(xb3)); + else + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s, %d", "xxpermdi", VSXname(xt3), VSXname(xa3), VSXname(xb3), dm); + return buff; + } + // FP double vector arithmetic + case 96: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xvadddp", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 104: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xvsubdp", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 112: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xvmuldp", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 120: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xvdivdp", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 224: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xvmaxdp", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 232: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xvmindp", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 99: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xvcmpeqdp", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 115: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xvcmpgedp", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 107: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xvcmpgtdp", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + // FP single vector arithmetic + case 64: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xvaddsp", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 72: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xvsubsp", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 80: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xvmulsp", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 88: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xvdivsp", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 192: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xvmaxsp", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 200: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xvminsp", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 67: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xvcmpeqsp", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 83: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xvcmpgesp", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 75: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xvcmpgtsp", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + // Scalar FP double arithmetic + case 32: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xsadddp", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 40: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xssubdp", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 48: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xsmuldp", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 56: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xsdivdp", VSXname(xt3), VSXname(xa3), VSXname(xb3));
return buff; + case 160: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xsmaxdp", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 168: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xsmindp", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + // Scalar FP single arithmetic + case 0: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xsaddsp", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 8: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xssubsp", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 16: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xsmulsp", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 24: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xsdivsp", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + // Scalar compare + case 35: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xscmpudp", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + case 43: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "xscmpodp", VSXname(xt3), VSXname(xa3), VSXname(xb3)); return buff; + } + + // XX4-form (bits 4-5 = xo) + int xx4_xo = XX4_XO(opcode); + if (xx4_xo == 3) { + int xt4 = XX4_T(opcode); + int xa4 = XX4_A(opcode); + int xb4 = XX4_B(opcode); + int xc4 = XX4_C(opcode); + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s, %s", "xxsel", VSXname(xt4), VSXname(xa4), VSXname(xb4), VSXname(xc4)); + return buff; + } + + // XX2-form (9-bit xo, bits 2-10) + int xx2_xo = XX2_XO(opcode); + int xt2 = XX2_T(opcode); + int xb2 = XX2_B(opcode); + switch(xx2_xo) { + // Byte-reverse — xxbrh/xxbrw/xxbrd/xxbrq all share the 9-bit xo 475; + // the 5-bit field in the RA slot (7/15/23/31) selects the element width + case 475: + switch(ra) { + case 7: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xxbrh", VSXname(xt2), VSXname(xb2)); return buff; + case 15: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xxbrw", VSXname(xt2), VSXname(xb2)); return buff; + case 23: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xxbrd", VSXname(xt2), VSXname(xb2)); return buff; + case 31: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xxbrq", VSXname(xt2), VSXname(xb2)); return buff; + } + break; + // Splat word + case 164: snprintf(buff, sizeof(buff), "%-15s %s, %s, %d", "xxspltw", VSXname(xt2), VSXname(xb2), XX2_UIM(opcode)); return buff; + // Vector sqrt + case 203: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xvsqrtdp", VSXname(xt2), VSXname(xb2)); return buff; + case 139: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xvsqrtsp", VSXname(xt2), VSXname(xb2)); return buff; + // Scalar sqrt + case 11: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xssqrtsp", VSXname(xt2), VSXname(xb2)); return buff; + case 75: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xssqrtdp", VSXname(xt2), VSXname(xb2)); return buff; + // Reciprocal estimate + case 154: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xvresp", VSXname(xt2), VSXname(xb2)); return buff; + case 138: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xvrsqrtesp", VSXname(xt2), VSXname(xb2)); return buff; + // Scalar convert + case 265: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xscvdpsp", VSXname(xt2), VSXname(xb2)); return buff; + case 267: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xscvdpspn", VSXname(xt2), VSXname(xb2)); return buff; + case 329: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xscvspdp", VSXname(xt2), VSXname(xb2)); return buff; + case 331: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xscvspdpn", VSXname(xt2), VSXname(xb2)); return buff; + case 376: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xscvsxddp", VSXname(xt2), VSXname(xb2)); return buff; + case 312: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xscvsxdsp", VSXname(xt2),
VSXname(xb2)); return buff; + case 344: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xscvdpsxds", VSXname(xt2), VSXname(xb2)); return buff; + case 88: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xscvdpsxws", VSXname(xt2), VSXname(xb2)); return buff; + case 328: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xscvdpuxds", VSXname(xt2), VSXname(xb2)); return buff; + case 360: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xscvuxddp", VSXname(xt2), VSXname(xb2)); return buff; + // Vector convert + case 393: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xvcvdpsp", VSXname(xt2), VSXname(xb2)); return buff; + case 457: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xvcvspdp", VSXname(xt2), VSXname(xb2)); return buff; + case 504: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xvcvsxddp", VSXname(xt2), VSXname(xb2)); return buff; + case 472: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xvcvdpsxds", VSXname(xt2), VSXname(xb2)); return buff; + case 216: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xvcvdpsxws", VSXname(xt2), VSXname(xb2)); return buff; + case 248: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xvcvsxwdp", VSXname(xt2), VSXname(xb2)); return buff; + case 152: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xvcvspsxws", VSXname(xt2), VSXname(xb2)); return buff; + case 184: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xvcvsxwsp", VSXname(xt2), VSXname(xb2)); return buff; + case 136: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xvcvspuxws", VSXname(xt2), VSXname(xb2)); return buff; + // Vector round double + case 201: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xvrdpi", VSXname(xt2), VSXname(xb2)); return buff; + case 217: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xvrdpiz", VSXname(xt2), VSXname(xb2)); return buff; + case 235: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xvrdpic", VSXname(xt2), VSXname(xb2)); return buff; + case 233: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xvrdpip", VSXname(xt2), VSXname(xb2)); return buff; + case 249: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xvrdpim", VSXname(xt2), VSXname(xb2)); return buff; + // Vector round single + case 137: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xvrspi", VSXname(xt2), VSXname(xb2)); return buff; + case 153: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xvrspiz", VSXname(xt2), VSXname(xb2)); return buff; + case 171: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xvrspic", VSXname(xt2), VSXname(xb2)); return buff; + case 169: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xvrspip", VSXname(xt2), VSXname(xb2)); return buff; + case 185: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xvrspim", VSXname(xt2), VSXname(xb2)); return buff; + // Scalar round double + case 73: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xsrdpi", VSXname(xt2), VSXname(xb2)); return buff; + case 89: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xsrdpiz", VSXname(xt2), VSXname(xb2)); return buff; + case 107: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xsrdpic", VSXname(xt2), VSXname(xb2)); return buff; + case 105: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xsrdpip", VSXname(xt2), VSXname(xb2)); return buff; + case 121: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xsrdpim", VSXname(xt2), VSXname(xb2)); return buff; + // Vector abs/neg + case 409: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xvabssp", VSXname(xt2), VSXname(xb2)); return buff; + case 441: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xvnegsp", VSXname(xt2), VSXname(xb2)); return buff; + case 473: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xvabsdp", VSXname(xt2), 
VSXname(xb2)); return buff; + case 505: snprintf(buff, sizeof(buff), "%-15s %s, %s", "xvnegdp", VSXname(xt2), VSXname(xb2)); return buff; + } + + // XXSPLTIB: 60 | T(5) | 00 | IMM8(8) | 0101101000 | TX(1) + // bits 1-10 = 0101101000 = 360, XX2_XO = bits 2-10 = 010110100 = 180 + if (xx2_xo == 180) { // XXSPLTIB + int imm8 = (opcode >> 11) & 0xFF; + snprintf(buff, sizeof(buff), "%-15s %s, %d", "xxspltib", VSXname(xt2), imm8); + return buff; + } + + snprintf(buff, sizeof(buff), "%08X %-9s (xx3=%d,xx2=%d)", opcode, "vsx60???", xx3_xo, xx2_xo); + return buff; + } + + // ---- opcode 61: DQ-form (LXV, STXV) ---- + case 61: { + int dq_xo = opcode & 0x7; // bits 0-2 + int tx = (opcode >> 3) & 1; // TX is instruction bit 3 (IBM bit 28) + int t6 = (tx << 5) | rt; // XT = 32*TX + T + int dq = (int16_t)(opcode & 0xFFF0); + switch(dq_xo) { + case 1: snprintf(buff, sizeof(buff), "%-15s %s, %d(%s)", "lxv", VSXname(t6), dq, Rt[ra]); return buff; + case 5: snprintf(buff, sizeof(buff), "%-15s %s, %d(%s)", "stxv", VSXname(t6), dq, Rt[ra]); return buff; + } + snprintf(buff, sizeof(buff), "%08X %-9s (dq_xo=%d)", opcode, "dq???", dq_xo); + return buff; + } + + // ---- opcode 62: DS-form (STD, STDU) ---- + case 62: { + int ds_xo = DS_XO(opcode); + int offset = DS(opcode); + switch(ds_xo) { + case 0: snprintf(buff, sizeof(buff), "%-15s %s, %d(%s)", "std", Rt[rs], offset, Rt[ra]); return buff; + case 1: snprintf(buff, sizeof(buff), "%-15s %s, %d(%s)", "stdu", Rt[rs], offset, Rt[ra]); return buff; + } + snprintf(buff, sizeof(buff), "%08X %-9s", opcode, "ds62???"); + return buff; + } + + // ---- opcode 63: X/A-form double-precision FP and FPSCR ---- + case 63: { + // A-form first (5-bit xo, bits 1-5) + int a_xo = XO_A(opcode); + int frt = rt, fra = ra, frb = rb, frc = FRC(opcode); + switch(a_xo) { + case 18: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", rc_bit ? "fdiv." : "fdiv", Ft[frt], Ft[fra], Ft[frb]); return buff; + case 20: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", rc_bit ? "fsub." : "fsub", Ft[frt], Ft[fra], Ft[frb]); return buff; + case 21: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", rc_bit ? "fadd." : "fadd", Ft[frt], Ft[fra], Ft[frb]); return buff; + case 22: snprintf(buff, sizeof(buff), "%-15s %s, %s", rc_bit ? "fsqrt." : "fsqrt", Ft[frt], Ft[frb]); return buff; + case 23: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s, %s", rc_bit ? "fsel." : "fsel", Ft[frt], Ft[fra], Ft[frc], Ft[frb]); return buff; + case 25: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", rc_bit ? "fmul." : "fmul", Ft[frt], Ft[fra], Ft[frc]); return buff; + case 28: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s, %s", rc_bit ? "fmsub." : "fmsub", Ft[frt], Ft[fra], Ft[frc], Ft[frb]); return buff; + case 29: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s, %s", rc_bit ? "fmadd." : "fmadd", Ft[frt], Ft[fra], Ft[frc], Ft[frb]); return buff; + case 30: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s, %s", rc_bit ? "fnmsub." : "fnmsub", Ft[frt], Ft[fra], Ft[frc], Ft[frb]); return buff; + case 31: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s, %s", rc_bit ? "fnmadd."
: "fnmadd", Ft[frt], Ft[fra], Ft[frc], Ft[frb]); return buff; + } + // X-form (10-bit xo) + int x63_xo = XO_X(opcode); + switch(x63_xo) { + // FP compare + case 0: { + int bf = BF(opcode); + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "fcmpu", CRnames[bf], Ft[ra], Ft[rb]); + return buff; + } + case 32: { + int bf = BF(opcode); + snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "fcmpo", CRnames[bf], Ft[ra], Ft[rb]); + return buff; + } + // FP move/convert + case 72: snprintf(buff, sizeof(buff), "%-15s %s, %s", rc_bit ? "fmr." : "fmr", Ft[frt], Ft[frb]); return buff; + case 264: snprintf(buff, sizeof(buff), "%-15s %s, %s", "fabs", Ft[frt], Ft[frb]); return buff; + case 136: snprintf(buff, sizeof(buff), "%-15s %s, %s", "fnabs", Ft[frt], Ft[frb]); return buff; + case 40: snprintf(buff, sizeof(buff), "%-15s %s, %s", "fneg", Ft[frt], Ft[frb]); return buff; + case 8: snprintf(buff, sizeof(buff), "%-15s %s, %s, %s", "fcpsgn", Ft[frt], Ft[fra], Ft[frb]); return buff; + case 12: snprintf(buff, sizeof(buff), "%-15s %s, %s", rc_bit ? "frsp." : "frsp", Ft[frt], Ft[frb]); return buff; + // Convert + case 814: snprintf(buff, sizeof(buff), "%-15s %s, %s", "fctid", Ft[frt], Ft[frb]); return buff; + case 815: snprintf(buff, sizeof(buff), "%-15s %s, %s", "fctidz", Ft[frt], Ft[frb]); return buff; + case 14: snprintf(buff, sizeof(buff), "%-15s %s, %s", "fctiw", Ft[frt], Ft[frb]); return buff; + case 15: snprintf(buff, sizeof(buff), "%-15s %s, %s", "fctiwz", Ft[frt], Ft[frb]); return buff; + case 846: snprintf(buff, sizeof(buff), "%-15s %s, %s", "fcfid", Ft[frt], Ft[frb]); return buff; + case 974: snprintf(buff, sizeof(buff), "%-15s %s, %s", "fcfidu", Ft[frt], Ft[frb]); return buff; + case 942: snprintf(buff, sizeof(buff), "%-15s %s, %s", "fctidu", Ft[frt], Ft[frb]); return buff; + case 943: snprintf(buff, sizeof(buff), "%-15s %s, %s", "fctiduz", Ft[frt], Ft[frb]); return buff; + // Round + case 392: snprintf(buff, sizeof(buff), "%-15s %s, %s", "frin", Ft[frt], Ft[frb]); return buff; + case 424: snprintf(buff, sizeof(buff), "%-15s %s, %s", "friz", Ft[frt], Ft[frb]); return buff; + case 456: snprintf(buff, sizeof(buff), "%-15s %s, %s", "frip", Ft[frt], Ft[frb]); return buff; + case 488: snprintf(buff, sizeof(buff), "%-15s %s, %s", "frim", Ft[frt], Ft[frb]); return buff; + // FPSCR + case 583: snprintf(buff, sizeof(buff), "%-15s %s", rc_bit ? "mffs." : "mffs", Ft[frt]); return buff; + case 711: { + int flm = FLM(opcode); + snprintf(buff, sizeof(buff), "%-15s 0x%02X, %s", rc_bit ? "mtfsf." 
: "mtfsf", flm, Ft[frb]); + return buff; + } + case 134: { + int bf = BF(opcode); + int u = (opcode >> 12) & 0xF; + snprintf(buff, sizeof(buff), "%-15s %d, %d", "mtfsfi", bf, u); + return buff; + } + case 70: snprintf(buff, sizeof(buff), "%-15s %d", "mtfsb0", rt); return buff; + case 38: snprintf(buff, sizeof(buff), "%-15s %d", "mtfsb1", rt); return buff; + } + snprintf(buff, sizeof(buff), "%08X %-9s (x63_xo=%d,a_xo=%d)", opcode, "fp63???", x63_xo, a_xo); + return buff; + } + + default: + break; + } + + // Unknown instruction + snprintf(buff, sizeof(buff), "%08X ???", opcode); + return buff; +} diff --git a/src/dynarec/ppc64le/ppc64le_printer.h b/src/dynarec/ppc64le/ppc64le_printer.h new file mode 100644 index 0000000000..10d5055109 --- /dev/null +++ b/src/dynarec/ppc64le/ppc64le_printer.h @@ -0,0 +1,8 @@ +#ifndef __PPC64LE_PRINTER_H__ +#define __PPC64LE_PRINTER_H__ + +#include + +const char* ppc64le_print(uint32_t opcode, uint64_t addr); + +#endif //__PPC64LE_PRINTER_H__ diff --git a/src/dynarec/ppc64le/ppc64le_prolog.S b/src/dynarec/ppc64le/ppc64le_prolog.S new file mode 100644 index 0000000000..f6df8f4e6f --- /dev/null +++ b/src/dynarec/ppc64le/ppc64le_prolog.S @@ -0,0 +1,135 @@ +// PPC64LE prolog for dynarec +// Save callee-saved registers, load x86 state from emu struct, +// then jump to the dynarec block. +// Called with: +// r3 = pointer to x64emu_t (emu) +// r4 = address to jump to (dynarec block entry) +// +// ELFv2 ABI: r1=SP, r2=TOC (preserved), r13=TLS (preserved) +// Callee-saved GPRs: r14-r31 +// Callee-saved VMX: vr20-vr31 + +#define ASM_MAPPING +#include "ppc64le_mapping.h" + +.text +.align 4 + +.global ppc64le_prolog +.type ppc64le_prolog, @function +ppc64le_prolog: + // Save LR + mflr 0 + std 0, 16(1) + + // Stack frame: 352 bytes (16-byte aligned) + // Layout (from new SP): + // 0: back chain + // 8: CR save + // 16: LR save (for functions called from dynarec) + // 24: TOC save + // 32: r14-r31 save area (144 bytes = 18*8) + // 176: jump_next slot (16 bytes = 2*8) + // 192: old xSPSave (8 bytes, for nested dynarec) + // 200: padding (8 bytes, for alignment) + // 208: vr20 save area (16 bytes, callee-saved VMX) + // 224: vr24 save area (16 bytes, callee-saved VMX, MMX0) + // 240: vr25 save area (16 bytes, callee-saved VMX, MMX1) + // 256: vr26 save area (16 bytes, callee-saved VMX, MMX2) + // 272: vr27 save area (16 bytes, callee-saved VMX, MMX3) + // 288: vr28 save area (16 bytes, callee-saved VMX, MMX4) + // 304: vr29 save area (16 bytes, callee-saved VMX, MMX5) + // 320: vr30 save area (16 bytes, callee-saved VMX, MMX6) + // 336: vr31 save area (16 bytes, callee-saved VMX, MMX7) + + stdu 1, -352(1) // allocate stack frame + + // Save callee-saved GPRs + std 14, 32(1) + std 15, 40(1) + std 16, 48(1) + std 17, 56(1) + std 18, 64(1) + std 19, 72(1) + std 20, 80(1) + std 21, 88(1) + std 22, 96(1) + std 23, 104(1) + std 24, 112(1) + std 25, 120(1) + std 26, 128(1) + std 27, 136(1) + std 28, 144(1) + std 29, 152(1) + std 30, 160(1) + std 31, 168(1) + + // Save CR + mfcr 0 + stw 0, 8(1) + + // Save callee-saved VMX registers + // vr20 (= vs52): scratch register index 28 + stxv 52, 208(1) + // vr24-vr31 (= vs56-vs63): MMX registers 0-7 + stxv 56, 224(1) + stxv 57, 240(1) + stxv 58, 256(1) + stxv 59, 272(1) + stxv 60, 288(1) + stxv 61, 304(1) + stxv 62, 320(1) + stxv 63, 336(1) + + // Save TOC (r2) + std 2, 24(1) + + // Setup: load x86 register state from emu struct + // emu struct layout: regs[0..15] = RAX,RCX,RDX,RBX,RSP,RBP,RSI,RDI,R8..R15 + // Each is 8 bytes, starting at 
offset 0 in x64emu_t + // r3 = emu pointer + + // Store emu pointer first (needed for xSPSave below) + mr Emu, 3 // Emu = r31 + + // Save the current frame SP to emu->xSPSave so the epilog can find it. + // r12 (xSavedSP) is caller-saved and gets clobbered by native calls, + // so we store the frame base in the emu struct instead (like RV64). + // Also save the old xSPSave value on the stack for nested dynarec calls. + ld 0, 808(Emu) // old xSPSave value + std 1, 808(Emu) // emu->xSPSave = current frame SP + std 0, 192(1) // save old xSPSave at frame offset 192 + + // Load x86 registers from emu into mapped PPC64LE registers + ld RAX, (8 * 0)(Emu) // RAX = r14 + ld RCX, (8 * 1)(Emu) // RCX = r15 + ld RDX, (8 * 2)(Emu) // RDX = r16 + ld RBX, (8 * 3)(Emu) // RBX = r17 + ld RSP, (8 * 4)(Emu) // RSP = r18 + ld RBP, (8 * 5)(Emu) // RBP = r19 + ld RSI, (8 * 6)(Emu) // RSI = r20 + ld RDI, (8 * 7)(Emu) // RDI = r21 + ld R8, (8 * 8)(Emu) // R8 = r22 + ld R9, (8 * 9)(Emu) // R9 = r23 + ld R10, (8 * 10)(Emu) // R10 = r24 + ld R11, (8 * 11)(Emu) // R11 = r25 + ld R12, (8 * 12)(Emu) // R12 = r26 + ld R13, (8 * 13)(Emu) // R13 = r27 + ld R14, (8 * 14)(Emu) // R14 = r28 + ld R15, (8 * 15)(Emu) // R15 = r29 + + // Load flags (at offset 16*8 = 128) + ld Flags, (8 * 16)(Emu) // Flags = r30 + + // Load RIP into r9 (xRIP) + ld RIP, (8 * 17)(Emu) // RIP = r9 + + // Push a jump_next slot on stack (2 doublewords of zeros) + li 0, 0 + std 0, 176(1) // jump_next[0] = 0 + std 0, 184(1) // jump_next[1] = 0 + + // Jump to dynarec block (address in r4) + mtctr 4 + bctr // branch to CTR (no link) +.size ppc64le_prolog, .-ppc64le_prolog diff --git a/src/emu/x64emu_private.h b/src/emu/x64emu_private.h index 1f4ed2d9f9..58ff56ad4e 100644 --- a/src/emu/x64emu_private.h +++ b/src/emu/x64emu_private.h @@ -74,7 +74,7 @@ typedef struct x64emu_s { x87control_t cw; uint16_t dummy_cw; // align... mmxcontrol_t mxcsr; - #ifdef RV64 // it would be better to use a dedicated register for this like arm64 xSavedSP, but we're running out of free registers. + #if defined(RV64) || defined(PPC64LE) // no spare callee-saved register for xSavedSP, store in emu struct instead uintptr_t xSPSave; // sp base value of current dynarec frame, used by call/ret optimization to reset stack when unmatch. 
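+ // (Lifecycle sketch, from the prolog above and the signal paths below: the prolog stashes the previous xSPSave value in its own frame (offset 192) and stores the new frame SP here; the signal handlers restore it from old_savedsp so nested dynarec frames unwind to the right stack.)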
#endif fpu_ld_t fpu_ld[8]; // for long double emulation / 80bits fld fst @@ -122,7 +122,7 @@ typedef struct x64emu_s { void* init_stack; // initial stack (owned or not) uint32_t size_stack; // stack size (owned or not) JUMPBUFF* jmpbuf; - #ifdef RV64 + #if defined(RV64) || defined(PPC64LE) uintptr_t old_savedsp; #endif diff --git a/src/emu/x64runf0.c b/src/emu/x64runf0.c index afeb994202..5e804498f4 100644 --- a/src/emu/x64runf0.c +++ b/src/emu/x64runf0.c @@ -791,7 +791,7 @@ uintptr_t RunF0(x64emu_t *emu, rex_t rex, uintptr_t addr) GETGD; #if defined(DYNAREC) && !defined(TEST_INTERPRETER) if (rex.w) { -#if defined(__riscv) || defined(__loongarch64) +#if defined(__riscv) || defined(__loongarch64) || defined(__powerpc64__) #if defined(__loongarch64) if (cpuext.scq) { do { diff --git a/src/include/box64context.h b/src/include/box64context.h index 841b62ba22..fb1a270587 100644 --- a/src/include/box64context.h +++ b/src/include/box64context.h @@ -181,7 +181,7 @@ typedef struct box64context_s { rbtree_t* db_sizes; int trace_dynarec; pthread_mutex_t mutex_lock; // this is for the Test interpreter - #if defined(__riscv) || defined(__loongarch64) + #if defined(__riscv) || defined(__loongarch64) || defined(__powerpc64__) uint32_t mutex_16b; #endif #endif diff --git a/src/include/hostext.h b/src/include/hostext.h index 0b36876397..325ac17d68 100644 --- a/src/include/hostext.h +++ b/src/include/hostext.h @@ -43,6 +43,12 @@ typedef union cpu_ext_s { uint64_t scq : 1; uint64_t frecipe : 1; uint64_t lasx : 1; +#elif defined(PPC64LE) + // POWER9 (ISA 3.0) is our minimum target + uint64_t crypto : 1; // PPC_FEATURE2_VEC_CRYPTO — vcipher/vncipher/vsbox/vpmsumb + uint64_t darn : 1; // PPC_FEATURE2_DARN — hardware random number + uint64_t isa31 : 1; // PPC_FEATURE2_ARCH_3_1 — POWER10 / ISA 3.1 + uint64_t mma : 1; // PPC_FEATURE2_MMA — Matrix-Multiply Assist (POWER10) #endif }; uint64_t x; diff --git a/src/include/sigtools.h b/src/include/sigtools.h index f2bfe52f6d..c604d03b43 100644 --- a/src/include/sigtools.h +++ b/src/include/sigtools.h @@ -17,6 +17,10 @@ #include "dynarec/rv64/rv64_mapping.h" #define CONTEXT_REG(P, X) (P)->uc_mcontext.__gregs[X] #define CONTEXT_PC(P) (P)->uc_mcontext.__gregs[REG_PC] +#elif defined(PPC64LE) +#include "dynarec/ppc64le/ppc64le_mapping.h" +#define CONTEXT_REG(P, X) (P)->uc_mcontext.gp_regs[X] +#define CONTEXT_PC(P) (P)->uc_mcontext.gp_regs[PT_NIP] #else #error Unsupported Architecture #endif //arch diff --git a/src/libtools/signal32.c b/src/libtools/signal32.c index a99c9c856a..c510d97df2 100644 --- a/src/libtools/signal32.c +++ b/src/libtools/signal32.c @@ -48,6 +48,10 @@ #include "dynarec/rv64/rv64_mapping.h" #define CONTEXT_REG(P, X) P->uc_mcontext.__gregs[X] #define CONTEXT_PC(P) P->uc_mcontext.__gregs[REG_PC] +#elif defined(PPC64LE) +#include "dynarec/ppc64le/ppc64le_mapping.h" +#define CONTEXT_REG(P, X) P->uc_mcontext.gp_regs[X] +#define CONTEXT_PC(P) P->uc_mcontext.gp_regs[PT_NIP] #else #error Unsupported Architecture #endif //arch @@ -754,7 +758,7 @@ int my_sigactionhandler_oldcode_32(x64emu_t* emu, int32_t sig, int simple, sigin if(Locks & is_dyndump_locked) CancelBlock64(1); #endif - #ifdef RV64 + #if defined(RV64) || defined(PPC64LE) emu->xSPSave = emu->old_savedsp; #endif #ifdef DYNAREC @@ -814,6 +818,8 @@ void my32_sigactionhandler(int32_t sig, siginfo_t* info, void * ucntx) void * pc = (void*)p->uc_mcontext.__pc; #elif defined(RV64) void * pc = (void*)p->uc_mcontext.__gregs[0]; + #elif defined(PPC64LE) + void * pc = (void*)p->uc_mcontext.gp_regs[PT_NIP]; #else 
#error Unsupported architecture #endif diff --git a/src/libtools/signals.c index 4cd5301f09..cd4b2e5ba5 100644 --- a/src/libtools/signals.c +++ b/src/libtools/signals.c @@ -559,7 +559,7 @@ int my_sigactionhandler_oldcode_64(x64emu_t* emu, int32_t sig, int simple, sigin if(Locks & is_dyndump_locked) CancelBlock64(1); #endif - #ifdef RV64 + #if defined(RV64) || defined(PPC64LE) emu->xSPSave = emu->old_savedsp; #endif #ifdef DYNAREC @@ -882,6 +882,9 @@ void my_box64signalhandler(int32_t sig, siginfo_t* info, void * ucntx) emu->test.clean = 0; // use "3" to regen a dynablock at current pc (else it will first do an interp run) dynablock_leave_runtime(db); + #if defined(RV64) || defined(PPC64LE) + emu->xSPSave = emu->old_savedsp; + #endif #ifdef ANDROID siglongjmp(*(JUMPBUFF*)emu->jmpbuf, 3); #else @@ -954,6 +957,9 @@ void my_box64signalhandler(int32_t sig, siginfo_t* info, void * ucntx) emu->test.clean = 0; // will restore unblocked Signal flags too dynablock_leave_runtime(db); + #if defined(RV64) || defined(PPC64LE) + emu->xSPSave = emu->old_savedsp; + #endif #ifdef ANDROID siglongjmp(*(JUMPBUFF*)emu->jmpbuf, 2); #else diff --git a/src/libtools/sigtools.c index d9c3f620eb..62a3a174ff 100644 --- a/src/libtools/sigtools.c +++ b/src/libtools/sigtools.c @@ -630,6 +630,7 @@ void adjustregs(x64emu_t* emu, void* pc) } #elif defined(LA64) #elif defined(RV64) +#elif defined(PPC64LE) #else #error Unsupported architecture #endif diff --git a/src/os/hostext_common.c index 6e526d6aff..845dd1e1d0 100644 --- a/src/os/hostext_common.c +++ b/src/os/hostext_common.c @@ -65,6 +65,17 @@ void PrintHostCpuFeatures(void) // if(cpuext.xtheadfmv) printf_log_prefix(0, LOG_INFO, " xtheadfmv"); if (cpuext.xtheadvector) printf_log_prefix(0, LOG_INFO, "_xthvector"); printf_log_prefix(0, LOG_INFO, "\n"); +#elif defined(PPC64LE) + printf_log(LOG_INFO, "Dynarec for PPC64LE (POWER9+, ISA 3.0)"); + if(cpuext.crypto) + printf_log_prefix(0, LOG_INFO, " CRYPTO"); + if(cpuext.darn) + printf_log_prefix(0, LOG_INFO, " DARN"); + if(cpuext.isa31) + printf_log_prefix(0, LOG_INFO, " ISA3.1"); + if(cpuext.mma) + printf_log_prefix(0, LOG_INFO, " MMA"); + printf_log_prefix(0, LOG_INFO, "\n"); #endif } #endif \ No newline at end of file diff --git a/src/os/hostext_linux.c index db2bd38da5..3194e030e6 100644 --- a/src/os/hostext_linux.c +++ b/src/os/hostext_linux.c @@ -9,6 +9,11 @@ #include #endif +#ifdef PPC64LE +#include <sys/auxv.h> +#include <asm/cputable.h> +#endif + #ifdef RV64 #include #include @@ -255,6 +260,33 @@ int DetectHostCpuFeatures(void) } } } +#elif defined(PPC64LE) + unsigned long hwcap = real_getauxval(AT_HWCAP); + unsigned long hwcap2 = real_getauxval(AT_HWCAP2); + // Minimum: ALTIVEC + VSX + ISA 3.0 (POWER9) + if (!(hwcap & PPC_FEATURE_HAS_ALTIVEC)) return 0; + if (!(hwcap & PPC_FEATURE_HAS_VSX)) return 0; + if (!(hwcap2 & PPC_FEATURE2_ARCH_3_00)) return 0; + if (p == NULL || p[0] != '1') { + if (hwcap2 & PPC_FEATURE2_VEC_CRYPTO) + cpuext.crypto = 1; + if (hwcap2 & PPC_FEATURE2_DARN) + cpuext.darn = 1; + if (hwcap2 & PPC_FEATURE2_ARCH_3_1) + cpuext.isa31 = 1; + if (hwcap2 & PPC_FEATURE2_MMA) + cpuext.mma = 1; + if (p) { + p = strtok(p, ","); + while (p) { + if (!strcasecmp(p, "crypto")) cpuext.crypto = 0; + if (!strcasecmp(p, "darn")) cpuext.darn = 0; + if (!strcasecmp(p, "isa31")) cpuext.isa31 = 0; + if (!strcasecmp(p, "mma")) cpuext.mma = 0; + p = strtok(NULL, ","); + } + } + } #endif return 1; } diff --git a/src/tools/env.c index
c159a148c7..e02c49b8ab 100644 --- a/src/tools/env.c +++ b/src/tools/env.c @@ -326,6 +326,8 @@ static void freeEnv(box64env_t* env) #define ENV_ARCH "rv64" #elif defined(LA64) #define ENV_ARCH "la64" +#elif defined(PPC64LE) +#define ENV_ARCH "ppc64le" #elif defined(X86_64) #define ENV_ARCH "x86_64" #else @@ -901,6 +903,8 @@ const char* GetDynacacheFolder(mapping_t* mapping) #define ARCH_VERSION SET_VERSION(0, 0, 4) #elif defined(LA64) #define ARCH_VERSION SET_VERSION(0, 0, 5) +#elif defined(PPC64LE) +#define ARCH_VERSION SET_VERSION(0, 0, 1) #else #error meh! #endif diff --git a/src/wrapped/generated/functions_list.txt b/src/wrapped/generated/functions_list.txt index a6fcefb485..5f6c4f8ab0 100644 --- a/src/wrapped/generated/functions_list.txt +++ b/src/wrapped/generated/functions_list.txt @@ -4075,6 +4075,8 @@ #() iFEpuvvppp -> iFEpuppp wrappedalure: wrappedalut: +wrappedandroidshmem: +wrappedandroidsupport: wrappedanl: wrappedatk: - vFp: @@ -5198,6 +5200,7 @@ wrappedgtkx112: - pFpipppppppi: - gtk_toolbar_insert_element wrappedharfbuzzsubset: +wrappediconv: wrappedicui18n64: wrappedicui18n66: wrappedicui18n67: @@ -7100,6 +7103,7 @@ wrappedtcmallocminimal: - pFpLiiil: - mmap - mmap64 +wrappedtermuxexec: wrappedudev0: wrappedudev1: - vFpp: diff --git a/src/wrapped/wrappedldlinux.c b/src/wrapped/wrappedldlinux.c index 64a178c1d4..af52bb61fe 100644 --- a/src/wrapped/wrappedldlinux.c +++ b/src/wrapped/wrappedldlinux.c @@ -41,7 +41,9 @@ void stSetup(box64context_t* context) #ifdef STATICBUILD #include extern void* __libc_enable_secure; +#ifndef PPC64LE extern void* __stack_chk_guard; +#endif //extern void* __pointer_chk_guard; //extern void* _rtld_global; //extern void* _rtld_global_ro; diff --git a/src/wrapped/wrappedldlinux_private.h b/src/wrapped/wrappedldlinux_private.h index 6e2b6fe592..84a13f1381 100644 --- a/src/wrapped/wrappedldlinux_private.h +++ b/src/wrapped/wrappedldlinux_private.h @@ -26,5 +26,9 @@ DATA(__pointer_chk_guard, sizeof(void*)) DATA(_rtld_global, sizeof(void*)) DATA(_rtld_global_ro, sizeof(void*)) #endif +#ifndef PPC64LE DATA(__stack_chk_guard, sizeof(void*)) +#else +//DATA(__stack_chk_guard, sizeof(void*)) +#endif GOM(__tls_get_addr, pFEp) diff --git a/src/wrapped/wrappedlibc.c b/src/wrapped/wrappedlibc.c index cae35ad52f..fb774611d6 100644 --- a/src/wrapped/wrappedlibc.c +++ b/src/wrapped/wrappedlibc.c @@ -794,6 +794,10 @@ int of_unconvert(int a) if(!O_LARGEFILE) { if((a&(0400000))==(0400000)) {a&=~(0400000); b|=(X86_O_LARGEFILE);} } + #elif defined(PPC64LE) + if(!O_LARGEFILE) { + if((a&(0200000))==(0200000)) {a&=~(0200000); b|=(X86_O_LARGEFILE);} + } #else if(!O_LARGEFILE) missing |= X86_O_LARGEFILE; #endif diff --git a/src/wrapped32/wrappedlibc.c b/src/wrapped32/wrappedlibc.c index 977d7f13da..3908e759d1 100755 --- a/src/wrapped32/wrappedlibc.c +++ b/src/wrapped32/wrappedlibc.c @@ -830,6 +830,10 @@ int of_unconvert32(int a) if(!O_LARGEFILE) { if((a&(0400000))==(0400000)) {a&=~(0400000); b|=(X86_O_LARGEFILE);} } + #elif defined(PPC64LE) + if(!O_LARGEFILE) { + if((a&(0200000))==(0200000)) {a&=~(0200000); b|=(X86_O_LARGEFILE);} + } #else if(!O_LARGEFILE) missing |= X86_O_LARGEFILE; #endif
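
As a quick way to eyeball the new disassembler outside the emulator, a host-side harness along the lines of the sketch below can be compiled against ppc64le_printer.c. This is hypothetical and not part of the diff: the file name and main() are invented here, and only the ppc64le_print() prototype from ppc64le_printer.h is assumed. The instruction words are standard, well-known PPC64 encodings, so the printed mnemonics can be checked by eye.

// test_ppc64le_printer.c — hypothetical smoke test, not part of this diff.
#include <stdio.h>
#include <stdint.h>
#include "ppc64le_printer.h"

int main(void)
{
    // Well-known PPC64 instruction words and the mnemonic each should yield.
    static const struct { uint32_t op; const char* want; } tests[] = {
        { 0x60000000u, "nop" }, // ori 0,0,0 is printed as its nop pseudo-op
        { 0x4E800020u, "blr" }, // bclr with bo=20, bi=0, lk=0
        { 0x7C642A14u, "add" }, // add r3, r4, r5 (XO-form, xo9 = 266)
        { 0x38600001u, "li"  }, // addi r3, 0, 1 is printed as li
        { 0x7C832378u, "mr"  }, // or r3, r4, r4 is printed as mr
    };
    for (unsigned i = 0; i < sizeof(tests) / sizeof(tests[0]); ++i) {
        const char* got = ppc64le_print(tests[i].op, 0x10000u);
        printf("%08X -> %-30s (expected mnemonic: %s)\n", tests[i].op, got, tests[i].want);
    }
    return 0;
}

Assuming the printer has no other target-specific dependencies, this builds with any host compiler, e.g. cc test_ppc64le_printer.c src/dynarec/ppc64le/ppc64le_printer.c.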