diff --git a/src/Makefile b/src/Makefile
index 71ca028cd..a5bac2404 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -37,6 +37,8 @@ CC= $(DEFAULT_CC)
 # unwinding are not affected -- the assembler part has frame unwind
 # information and GCC emits it where needed (x64) or with -g (see CCDEBUG).
 CCOPT= -O2 -fomit-frame-pointer
+# only apply SSE4.2 to target or else host minilua will fail on non-sse4.2 hardware
+# TARGET_CFLAGS=-msse4.2
 # Use this if you want to generate a smaller binary (but it's slower):
 #CCOPT= -Os -fomit-frame-pointer
 # Note: it's no longer recommended to use -O3 with GCC 4.x.
diff --git a/src/lib_jit.c b/src/lib_jit.c
index 8768a20cc..b6efa4147 100644
--- a/src/lib_jit.c
+++ b/src/lib_jit.c
@@ -659,114 +659,15 @@ JIT_PARAMDEF(JIT_PARAMINIT)
 };
 #endif
 
-#if LJ_TARGET_ARM && LJ_TARGET_LINUX
-#include <sys/utsname.h>
-#endif
-
-/* Arch-dependent CPU detection. */
-static uint32_t jit_cpudetect(lua_State *L)
-{
-  uint32_t flags = 0;
-#if LJ_TARGET_X86ORX64
-  uint32_t vendor[4];
-  uint32_t features[4];
-  if (lj_vm_cpuid(0, vendor) && lj_vm_cpuid(1, features)) {
-#if !LJ_HASJIT
-#define JIT_F_SSE2 2
-#endif
-    flags |= ((features[3] >> 26)&1) * JIT_F_SSE2;
-#if LJ_HASJIT
-    flags |= ((features[2] >> 0)&1) * JIT_F_SSE3;
-    flags |= ((features[2] >> 19)&1) * JIT_F_SSE4_1;
-    if (vendor[2] == 0x6c65746e) {  /* Intel. */
-      if ((features[0] & 0x0fff0ff0) == 0x000106c0)  /* Atom. */
-        flags |= JIT_F_LEA_AGU;
-    } else if (vendor[2] == 0x444d4163) {  /* AMD. */
-      uint32_t fam = (features[0] & 0x0ff00f00);
-      if (fam >= 0x00000f00)  /* K8, K10. */
-        flags |= JIT_F_PREFER_IMUL;
-    }
-    if (vendor[0] >= 7) {
-      uint32_t xfeatures[4];
-      lj_vm_cpuid(7, xfeatures);
-      flags |= ((xfeatures[1] >> 8)&1) * JIT_F_BMI2;
-    }
-#endif
-  }
-  /* Check for required instruction set support on x86 (unnecessary on x64). */
-#if LJ_TARGET_X86
-  if (!(flags & JIT_F_SSE2))
-    luaL_error(L, "CPU with SSE2 required");
-#endif
-#elif LJ_TARGET_ARM
-#if LJ_HASJIT
-  int ver = LJ_ARCH_VERSION;  /* Compile-time ARM CPU detection. */
-#if LJ_TARGET_LINUX
-  if (ver < 70) {  /* Runtime ARM CPU detection. */
-    struct utsname ut;
-    uname(&ut);
-    if (strncmp(ut.machine, "armv", 4) == 0) {
-      if (ut.machine[4] >= '7')
-        ver = 70;
-      else if (ut.machine[4] == '6')
-        ver = 60;
-    }
-  }
-#endif
-  flags |= ver >= 70 ? JIT_F_ARMV7 :
-           ver >= 61 ? JIT_F_ARMV6T2_ :
-           ver >= 60 ? JIT_F_ARMV6_ : 0;
-  flags |= LJ_ARCH_HASFPU == 0 ? 0 : ver >= 70 ? JIT_F_VFPV3 : JIT_F_VFPV2;
-#endif
-#elif LJ_TARGET_ARM64
-  /* No optional CPU features to detect (for now). */
-#elif LJ_TARGET_PPC
-#if LJ_HASJIT
-#if LJ_ARCH_SQRT
-  flags |= JIT_F_SQRT;
-#endif
-#if LJ_ARCH_ROUND
-  flags |= JIT_F_ROUND;
-#endif
-#endif
-#elif LJ_TARGET_MIPS
-#if LJ_HASJIT
-  /* Compile-time MIPS CPU detection. */
-#if LJ_ARCH_VERSION >= 20
-  flags |= JIT_F_MIPSXXR2;
-#endif
-  /* Runtime MIPS CPU detection. */
-#if defined(__GNUC__)
-  if (!(flags & JIT_F_MIPSXXR2)) {
-    int x;
-#ifdef __mips16
-    x = 0;  /* Runtime detection is difficult. Ensure optimal -march flags. */
-#else
-    /* On MIPS32R1 rotr is treated as srl. rotr r2,r2,1 -> srl r2,r2,1. */
-    __asm__("li $2, 1\n\t.long 0x00221042\n\tmove %0, $2" : "=r"(x) : : "$2");
-#endif
-    if (x) flags |= JIT_F_MIPSXXR2;  /* Either 0x80000000 (R2) or 0 (R1). */
-  }
-#endif
-#endif
-#else
-#error "Missing CPU detection for this architecture"
-#endif
-  UNUSED(L);
-  return flags;
-}
 
 /* Initialize JIT compiler. */
 static void jit_init(lua_State *L)
 {
-  uint32_t flags = jit_cpudetect(L);
 #if LJ_HASJIT
   jit_State *J = L2J(L);
-  J->flags = flags | JIT_F_ON | JIT_F_OPT_DEFAULT;
+  J->flags = J->flags | JIT_F_ON | JIT_F_OPT_DEFAULT;
   memcpy(J->param, jit_param_default, sizeof(J->param));
   lj_dispatch_update(G(L));
-#else
-  UNUSED(flags);
 #endif
 }
 
diff --git a/src/lj_jit.h b/src/lj_jit.h
index 2fea3859b..ef28dacc3 100644
--- a/src/lj_jit.h
+++ b/src/lj_jit.h
@@ -20,6 +20,7 @@
 #define JIT_F_PREFER_IMUL   0x00000080
 #define JIT_F_LEA_AGU       0x00000100
 #define JIT_F_BMI2          0x00000200
+#define JIT_F_SSE4_2        0x00000400
 
 /* Names for the CPU-specific flags. Must match the order above. */
 #define JIT_F_CPU_FIRST     JIT_F_SSE2
diff --git a/src/lj_state.c b/src/lj_state.c
index 632dd07e5..04001dd14 100644
--- a/src/lj_state.c
+++ b/src/lj_state.c
@@ -180,6 +180,106 @@ static void close_state(lua_State *L)
   g->allocf(g->allocd, G2GG(g), sizeof(GG_State), 0);
 }
 
+#if LJ_TARGET_ARM && LJ_TARGET_LINUX
+#include <sys/utsname.h>
+#endif
+
+/* Arch-dependent CPU detection. */
+static uint32_t _cpudetect(lua_State *L)
+{
+  uint32_t flags = 0;
+#if LJ_TARGET_X86ORX64
+  uint32_t vendor[4];
+  uint32_t features[4];
+  if (lj_vm_cpuid(0, vendor) && lj_vm_cpuid(1, features)) {
+#if !LJ_HASJIT
+#define JIT_F_SSE2 2
+#endif
+    flags |= ((features[3] >> 26)&1) * JIT_F_SSE2;
+#if LJ_HASJIT
+    flags |= ((features[2] >> 0)&1) * JIT_F_SSE3;
+    flags |= ((features[2] >> 19)&1) * JIT_F_SSE4_1;
+    flags |= ((features[2] >> 20)&1) * JIT_F_SSE4_2;
+    if (vendor[2] == 0x6c65746e) {  /* Intel. */
+      if ((features[0] & 0x0fff0ff0) == 0x000106c0)  /* Atom. */
+        flags |= JIT_F_LEA_AGU;
+    } else if (vendor[2] == 0x444d4163) {  /* AMD. */
+      uint32_t fam = (features[0] & 0x0ff00f00);
+      if (fam >= 0x00000f00)  /* K8, K10. */
+        flags |= JIT_F_PREFER_IMUL;
+    }
+    if (vendor[0] >= 7) {
+      uint32_t xfeatures[4];
+      lj_vm_cpuid(7, xfeatures);
+      flags |= ((xfeatures[1] >> 8)&1) * JIT_F_BMI2;
+    }
+#endif
+  }
+  /* Check for required instruction set support on x86 (unnecessary on x64). */
+#if LJ_TARGET_X86
+  if (!(flags & JIT_F_SSE2))
+    luaL_error(L, "CPU with SSE2 required");
+#endif
+#elif LJ_TARGET_ARM
+#if LJ_HASJIT
+  int ver = LJ_ARCH_VERSION;  /* Compile-time ARM CPU detection. */
+#if LJ_TARGET_LINUX
+  if (ver < 70) {  /* Runtime ARM CPU detection. */
+    struct utsname ut;
+    uname(&ut);
+    if (strncmp(ut.machine, "armv", 4) == 0) {
+      if (ut.machine[4] >= '7')
+        ver = 70;
+      else if (ut.machine[4] == '6')
+        ver = 60;
+    }
+  }
+#endif
+  flags |= ver >= 70 ? JIT_F_ARMV7 :
+           ver >= 61 ? JIT_F_ARMV6T2_ :
+           ver >= 60 ? JIT_F_ARMV6_ : 0;
+  flags |= LJ_ARCH_HASFPU == 0 ? 0 : ver >= 70 ? JIT_F_VFPV3 : JIT_F_VFPV2;
+#endif
+#elif LJ_TARGET_ARM64
+  /* No optional CPU features to detect (for now). */
+#elif LJ_TARGET_PPC
+#if LJ_HASJIT
+#if LJ_ARCH_SQRT
+  flags |= JIT_F_SQRT;
+#endif
+#if LJ_ARCH_ROUND
+  flags |= JIT_F_ROUND;
+#endif
+#endif
+#elif LJ_TARGET_MIPS
+#if LJ_HASJIT
+  /* Compile-time MIPS CPU detection. */
+#if LJ_ARCH_VERSION >= 20
+  flags |= JIT_F_MIPSXXR2;
+#endif
+  /* Runtime MIPS CPU detection. */
+#if defined(__GNUC__)
+  if (!(flags & JIT_F_MIPSXXR2)) {
+    int x;
+#ifdef __mips16
+    x = 0;  /* Runtime detection is difficult. Ensure optimal -march flags. */
+#else
+    /* On MIPS32R1 rotr is treated as srl. rotr r2,r2,1 -> srl r2,r2,1. */
+    __asm__("li $2, 1\n\t.long 0x00221042\n\tmove %0, $2" : "=r"(x) : : "$2");
+#endif
+    if (x) flags |= JIT_F_MIPSXXR2;  /* Either 0x80000000 (R2) or 0 (R1). */
+  }
+#endif
+#endif
+#else
+#error "Missing CPU detection for this architecture"
+#endif
+  UNUSED(L);
+  return flags;
+}
+
+extern void x64_init_random();
+
 #if LJ_64 && !LJ_GC64 && !(defined(LUAJIT_USE_VALGRIND) && defined(LUAJIT_USE_SYSMALLOC))
 lua_State *lj_state_newstate(lua_Alloc f, void *ud)
 #else
@@ -188,7 +288,20 @@ LUA_API lua_State *lua_newstate(lua_Alloc f, void *ud)
 {
   GG_State *GG = (GG_State *)f(ud, NULL, 0, sizeof(GG_State));
   lua_State *L = &GG->L;
+
+  /* detect cpu features as early as possible */
+  /* and init random table if we have SSE4.2 support */
+  uint32_t flags = _cpudetect(L);
+
+#if defined(__SSE4_2__)
+  if (flags & JIT_F_SSE4_2)
+  {
+    x64_init_random();
+  }
+#endif
+
   global_State *g = &GG->g;
+
   if (GG == NULL || !checkptrGC(GG)) return NULL;
   memset(GG, 0, sizeof(GG_State));
   L->gct = ~LJ_TTHREAD;
@@ -219,6 +332,9 @@ LUA_API lua_State *lua_newstate(lua_Alloc f, void *ud)
   g->gc.stepmul = LUAI_GCMUL;
   lj_dispatch_init((GG_State *)L);
   L->status = LUA_ERRERR+1;  /* Avoid touching the stack upon memory error. */
+
+  G2J(g)->flags = flags;  /* copy detected flags to jit state */
+
   if (lj_vm_cpcall(L, NULL, NULL, cpluaopen) != 0) {
     /* Memory allocation error: free partial state. */
     close_state(L);
diff --git a/src/lj_str.c b/src/lj_str.c
index b9469ca00..b72c98e7d 100644
--- a/src/lj_str.c
+++ b/src/lj_str.c
@@ -11,6 +11,7 @@
 #include "lj_err.h"
 #include "lj_str.h"
 #include "lj_char.h"
+#include "lj_dispatch.h"  /* for G2J */
 
 /* -- String helpers ------------------------------------------------------ */
 
@@ -165,12 +166,6 @@ lj_str_indep_hash(GCstr *str) {
 
 #include "x64/src/lj_str_hash_x64.h"
 
-#if defined(LJ_ARCH_STR_HASH)
-#define LJ_STR_HASH LJ_ARCH_STR_HASH
-#else
-#define LJ_STR_HASH lj_str_original_hash
-#endif
-
 /* Intern a string and return string object. */
 GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx)
 {
@@ -187,7 +182,19 @@ GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx)
     return &g->strempty;
   }
 
-  h = LJ_STR_HASH(str, lenx);
+  /* switch between sse and non-sse hash branches */
+#if defined(__SSE4_2__)
+  if ((G2J(g)->flags & JIT_F_SSE4_2))
+  {
+    h = lj_str_sse_hash(str, lenx);
+  }
+  else
+  {
+    h = lj_str_original_hash(str, lenx);
+  }
+#else
+  h = lj_str_original_hash(str, lenx);
+#endif
 
   /* Check if the string has already been interned. */
   o = gcref(g->strhash[h & g->strmask]);
diff --git a/src/x64/src/lj_str_hash_x64.h b/src/x64/src/lj_str_hash_x64.h
index 063f631c7..058d2038e 100644
--- a/src/x64/src/lj_str_hash_x64.h
+++ b/src/x64/src/lj_str_hash_x64.h
@@ -144,7 +144,7 @@ static LJ_AINLINE uint32_t log2_floor(uint32_t n)
 /* This function is to populate `random_pos` such that random_pos[i][*]
  * contains random value in the range of [2**i, 2**(i+1)).
  */
-static void x64_init_random(void)
+void x64_init_random(void)
 {
   int i, seed, rml;
 
@@ -185,11 +185,6 @@ static void x64_init_random(void)
 }
 #undef POW2_MASK
 
-void __attribute__((constructor)) x64_init_random_constructor()
-{
-  x64_init_random();
-}
-
 /* Return a pre-computed random number in the range of [1**chunk_sz_order,
It is "unsafe" in the sense that the return value * may be greater than chunk-size; it is up to the caller to make sure @@ -246,7 +241,7 @@ static LJ_NOINLINE uint32_t lj_str_hash_128_above(const char* str, } /* NOTE: the "len" should not be zero */ -static LJ_AINLINE uint32_t lj_str_hash(const char* str, size_t len) +static LJ_AINLINE uint32_t lj_str_sse_hash(const char* str, size_t len) { if (len < 128) { if (len >= 16) { /* [16, 128) */ @@ -264,8 +259,5 @@ static LJ_AINLINE uint32_t lj_str_hash(const char* str, size_t len) return lj_str_hash_128_above(str, len); } -#define LJ_ARCH_STR_HASH lj_str_hash -#else -#undef LJ_ARCH_STR_HASH #endif #endif /*_LJ_STR_HASH_X64_H_*/ diff --git a/src/x64/test/unit_test.sh b/src/x64/test/unit_test.sh old mode 100644 new mode 100755