diff --git a/ext/json/fbuffer/fbuffer.h b/ext/json/fbuffer/fbuffer.h index 7d57a87b14ff5e..ecba61465e46c0 100644 --- a/ext/json/fbuffer/fbuffer.h +++ b/ext/json/fbuffer/fbuffer.h @@ -1,55 +1,9 @@ #ifndef _FBUFFER_H_ #define _FBUFFER_H_ -#include "ruby.h" -#include "ruby/encoding.h" +#include "../json.h" #include "../vendor/jeaiii-ltoa.h" -/* shims */ -/* This is the fallback definition from Ruby 3.4 */ - -#ifndef RBIMPL_STDBOOL_H -#if defined(__cplusplus) -# if defined(HAVE_STDBOOL_H) && (__cplusplus >= 201103L) -# include <cstdbool> -# endif -#elif defined(HAVE_STDBOOL_H) -# include <stdbool.h> -#elif !defined(HAVE__BOOL) -typedef unsigned char _Bool; -# define bool _Bool -# define true ((_Bool)+1) -# define false ((_Bool)+0) -# define __bool_true_false_are_defined -#endif -#endif - -#ifndef NOINLINE -#if defined(__has_attribute) && __has_attribute(noinline) -#define NOINLINE() __attribute__((noinline)) -#else -#define NOINLINE() -#endif -#endif - -#ifndef RB_UNLIKELY -#define RB_UNLIKELY(expr) expr -#endif - -#ifndef RB_LIKELY -#define RB_LIKELY(expr) expr -#endif - -#ifndef MAYBE_UNUSED -# define MAYBE_UNUSED(x) x -#endif - -#ifdef RUBY_DEBUG -#ifndef JSON_DEBUG -#define JSON_DEBUG RUBY_DEBUG -#endif -#endif - enum fbuffer_type { FBUFFER_HEAP_ALLOCATED = 0, FBUFFER_STACK_ALLOCATED = 1, @@ -290,4 +244,4 @@ static VALUE fbuffer_finalize(FBuffer *fb) } } -#endif +#endif // _FBUFFER_H_ diff --git a/ext/json/generator/generator.c b/ext/json/generator/generator.c index 024a8572726098..56aa636ef87cc0 100644 --- a/ext/json/generator/generator.c +++ b/ext/json/generator/generator.c @@ -1,4 +1,4 @@ -#include "ruby.h" +#include "../json.h" #include "../fbuffer/fbuffer.h" #include "../vendor/fpconv.c" @@ -36,10 +36,6 @@ typedef struct JSON_Generator_StateStruct { bool strict; } JSON_Generator_State; -#ifndef RB_UNLIKELY -#define RB_UNLIKELY(cond) (cond) -#endif - static VALUE mJSON, cState, cFragment, eGeneratorError, eNestingError, Encoding_UTF_8; static ID i_to_s, i_to_json, i_new, i_pack, 
i_unpack, i_create_id, i_extend, i_encode; @@ -85,10 +81,7 @@ static void generate_json_fragment(FBuffer *buffer, struct generate_json_data *d static int usascii_encindex, utf8_encindex, binary_encindex; -#ifdef RBIMPL_ATTR_NORETURN -RBIMPL_ATTR_NORETURN() -#endif -static void raise_generator_error_str(VALUE invalid_object, VALUE str) +NORETURN(static void) raise_generator_error_str(VALUE invalid_object, VALUE str) { rb_enc_associate_index(str, utf8_encindex); VALUE exc = rb_exc_new_str(eGeneratorError, str); @@ -96,13 +89,10 @@ static void raise_generator_error_str(VALUE invalid_object, VALUE str) rb_exc_raise(exc); } -#ifdef RBIMPL_ATTR_NORETURN -RBIMPL_ATTR_NORETURN() -#endif #ifdef RBIMPL_ATTR_FORMAT RBIMPL_ATTR_FORMAT(RBIMPL_PRINTF_FORMAT, 2, 3) #endif -static void raise_generator_error(VALUE invalid_object, const char *fmt, ...) +NORETURN(static void) raise_generator_error(VALUE invalid_object, const char *fmt, ...) { va_list args; va_start(args, fmt); @@ -137,13 +127,7 @@ typedef struct _search_state { #endif /* HAVE_SIMD */ } search_state; -#if (defined(__GNUC__ ) || defined(__clang__)) -#define FORCE_INLINE __attribute__((always_inline)) -#else -#define FORCE_INLINE -#endif - -static inline FORCE_INLINE void search_flush(search_state *search) +static ALWAYS_INLINE() void search_flush(search_state *search) { // Do not remove this conditional without profiling, specifically escape-heavy text. // escape_UTF8_char_basic will advance search->ptr and search->cursor (effectively a search_flush). 
@@ -186,7 +170,7 @@ static inline unsigned char search_escape_basic(search_state *search) return 0; } -static inline FORCE_INLINE void escape_UTF8_char_basic(search_state *search) +static ALWAYS_INLINE() void escape_UTF8_char_basic(search_state *search) { const unsigned char ch = (unsigned char)*search->ptr; switch (ch) { @@ -273,7 +257,7 @@ static inline void escape_UTF8_char(search_state *search, unsigned char ch_len) #ifdef HAVE_SIMD -static inline FORCE_INLINE char *copy_remaining_bytes(search_state *search, unsigned long vec_len, unsigned long len) +static ALWAYS_INLINE() char *copy_remaining_bytes(search_state *search, unsigned long vec_len, unsigned long len) { // Flush the buffer so everything up until the last 'len' characters are unflushed. search_flush(search); @@ -296,7 +280,7 @@ static inline FORCE_INLINE char *copy_remaining_bytes(search_state *search, unsi #ifdef HAVE_SIMD_NEON -static inline FORCE_INLINE unsigned char neon_next_match(search_state *search) +static ALWAYS_INLINE() unsigned char neon_next_match(search_state *search) { uint64_t mask = search->matches_mask; uint32_t index = trailing_zeros64(mask) >> 2; @@ -410,7 +394,7 @@ static inline unsigned char search_escape_basic_neon(search_state *search) #ifdef HAVE_SIMD_SSE2 -static inline FORCE_INLINE unsigned char sse2_next_match(search_state *search) +static ALWAYS_INLINE() unsigned char sse2_next_match(search_state *search) { int mask = search->matches_mask; int index = trailing_zeros(mask); @@ -434,7 +418,7 @@ static inline FORCE_INLINE unsigned char sse2_next_match(search_state *search) #define TARGET_SSE2 #endif -static inline TARGET_SSE2 FORCE_INLINE unsigned char search_escape_basic_sse2(search_state *search) +static inline TARGET_SSE2 ALWAYS_INLINE() unsigned char search_escape_basic_sse2(search_state *search) { if (RB_UNLIKELY(search->has_matches)) { // There are more matches if search->matches_mask > 0. 
diff --git a/ext/json/json.h b/ext/json/json.h new file mode 100644 index 00000000000000..873440527dec60 --- /dev/null +++ b/ext/json/json.h @@ -0,0 +1,85 @@ +#ifndef _JSON_H_ +#define _JSON_H_ + +#include "ruby.h" +#include "ruby/encoding.h" + +#if defined(RUBY_DEBUG) && RUBY_DEBUG +# define JSON_ASSERT RUBY_ASSERT +#else +# ifdef JSON_DEBUG +# include <assert.h> +# define JSON_ASSERT(x) assert(x) +# else +# define JSON_ASSERT(x) +# endif +#endif + +/* shims */ + +#if SIZEOF_UINT64_T == SIZEOF_LONG_LONG +# define INT64T2NUM(x) LL2NUM(x) +# define UINT64T2NUM(x) ULL2NUM(x) +#elif SIZEOF_UINT64_T == SIZEOF_LONG +# define INT64T2NUM(x) LONG2NUM(x) +# define UINT64T2NUM(x) ULONG2NUM(x) +#else +# error No uint64_t conversion +#endif + +/* This is the fallback definition from Ruby 3.4 */ +#ifndef RBIMPL_STDBOOL_H +#if defined(__cplusplus) +# if defined(HAVE_STDBOOL_H) && (__cplusplus >= 201103L) +# include <cstdbool> +# endif +#elif defined(HAVE_STDBOOL_H) +# include <stdbool.h> +#elif !defined(HAVE__BOOL) +typedef unsigned char _Bool; +# define bool _Bool +# define true ((_Bool)+1) +# define false ((_Bool)+0) +# define __bool_true_false_are_defined +#endif +#endif + +#ifndef NORETURN +#define NORETURN(x) x +#endif + +#ifndef NOINLINE +#if defined(__has_attribute) && __has_attribute(noinline) +#define NOINLINE(x) __attribute__((noinline)) x +#else +#define NOINLINE(x) x +#endif +#endif + +#ifndef ALWAYS_INLINE +#if defined(__has_attribute) && __has_attribute(always_inline) +#define ALWAYS_INLINE(x) inline __attribute__((always_inline)) x +#else +#define ALWAYS_INLINE(x) inline x +#endif +#endif + +#ifndef RB_UNLIKELY +#define RB_UNLIKELY(expr) expr +#endif + +#ifndef RB_LIKELY +#define RB_LIKELY(expr) expr +#endif + +#ifndef MAYBE_UNUSED +# define MAYBE_UNUSED(x) x +#endif + +#ifdef RUBY_DEBUG +#ifndef JSON_DEBUG +#define JSON_DEBUG RUBY_DEBUG +#endif +#endif + +#endif // _JSON_H_ \ No newline at end of file diff --git a/ext/json/parser/extconf.rb b/ext/json/parser/extconf.rb index 
dc1c8952c6e6cd..cda385767c3cc7 100644 --- a/ext/json/parser/extconf.rb +++ b/ext/json/parser/extconf.rb @@ -3,6 +3,7 @@ $defs << "-DJSON_DEBUG" if ENV["JSON_DEBUG"] have_func("rb_enc_interned_str", "ruby/encoding.h") # RUBY_VERSION >= 3.0 +have_func("rb_str_to_interned_str", "ruby.h") # RUBY_VERSION >= 3.0 have_func("rb_hash_new_capa", "ruby.h") # RUBY_VERSION >= 3.2 have_func("rb_hash_bulk_insert", "ruby.h") # Missing on TruffleRuby have_func("strnlen", "string.h") # Missing on Solaris 10 diff --git a/ext/json/parser/parser.c b/ext/json/parser/parser.c index 8206716d705cb8..25eeb89e773f1b 100644 --- a/ext/json/parser/parser.c +++ b/ext/json/parser/parser.c @@ -1,46 +1,7 @@ -#include "ruby.h" -#include "ruby/encoding.h" +#include "../json.h" #include "../vendor/ryu.h" - -/* shims */ -/* This is the fallback definition from Ruby 3.4 */ - -#ifndef RBIMPL_STDBOOL_H -#if defined(__cplusplus) -# if defined(HAVE_STDBOOL_H) && (__cplusplus >= 201103L) -# include <cstdbool> -# endif -#elif defined(HAVE_STDBOOL_H) -# include <stdbool.h> -#elif !defined(HAVE__BOOL) -typedef unsigned char _Bool; -# define bool _Bool -# define true ((_Bool)+1) -# define false ((_Bool)+0) -# define __bool_true_false_are_defined -#endif -#endif - -#if SIZEOF_UINT64_T == SIZEOF_LONG_LONG -# define INT64T2NUM(x) LL2NUM(x) -# define UINT64T2NUM(x) ULL2NUM(x) -#elif SIZEOF_UINT64_T == SIZEOF_LONG -# define INT64T2NUM(x) LONG2NUM(x) -# define UINT64T2NUM(x) ULONG2NUM(x) -#else -# error No uint64_t conversion -#endif - #include "../simd/simd.h" -#ifndef RB_UNLIKELY -#define RB_UNLIKELY(expr) expr -#endif - -#ifndef RB_LIKELY -#define RB_LIKELY(expr) expr -#endif - static VALUE mJSON, eNestingError, Encoding_UTF_8; static VALUE CNaN, CInfinity, CMinusInfinity; @@ -55,7 +16,7 @@ static int utf8_encindex; #ifndef HAVE_RB_HASH_BULK_INSERT // For TruffleRuby -void +static void rb_hash_bulk_insert(long count, const VALUE *pairs, VALUE hash) { long index = 0; @@ -72,6 +33,12 @@ rb_hash_bulk_insert(long count, const VALUE *pairs, 
VALUE hash) #define rb_hash_new_capa(n) rb_hash_new() #endif +#ifndef HAVE_RB_STR_TO_INTERNED_STR +static VALUE rb_str_to_interned_str(VALUE str) +{ + return rb_funcall(rb_str_freeze(str), i_uminus, 0); +} +#endif /* name cache */ @@ -129,104 +96,54 @@ static inline int rstring_cache_cmp(const char *str, const long length, VALUE rs static VALUE rstring_cache_fetch(rvalue_cache *cache, const char *str, const long length) { - if (RB_UNLIKELY(length > JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH)) { - // Common names aren't likely to be very long. So we just don't - // cache names above an arbitrary threshold. - return Qfalse; - } - - if (RB_UNLIKELY(!rb_isalpha((unsigned char)str[0]))) { - // Simple heuristic, if the first character isn't a letter, - // we're much less likely to see this string again. - // We mostly want to cache strings that are likely to be repeated. - return Qfalse; - } - int low = 0; int high = cache->length - 1; - int mid = 0; - int last_cmp = 0; while (low <= high) { - mid = (high + low) >> 1; + int mid = (high + low) >> 1; VALUE entry = cache->entries[mid]; - last_cmp = rstring_cache_cmp(str, length, entry); + int cmp = rstring_cache_cmp(str, length, entry); - if (last_cmp == 0) { + if (cmp == 0) { return entry; - } else if (last_cmp > 0) { + } else if (cmp > 0) { low = mid + 1; } else { high = mid - 1; } } - if (RB_UNLIKELY(memchr(str, '\\', length))) { - // We assume the overwhelming majority of names don't need to be escaped. - // But if they do, we have to fallback to the slow path. 
- return Qfalse; - } - VALUE rstring = build_interned_string(str, length); if (cache->length < JSON_RVALUE_CACHE_CAPA) { - if (last_cmp > 0) { - mid += 1; - } - - rvalue_cache_insert_at(cache, mid, rstring); + rvalue_cache_insert_at(cache, low, rstring); } return rstring; } static VALUE rsymbol_cache_fetch(rvalue_cache *cache, const char *str, const long length) { - if (RB_UNLIKELY(length > JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH)) { - // Common names aren't likely to be very long. So we just don't - // cache names above an arbitrary threshold. - return Qfalse; - } - - if (RB_UNLIKELY(!rb_isalpha((unsigned char)str[0]))) { - // Simple heuristic, if the first character isn't a letter, - // we're much less likely to see this string again. - // We mostly want to cache strings that are likely to be repeated. - return Qfalse; - } - int low = 0; int high = cache->length - 1; - int mid = 0; - int last_cmp = 0; while (low <= high) { - mid = (high + low) >> 1; + int mid = (high + low) >> 1; VALUE entry = cache->entries[mid]; - last_cmp = rstring_cache_cmp(str, length, rb_sym2str(entry)); + int cmp = rstring_cache_cmp(str, length, rb_sym2str(entry)); - if (last_cmp == 0) { + if (cmp == 0) { return entry; - } else if (last_cmp > 0) { + } else if (cmp > 0) { low = mid + 1; } else { high = mid - 1; } } - if (RB_UNLIKELY(memchr(str, '\\', length))) { - // We assume the overwhelming majority of names don't need to be escaped. - // But if they do, we have to fallback to the slow path. 
- return Qfalse; - } - VALUE rsymbol = build_symbol(str, length); if (cache->length < JSON_RVALUE_CACHE_CAPA) { - if (last_cmp > 0) { - mid += 1; - } - - rvalue_cache_insert_at(cache, mid, rsymbol); + rvalue_cache_insert_at(cache, low, rsymbol); } return rsymbol; } @@ -663,11 +580,20 @@ static inline VALUE build_string(const char *start, const char *end, bool intern return result; } +static inline bool json_string_cacheable_p(const char *string, size_t length) +{ + // We mostly want to cache strings that are likely to be repeated. + // Simple heuristics: + // - Common names aren't likely to be very long. So we just don't cache names above an arbitrary threshold. + // - If the first character isn't a letter, we're much less likely to see this string again. + return length <= JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH && rb_isalpha(string[0]); +} + static inline VALUE json_string_fastpath(JSON_ParserState *state, const char *string, const char *stringEnd, bool is_name, bool intern, bool symbolize) { size_t bufferSize = stringEnd - string; - if (is_name && state->in_array) { + if (is_name && state->in_array && RB_LIKELY(json_string_cacheable_p(string, bufferSize))) { VALUE cached_key; if (RB_UNLIKELY(symbolize)) { cached_key = rsymbol_cache_fetch(&state->name_cache, string, bufferSize); @@ -691,19 +617,6 @@ static VALUE json_string_unescape(JSON_ParserState *state, const char *string, c int unescape_len; char buf[4]; - if (is_name && state->in_array) { - VALUE cached_key; - if (RB_UNLIKELY(symbolize)) { - cached_key = rsymbol_cache_fetch(&state->name_cache, string, bufferSize); - } else { - cached_key = rstring_cache_fetch(&state->name_cache, string, bufferSize); - } - - if (RB_LIKELY(cached_key)) { - return cached_key; - } - } - VALUE result = rb_str_buf_new(bufferSize); rb_enc_associate_index(result, utf8_encindex); buffer = RSTRING_PTR(result); @@ -796,7 +709,7 @@ static VALUE json_string_unescape(JSON_ParserState *state, const char *string, c if (symbolize) { result = 
rb_str_intern(result); } else if (intern) { - result = rb_funcall(rb_str_freeze(result), i_uminus, 0); + result = rb_str_to_interned_str(result); } return result; @@ -985,17 +898,11 @@ static const bool string_scan_table[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; -#if (defined(__GNUC__ ) || defined(__clang__)) -#define FORCE_INLINE __attribute__((always_inline)) -#else -#define FORCE_INLINE -#endif - #ifdef HAVE_SIMD static SIMD_Implementation simd_impl = SIMD_NONE; #endif /* HAVE_SIMD */ -static inline bool FORCE_INLINE string_scan(JSON_ParserState *state) +static ALWAYS_INLINE() bool string_scan(JSON_ParserState *state) { #ifdef HAVE_SIMD #if defined(HAVE_SIMD_NEON) @@ -1409,6 +1316,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) } raise_parse_error("unreachable: %s", state); + return Qundef; } static void json_ensure_eof(JSON_ParserState *state) diff --git a/ext/json/simd/simd.h b/ext/json/simd/simd.h index 0abe4fad658813..194baee51c3475 100644 --- a/ext/json/simd/simd.h +++ b/ext/json/simd/simd.h @@ -1,6 +1,4 @@ -#ifdef JSON_DEBUG -#include <assert.h> -#endif +#include "../json.h" typedef enum { SIMD_NONE, @@ -22,9 +20,7 @@ typedef enum { static inline uint32_t trailing_zeros64(uint64_t input) { -#ifdef JSON_DEBUG - assert(input > 0); // __builtin_ctz(0) is undefined behavior -#endif + JSON_ASSERT(input > 0); // __builtin_ctz(0) is undefined behavior #if HAVE_BUILTIN_CTZLL return __builtin_ctzll(input); @@ -41,9 +37,7 @@ static inline uint32_t trailing_zeros64(uint64_t input) static inline int trailing_zeros(int input) { -#ifdef JSON_DEBUG - assert(input > 0); // __builtin_ctz(0) is undefined behavior -#endif + JSON_ASSERT(input > 0); // __builtin_ctz(0) is undefined behavior #if HAVE_BUILTIN_CTZLL return __builtin_ctz(input); @@ -58,12 +52,6 @@ static inline int trailing_zeros(int input) #endif } -#if (defined(__GNUC__ ) || defined(__clang__)) -#define FORCE_INLINE __attribute__((always_inline)) -#else -#define 
FORCE_INLINE -#endif - #ifdef JSON_ENABLE_SIMD #define SIMD_MINIMUM_THRESHOLD 6 @@ -81,14 +69,14 @@ static inline SIMD_Implementation find_simd_implementation(void) #define HAVE_SIMD_NEON 1 // See: https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon -static inline FORCE_INLINE uint64_t neon_match_mask(uint8x16_t matches) +static ALWAYS_INLINE() uint64_t neon_match_mask(uint8x16_t matches) { const uint8x8_t res = vshrn_n_u16(vreinterpretq_u16_u8(matches), 4); const uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(res), 0); return mask & 0x8888888888888888ull; } -static inline FORCE_INLINE uint64_t compute_chunk_mask_neon(const char *ptr) +static ALWAYS_INLINE() uint64_t compute_chunk_mask_neon(const char *ptr) { uint8x16_t chunk = vld1q_u8((const unsigned char *)ptr); @@ -101,7 +89,7 @@ static inline FORCE_INLINE uint64_t compute_chunk_mask_neon(const char *ptr) return neon_match_mask(needs_escape); } -static inline FORCE_INLINE int string_scan_simd_neon(const char **ptr, const char *end, uint64_t *mask) +static ALWAYS_INLINE() int string_scan_simd_neon(const char **ptr, const char *end, uint64_t *mask) { while (*ptr + sizeof(uint8x16_t) <= end) { uint64_t chunk_mask = compute_chunk_mask_neon(*ptr); @@ -148,7 +136,7 @@ static inline uint8x16x4_t load_uint8x16_4(const unsigned char *table) #define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1)) #define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a) -static inline TARGET_SSE2 FORCE_INLINE int compute_chunk_mask_sse2(const char *ptr) +static inline TARGET_SSE2 ALWAYS_INLINE() int compute_chunk_mask_sse2(const char *ptr) { __m128i chunk = _mm_loadu_si128((__m128i const*)ptr); // Trick: c < 32 || c == 34 can be factored as c ^ 2 < 33 @@ -159,7 +147,7 @@ static inline TARGET_SSE2 FORCE_INLINE int compute_chunk_mask_sse2(const char *p return _mm_movemask_epi8(needs_escape); } -static inline TARGET_SSE2 
FORCE_INLINE int string_scan_simd_sse2(const char **ptr, const char *end, int *mask) +static inline TARGET_SSE2 ALWAYS_INLINE() int string_scan_simd_sse2(const char **ptr, const char *end, int *mask) { while (*ptr + sizeof(__m128i) <= end) { int chunk_mask = compute_chunk_mask_sse2(*ptr); diff --git a/test/json/json_parser_test.rb b/test/json/json_parser_test.rb index bab16a6fc21dbc..6315c3e667be85 100644 --- a/test/json/json_parser_test.rb +++ b/test/json/json_parser_test.rb @@ -344,6 +344,18 @@ def test_parse_big_integers assert_equal orig, parse(json5) end + def test_parse_escaped_key + doc = { + "test\r1" => 1, + "entries" => [ + "test\t2" => 2, + "test\n3" => 3, + ] + } + + assert_equal doc, parse(JSON.generate(doc)) + end + def test_parse_duplicate_key expected = {"a" => 2} expected_sym = {a: 2}