diff --git a/Makefile.am b/Makefile.am index 1f9db1e8..3665e774 100644 --- a/Makefile.am +++ b/Makefile.am @@ -50,6 +50,8 @@ lib_LTLIBRARIES = libgumbo.la libgumbo_la_CFLAGS = -Wall libgumbo_la_LDFLAGS = -version-info 1:0:0 -no-undefined libgumbo_la_SOURCES = \ + src/arena.c \ + src/arena.h \ src/attribute.c \ src/attribute.h \ src/char_ref.c \ diff --git a/benchmarks/benchmark.cc b/benchmarks/benchmark.cc index 9c2c1c86..e3da8c34 100644 --- a/benchmarks/benchmark.cc +++ b/benchmarks/benchmark.cc @@ -25,7 +25,7 @@ #include "gumbo.h" -static const int kNumReps = 10; +static const int kNumReps = 200; int main(int argc, char** argv) { if (argc != 1) { diff --git a/configure.ac b/configure.ac index e2d4e9f2..8f12ce06 100644 --- a/configure.ac +++ b/configure.ac @@ -2,7 +2,7 @@ # Process this file with autoconf to produce a configure script. AC_PREREQ([2.65]) -AC_INIT([gumbo], [0.9.2], [jonathan.d.tang@gmail.com]) +AC_INIT([gumbo], [1.0.0], [jonathan.d.tang@gmail.com]) AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_SRCDIR([src/parser.c]) #AC_CONFIG_HEADERS([config.h]) diff --git a/setup.py b/setup.py index 1c20cef5..67240c41 100644 --- a/setup.py +++ b/setup.py @@ -169,7 +169,7 @@ def run(self): ] setup(name='gumbo', - version='0.9.2', + version='0.9.4', description='Python bindings for Gumbo HTML parser', long_description=README, url='http://github.com/google/gumbo-parser', diff --git a/src/arena.c b/src/arena.c new file mode 100644 index 00000000..fedff8d7 --- /dev/null +++ b/src/arena.c @@ -0,0 +1,105 @@ +// Copyright 2015 Jonathan Tang. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: jonathan.d.tang@gmail.com (Jonathan Tang) + +#include "arena.h" + +#include <assert.h> +#include <stdlib.h> + +#include "util.h" + +unsigned int gChunksAllocated; + +// Alignment of each returned allocation block. We make sure everything is +// pointer-aligned. +#define ARENA_ALIGNMENT (sizeof(void*)) + +// Size of a single arena chunk. Most recent Intel CPUs have a 256K L2 cache +// on-core, so we try to size a chunk to fit in that with a little extra room +// for the stack. Measurements on a corpus of ~60K webpages indicate that +// ... +#define ARENA_CHUNK_SIZE 240000 + +typedef struct GumboInternalArenaChunk { + struct GumboInternalArenaChunk* next; + char data[ARENA_CHUNK_SIZE]; +} GumboArenaChunk; + +void arena_init(GumboArena* arena) { + assert(arena != NULL); + arena->head = malloc(sizeof(GumboArenaChunk)); + arena->head->next = NULL; + arena->allocation_ptr = arena->head->data; + gumbo_debug("Initializing arena @%x\n", arena->head); + gChunksAllocated = 1; +} + +void arena_destroy(GumboArena* arena) { + GumboArenaChunk* chunk = arena->head; + while (chunk) { + gumbo_debug("Freeing arena chunk @%x\n", chunk); + GumboArenaChunk* to_free = chunk; + chunk = chunk->next; + free(to_free); + } +} + +static void* allocate_new_chunk(GumboArena* arena, size_t size) { + GumboArenaChunk* new_chunk = malloc(size); + gumbo_debug("Allocating new arena chunk of size %d @%x\n", size, new_chunk); + if (!new_chunk) { + gumbo_debug("Malloc failed.\n"); + return NULL; + } + ++gChunksAllocated; + new_chunk->next = arena->head; + arena->head = new_chunk; + return 
new_chunk->data; +} + +void* arena_malloc(GumboArena* arena, size_t size) { + size_t aligned_size = (size + ARENA_ALIGNMENT - 1) & ~(ARENA_ALIGNMENT - 1); + if (arena->allocation_ptr >= + arena->head->data + ARENA_CHUNK_SIZE - aligned_size) { + if (size > ARENA_CHUNK_SIZE) { + // Big block requested; we allocate a chunk of memory of the requested + // size, add it to the list, and then immediately allocate another one. + gumbo_debug( + "Allocation size %d exceeds chunk size %d", size, ARENA_CHUNK_SIZE); + size_t total_chunk_size = + size + sizeof(GumboArenaChunk) - ARENA_CHUNK_SIZE; + void* result = allocate_new_chunk(arena, total_chunk_size); + arena->allocation_ptr = + allocate_new_chunk(arena, sizeof(GumboArenaChunk)); + return result; + } + // Normal operation: allocate the default arena chunk size. + arena->allocation_ptr = allocate_new_chunk(arena, sizeof(GumboArenaChunk)); + } + void* obj = arena->allocation_ptr; + arena->allocation_ptr += aligned_size; + assert(arena->allocation_ptr <= arena->head->data + ARENA_CHUNK_SIZE); + return obj; +} + +unsigned int gumbo_arena_chunks_allocated() { + return gChunksAllocated; +} + +void arena_free(void* userdata, void* obj) { + // No-op. +} + diff --git a/src/arena.h b/src/arena.h new file mode 100644 index 00000000..5238b5be --- /dev/null +++ b/src/arena.h @@ -0,0 +1,43 @@ +// Copyright 2015 Jonathan Tang. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// Author: jonathan.d.tang@gmail.com (Jonathan Tang) + +#ifndef GUMBO_ARENA_H_ +#define GUMBO_ARENA_H_ + +#include "gumbo.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Initialize an arena, allocating the first chunk for it. +void arena_init(GumboArena* arena); + +// Destroy an arena, freeing all memory used by it and all objects contained. +void arena_destroy(GumboArena* arena); + +// Allocate an object in an arena; chunks have a fixed compile-time size +// (ARENA_CHUNK_SIZE). Returns NULL if the system malloc fails. +void* arena_malloc(GumboArena* arena, size_t size); + +// No-op free function for use as a custom allocator. +void arena_free(void* arena, void* obj); + +#ifdef __cplusplus +} +#endif + +#endif // GUMBO_ARENA_H_ diff --git a/src/error.c b/src/error.c index 0cae4639..563dfde2 100644 --- a/src/error.c +++ b/src/error.c @@ -35,10 +35,11 @@ static const size_t kMessageBufferSize = 256; static int print_message(GumboParser* parser, GumboStringBuffer* output, const char* format, ...) { va_list args; - va_start(args, format); int remaining_capacity = output->capacity - output->length; + va_start(args, format); int bytes_written = vsnprintf(output->data + output->length, remaining_capacity, format, args); + va_end(args); #ifdef _MSC_VER if (bytes_written == -1) { // vsnprintf returns -1 on MSVC++ if there's not enough capacity, instead of @@ -47,6 +48,7 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output, // we retry (letting it fail and returning 0 if it doesn't), since there's // no way to smartly resize the buffer. gumbo_string_buffer_reserve(parser, output->capacity * 2, output); + va_start(args, format); int result = vsnprintf(output->data + output->length, remaining_capacity, format, args); va_end(args); @@ -55,7 +57,6 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output, #else // -1 in standard C99 indicates an encoding error. Return 0 and do nothing. 
if (bytes_written == -1) { - va_end(args); return 0; } #endif @@ -64,11 +65,12 @@ static int print_message(GumboParser* parser, GumboStringBuffer* output, gumbo_string_buffer_reserve( parser, output->capacity + bytes_written, output); remaining_capacity = output->capacity - output->length; + va_start(args, format); bytes_written = vsnprintf(output->data + output->length, remaining_capacity, format, args); + va_end(args); } output->length += bytes_written; - va_end(args); return bytes_written; } diff --git a/src/gumbo.h b/src/gumbo.h index cbc2b0f0..92094836 100644 --- a/src/gumbo.h +++ b/src/gumbo.h @@ -576,18 +576,6 @@ typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr); * Use kGumboDefaultOptions for sensible defaults, and only set what you need. */ typedef struct GumboInternalOptions { - /** A memory allocator function. Default: malloc. */ - GumboAllocatorFunction allocator; - - /** A memory deallocator function. Default: free. */ - GumboDeallocatorFunction deallocator; - - /** - * An opaque object that's passed in as the first argument to all callbacks - * used by this library. Default: NULL. - */ - void* userdata; - /** * The tab-stop size, for computing positions in source code that uses tabs. * Default: 8. @@ -613,6 +601,16 @@ typedef struct GumboInternalOptions { /** Default options struct; use this with gumbo_parse_with_options. */ extern const GumboOptions kGumboDefaultOptions; +/** Base struct for an arena. */ +struct GumboInternalArenaChunk; + +typedef struct GumboInternalArena { + struct GumboInternalArenaChunk* head; + char* allocation_ptr; +} GumboArena; + +unsigned int gumbo_arena_chunks_allocated(); + /** The output struct containing the results of the parse. */ typedef struct GumboInternalOutput { /** @@ -635,6 +633,26 @@ typedef struct GumboInternalOutput { * reported so we can work out something appropriate for your use-case. */ GumboVector /* GumboError */ errors; + + /** + * Arena for default memory allocation. 
This is initialized on parse start + * when using the default memory allocator; it consumes little memory (a + * couple pointers) when a custom memory allocator is supplied. + */ + GumboArena arena; + + /** + * Flag set if an out-of-memory condition occurs. This can either be because + * a stringbuffer or vector requested a single chunk larger than the arena + * chunk size, or because the system malloc failed. (The latter is not + * implemented yet - on most modern OSes, malloc never returns NULL and + * instead overcommits virtual memory.) Gumbo makes its best effort to + * recover from OOM errors: if the reason was that a buffer exceeded maximum + * chunk size, it truncates that buffer at the maximum chunk size, refuses to + * write to it anymore, and continues parsing. If the system malloc fails, it + * returns the parse tree it's parsed up until that point. + */ + bool out_of_memory; } GumboOutput; /** diff --git a/src/parser.c b/src/parser.c index 8ca85933..85becb07 100644 --- a/src/parser.c +++ b/src/parser.c @@ -21,6 +21,7 @@ #include #include +#include "arena.h" #include "attribute.h" #include "error.h" #include "gumbo.h" @@ -56,18 +57,7 @@ static bool handle_in_template(GumboParser*, GumboToken*); static GumboNode* destroy_node(GumboParser*, GumboNode*); -static void* malloc_wrapper(void* unused, size_t size) { - return malloc(size); -} - -static void free_wrapper(void* unused, void* ptr) { - free(ptr); -} - const GumboOptions kGumboDefaultOptions = { - &malloc_wrapper, - &free_wrapper, - NULL, 8, false, -1, @@ -501,11 +491,12 @@ static GumboNode* new_document_node(GumboParser* parser) { return document_node; } -static void output_init(GumboParser* parser) { - GumboOutput* output = gumbo_parser_allocate(parser, sizeof(GumboOutput)); +static void output_init(GumboParser* parser, GumboOutput* output) { output->root = NULL; output->document = new_document_node(parser); - parser->_output = output; + // Arena is initialized before this is called, so we have 
memory to initialize + // the parser state. + output->out_of_memory = false; gumbo_init_errors(parser); } @@ -938,8 +929,7 @@ static void maybe_flush_text_node_buffer(GumboParser* parser) { insert_node(parser, text_node, location); } - gumbo_string_buffer_destroy(parser, &buffer_state->_buffer); - gumbo_string_buffer_init(parser, &buffer_state->_buffer); + gumbo_string_buffer_clear(parser, &buffer_state->_buffer); buffer_state->_type = GUMBO_NODE_WHITESPACE; assert(buffer_state->_buffer.length == 0); } @@ -4056,10 +4046,16 @@ GumboOutput* gumbo_parse_fragment( const GumboTag fragment_ctx, const GumboNamespaceEnum fragment_namespace) { GumboParser parser; parser._options = options; + // Must come first, since all the other init functions allocate memory. The + // arena is stored in the GumboOutput structure, so that must be allocated + // manually. + parser._output = malloc(sizeof(GumboOutput)); + arena_init(&parser._output->arena); + // Next initialize the parser state. parser_state_init(&parser); // Must come after parser_state_init, since creating the document node must // reference parser_state->_current_node. - output_init(&parser); + output_init(&parser, parser._output); // And this must come after output_init, because initializing the tokenizer // reads the first character and that may cause a UTF-8 decode error // (inserting into output->errors) if that's invalid. @@ -4079,6 +4075,11 @@ GumboOutput* gumbo_parse_fragment( GumboToken token; bool has_error = false; + if (setjmp(parser._out_of_memory_jmp)) { + parser._output->out_of_memory = true; + return parser._output; + } + do { if (state->_reprocess_current_token) { state->_reprocess_current_token = false; @@ -4156,18 +4157,6 @@ GumboOutput* gumbo_parse_fragment( } void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output) { - // Need a dummy GumboParser because the allocator comes along with the - // options object. 
- GumboParser parser; - parser._parser_state = NULL; - parser._options = options; - GumboNode* current = output->document; - while (current) { - current = destroy_node(&parser, current); - } - for (int i = 0; i < output->errors.length; ++i) { - gumbo_error_destroy(&parser, output->errors.data[i]); - } - gumbo_vector_destroy(&parser, &output->errors); - gumbo_parser_deallocate(&parser, output); + arena_destroy(&output->arena); + free(output); } diff --git a/src/parser.h b/src/parser.h index 95019e3e..7ecd4e34 100644 --- a/src/parser.h +++ b/src/parser.h @@ -20,6 +20,8 @@ #ifndef GUMBO_PARSER_H_ #define GUMBO_PARSER_H_ +#include <setjmp.h> + #ifdef __cplusplus extern "C" { #endif @@ -48,6 +50,10 @@ typedef struct GumboInternalParser { // The internal parser state. Initialized on parse start and destroyed on // parse end; end-users will never see a non-garbage value in this pointer. struct GumboInternalParserState* _parser_state; + + // A jmp_buf to use in case of out-of-memory conditions. This jumps back to + // gumbo_parse_fragment, which sets the out_of_memory flag and returns. + jmp_buf _out_of_memory_jmp; } GumboParser; #ifdef __cplusplus diff --git a/src/string_buffer.c b/src/string_buffer.c index f7d9712f..941bcd1e 100644 --- a/src/string_buffer.c +++ b/src/string_buffer.c @@ -26,7 +26,11 @@ struct GumboInternalParser; -static const size_t kDefaultStringBufferSize = 10; +// Size chosen via statistical analysis of ~60K websites. +// 99% of text nodes and 98% of attribute names/values fit within 5 characters. +// Since the arena allocator only ever returns word-aligned chunks, however, it +// makes no sense to use less than 8 chars. 
+static const size_t kDefaultStringBufferSize = 8; static void maybe_resize_string_buffer( struct GumboInternalParser* parser, size_t additional_chars, @@ -94,12 +98,25 @@ void gumbo_string_buffer_append_string( char* gumbo_string_buffer_to_string( struct GumboInternalParser* parser, GumboStringBuffer* input) { - char* buffer = gumbo_parser_allocate(parser, input->length + 1); - memcpy(buffer, input->data, input->length); + maybe_resize_string_buffer(parser, input->length + 1, input); + char* buffer = input->data; buffer[input->length] = '\0'; + gumbo_string_buffer_init(parser, input); return buffer; } +void gumbo_string_buffer_clear( + struct GumboInternalParser* parser, GumboStringBuffer* input) { + input->length = 0; + if (input->capacity > kDefaultStringBufferSize * 8) { + // This approach to clearing means that the buffer can grow unbounded and + // tie up memory that may be needed for parsing the rest of the document, so + // we free and reinitialize the buffer if it's grown more than 3 doublings. + gumbo_string_buffer_destroy(parser, input); + gumbo_string_buffer_init(parser, input); + } +} + void gumbo_string_buffer_destroy( struct GumboInternalParser* parser, GumboStringBuffer* buffer) { gumbo_parser_deallocate(parser, buffer->data); diff --git a/src/string_buffer.h b/src/string_buffer.h index 4ddff8a9..6213ccb2 100644 --- a/src/string_buffer.h +++ b/src/string_buffer.h @@ -50,7 +50,8 @@ void gumbo_string_buffer_init( // Ensures that the buffer contains at least a certain amount of space. Most // useful with snprintf and the other length-delimited string functions, which -// may want to write directly into the buffer. +// may want to write directly into the buffer. NOTE(review): declared void, so an +// allocation failure cannot be reported through the return value - confirm. 
void gumbo_string_buffer_reserve( struct GumboInternalParser* parser, size_t min_capacity, GumboStringBuffer* output); @@ -70,6 +71,11 @@ void gumbo_string_buffer_append_string( char* gumbo_string_buffer_to_string( struct GumboInternalParser* parser, GumboStringBuffer* input); +// Reinitialize this string buffer. This clears it by setting length=0. It +// does not zero out the buffer itself. +void gumbo_string_buffer_clear( + struct GumboInternalParser* parser, GumboStringBuffer* input); + // Deallocates this GumboStringBuffer. void gumbo_string_buffer_destroy( struct GumboInternalParser* parser, GumboStringBuffer* buffer); diff --git a/src/tokenizer.c b/src/tokenizer.c index 8c0b4db4..89bbb499 100644 --- a/src/tokenizer.c +++ b/src/tokenizer.c @@ -356,12 +356,10 @@ static void clear_temporary_buffer(GumboParser* parser) { GumboTokenizerState* tokenizer = parser->_tokenizer_state; assert(!tokenizer->_temporary_buffer_emit); utf8iterator_mark(&tokenizer->_input); - gumbo_string_buffer_destroy(parser, &tokenizer->_temporary_buffer); - gumbo_string_buffer_init(parser, &tokenizer->_temporary_buffer); + gumbo_string_buffer_clear(parser, &tokenizer->_temporary_buffer); // The temporary buffer and script data buffer are the same object in the // spec, so the script data buffer should be cleared as well. - gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer); - gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer); + gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer); } // Appends a codepoint to the temporary buffer. @@ -697,7 +695,11 @@ static void start_new_tag(GumboParser* parser, bool is_start_tag) { gumbo_string_buffer_append_codepoint(parser, c, &tag_state->_buffer); assert(tag_state->_attributes.data == NULL); - gumbo_vector_init(parser, 4, &tag_state->_attributes); + // Initial size chosen by statistical analysis of a corpus of 60k webpages. + // 99.5% of elements have 0 attributes, 93% of the remainder have 1. 
These + // numbers are a bit higher for more modern websites (eg. ~45% = 0, ~40% = 1 + // for the HTML5 Spec), but still have basically 99% of nodes with <= 2 attrs. + gumbo_vector_init(parser, 1, &tag_state->_attributes); tag_state->_drop_next_attr_value = false; tag_state->_is_start_tag = is_start_tag; tag_state->_is_self_closing = false; @@ -1591,8 +1593,7 @@ static StateResult handle_script_double_escaped_lt_state( int c, GumboToken* output) { if (c == '/') { gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END); - gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer); - gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer); + gumbo_string_buffer_clear(parser, &tokenizer->_script_data_buffer); return emit_current_char(parser, output); } else { gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED); diff --git a/src/util.c b/src/util.c index a3dafd79..1bc89f91 100644 --- a/src/util.c +++ b/src/util.c @@ -17,12 +17,14 @@ #include "util.h" #include +#include #include #include #include #include #include +#include "arena.h" #include "gumbo.h" #include "parser.h" @@ -32,12 +34,16 @@ const GumboSourcePosition kGumboEmptySourcePosition = { 0, 0, 0 }; void* gumbo_parser_allocate(GumboParser* parser, size_t num_bytes) { - return parser->_options->allocator(parser->_options->userdata, num_bytes); + void* result = arena_malloc( + &parser->_output->arena, + num_bytes); + if (result == NULL) { + longjmp(parser->_out_of_memory_jmp, num_bytes); + } + return result; } -void gumbo_parser_deallocate(GumboParser* parser, void* ptr) { - parser->_options->deallocator(parser->_options->userdata, ptr); -} +void gumbo_parser_deallocate(GumboParser* parser, void* ptr) {} char* gumbo_copy_stringz(GumboParser* parser, const char* str) { char* buffer = gumbo_parser_allocate(parser, strlen(str) + 1); @@ -45,6 +51,10 @@ char* gumbo_copy_stringz(GumboParser* parser, const char* str) { return buffer; } +void 
gumbo_set_out_of_memory(GumboParser* parser) { + parser->_output->out_of_memory = true; +} + // Debug function to trace operation of the parser. Pass --copts=-DGUMBO_DEBUG // to use. void gumbo_debug(const char* format, ...) { diff --git a/src/util.h b/src/util.h index 28b6905b..4817f77f 100644 --- a/src/util.h +++ b/src/util.h @@ -51,6 +51,9 @@ void* gumbo_parser_allocate( // config options. void gumbo_parser_deallocate(struct GumboInternalParser* parser, void* ptr); +// Sets the out-of-memory flag on the output. +void gumbo_set_out_of_memory(struct GumboInternalParser* parser); + // Debug wrapper for printf, to make it easier to turn off debugging info when // required. void gumbo_debug(const char* format, ...); diff --git a/tests/string_buffer.cc b/tests/string_buffer.cc index b8966cf8..daf0954b 100644 --- a/tests/string_buffer.cc +++ b/tests/string_buffer.cc @@ -47,7 +47,7 @@ class GumboStringBufferTest : public GumboTest { TEST_F(GumboStringBufferTest, Reserve) { gumbo_string_buffer_reserve(&parser_, 21, &buffer_); - EXPECT_EQ(40, buffer_.capacity); + EXPECT_EQ(32, buffer_.capacity); strcpy(buffer_.data, "01234567890123456789"); buffer_.length = 20; NullTerminateBuffer(); @@ -95,6 +95,7 @@ TEST_F(GumboStringBufferTest, AppendCodepoint_4Bytes) { } TEST_F(GumboStringBufferTest, ToString) { + gumbo_string_buffer_reserve(&parser_, 8, &buffer_); strcpy(buffer_.data, "012345"); buffer_.length = 7; diff --git a/tests/string_piece.cc b/tests/string_piece.cc index 965ee5aa..4040bfdc 100644 --- a/tests/string_piece.cc +++ b/tests/string_piece.cc @@ -74,13 +74,12 @@ TEST_F(GumboStringPieceTest, CaseNotEqual_Str2Shorter) { } TEST_F(GumboStringPieceTest, Copy) { - GumboParser parser; - parser._options = &kGumboDefaultOptions; + GumboParser* parser = &parser_; INIT_GUMBO_STRING(str1, "bar"); GumboStringPiece str2; - gumbo_string_copy(&parser, &str2, &str1); + gumbo_string_copy(parser, &str2, &str1); EXPECT_TRUE(gumbo_string_equals(&str1, &str2)); - 
gumbo_parser_deallocate(&parser, (void*) str2.data); + gumbo_parser_deallocate(parser, (void*) str2.data); } } // namespace diff --git a/tests/test_utils.cc b/tests/test_utils.cc index 7fc47711..f57bd73c 100644 --- a/tests/test_utils.cc +++ b/tests/test_utils.cc @@ -16,6 +16,7 @@ #include "test_utils.h" +#include "arena.h" #include "error.h" #include "util.h" @@ -142,54 +143,14 @@ void SanityCheckPointers(const char* input, size_t input_length, } } -// Custom allocator machinery to sanity check for memory leaks. Normally we can -// use heapcheck/valgrind/ASAN for this, but they only give the -// results when the program terminates. This means that if the parser is run in -// a loop (say, a MapReduce) and there's a leak, it may end up exhausting memory -// before it can catch the particular document responsible for the leak. These -// allocators let us check each document individually for leaks. - -static void* LeakDetectingMalloc(void* userdata, size_t size) { - MallocStats* stats = static_cast<MallocStats*>(userdata); - stats->bytes_allocated += size; - ++stats->objects_allocated; - // Arbitrary limit of 2G on allocation; parsing any reasonable document - // shouldn't take more than that. 
- assert(stats->bytes_allocated < (1 << 31)); - void* obj = malloc(size); - // gumbo_debug("Allocated %u bytes at %x.\n", size, obj); - return obj; -} - -static void LeakDetectingFree(void* userdata, void* ptr) { - MallocStats* stats = static_cast<MallocStats*>(userdata); - if (ptr) { - ++stats->objects_freed; - // gumbo_debug("Freed %x.\n"); - free(ptr); - } -} - -void InitLeakDetection(GumboOptions* options, MallocStats* stats) { - stats->bytes_allocated = 0; - stats->objects_allocated = 0; - stats->objects_freed = 0; - - options->allocator = LeakDetectingMalloc; - options->deallocator = LeakDetectingFree; - options->userdata = stats; -} - - GumboTest::GumboTest() : options_(kGumboDefaultOptions), errors_are_expected_(false), text_("") { - InitLeakDetection(&options_, &malloc_stats_); options_.max_errors = 100; parser_._options = &options_; - parser_._output = static_cast<GumboOutput*>( - gumbo_parser_allocate(&parser_, sizeof(GumboOutput))); + parser_._output = static_cast<GumboOutput*>(calloc(1, sizeof(GumboOutput))); + arena_init(&parser_._output->arena); gumbo_init_errors(&parser_); } @@ -204,7 +165,5 @@ GumboTest::~GumboTest() { parser_._output->errors.data[i]), text_); } } - gumbo_destroy_errors(&parser_); - gumbo_parser_deallocate(&parser_, parser_._output); - EXPECT_EQ(malloc_stats_.objects_allocated, malloc_stats_.objects_freed); + gumbo_destroy_output(parser_._options, parser_._output); }