diff --git a/cmake/os/Windows.cmake b/cmake/os/Windows.cmake index cdb16cde46830..4e51bc6df00c8 100644 --- a/cmake/os/Windows.cmake +++ b/cmake/os/Windows.cmake @@ -222,5 +222,3 @@ include(${_SCRIPT_DIR}/WindowsCache.cmake) # this is out of place, not really a system check set(FN_NO_CASE_SENSE 1) set(USE_SYMDIR 1) -set(HAVE_UNACCESSIBLE_AFTER_MEM_DECOMMIT 1) - diff --git a/config.h.cmake b/config.h.cmake index 90d1ed86a87f6..a757341ffaa3f 100644 --- a/config.h.cmake +++ b/config.h.cmake @@ -446,11 +446,6 @@ /* This should mean case insensitive file system */ #cmakedefine FN_NO_CASE_SENSE 1 -/* Whether an anonymous private mapping is unaccessible after -madvise(MADV_DONTNEED) or madvise(MADV_FREE) or similar has been invoked; -this is the case with Microsoft Windows VirtualFree(MEM_DECOMMIT) */ -#cmakedefine HAVE_UNACCESSIBLE_AFTER_MEM_DECOMMIT 1 - #cmakedefine HAVE_CHARSET_armscii8 1 #cmakedefine HAVE_CHARSET_ascii 1 #cmakedefine HAVE_CHARSET_big5 1 diff --git a/include/my_virtual_mem.h b/include/my_virtual_mem.h index 8f9f6660e3b6d..a13c506382c9c 100644 --- a/include/my_virtual_mem.h +++ b/include/my_virtual_mem.h @@ -24,10 +24,15 @@ extern "C" { #endif -# ifdef _WIN32 +#ifdef _WIN32 char *my_virtual_mem_reserve(size_t *size); -# endif +#endif +#ifdef __linux__ +char *my_virtual_mem_commit(char *ptr, size_t size, my_bool commit); +#else char *my_virtual_mem_commit(char *ptr, size_t size); +# define my_virtual_mem_commit(ptr, size, c) my_virtual_mem_commit(ptr, size) +#endif void my_virtual_mem_decommit(char *ptr, size_t size); void my_virtual_mem_release(char *ptr, size_t size); diff --git a/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff b/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff index ddf9ea941f099..513bc5e90e29c 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff +++ b/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff @@ -68,21 +68,6 @@ ENUM_VALUE_LIST NULL READ_ONLY NO COMMAND_LINE_ARGUMENT REQUIRED -@@ -227,11 +227,11 
@@ - SESSION_VALUE NULL - DEFAULT_VALUE 0 - VARIABLE_SCOPE GLOBAL --VARIABLE_TYPE BIGINT UNSIGNED -+VARIABLE_TYPE INT UNSIGNED - VARIABLE_COMMENT Maximum innodb_buffer_pool_size - NUMERIC_MIN_VALUE 0 --NUMERIC_MAX_VALUE 18446744073701163008 --NUMERIC_BLOCK_SIZE 8388608 -+NUMERIC_MAX_VALUE 4292870144 -+NUMERIC_BLOCK_SIZE 2097152 - ENUM_VALUE_LIST NULL - READ_ONLY YES - COMMAND_LINE_ARGUMENT REQUIRED @@ -239,7 +239,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 diff --git a/mysql-test/suite/sys_vars/r/sysvars_innodb.result b/mysql-test/suite/sys_vars/r/sysvars_innodb.result index 4f0ac6a53897b..5a6675fe4fbf1 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_innodb.result +++ b/mysql-test/suite/sys_vars/r/sysvars_innodb.result @@ -4,6 +4,8 @@ variable_name not in ( 'innodb_numa_interleave', # only available WITH_NUMA 'innodb_evict_tables_on_commit_debug', # one may want to override this 'innodb_use_native_aio', # default value depends on OS +'innodb_buffer_pool_commit', # only available on Linux +'innodb_buffer_pool_size_max', # default value depends on OS 'innodb_log_file_buffering', # only available on Linux and Windows 'innodb_linux_aio', # existence depends on OS 'innodb_buffer_pool_load_pages_abort') # debug build only, and is only for testing @@ -224,18 +226,6 @@ NUMERIC_BLOCK_SIZE 8388608 ENUM_VALUE_LIST NULL READ_ONLY NO COMMAND_LINE_ARGUMENT REQUIRED -VARIABLE_NAME INNODB_BUFFER_POOL_SIZE_MAX -SESSION_VALUE NULL -DEFAULT_VALUE 0 -VARIABLE_SCOPE GLOBAL -VARIABLE_TYPE BIGINT UNSIGNED -VARIABLE_COMMENT Maximum innodb_buffer_pool_size -NUMERIC_MIN_VALUE 0 -NUMERIC_MAX_VALUE 18446744073701163008 -NUMERIC_BLOCK_SIZE 8388608 -ENUM_VALUE_LIST NULL -READ_ONLY YES -COMMAND_LINE_ARGUMENT REQUIRED VARIABLE_NAME INNODB_BUF_DUMP_STATUS_FREQUENCY SESSION_VALUE NULL DEFAULT_VALUE 0 diff --git a/mysql-test/suite/sys_vars/t/sysvars_innodb.test b/mysql-test/suite/sys_vars/t/sysvars_innodb.test index 250eb8b5c8f1b..1f3cd9f31d5d0 100644 --- 
a/mysql-test/suite/sys_vars/t/sysvars_innodb.test +++ b/mysql-test/suite/sys_vars/t/sysvars_innodb.test @@ -15,6 +15,8 @@ select VARIABLE_NAME, SESSION_VALUE, DEFAULT_VALUE, VARIABLE_SCOPE, VARIABLE_TYP 'innodb_numa_interleave', # only available WITH_NUMA 'innodb_evict_tables_on_commit_debug', # one may want to override this 'innodb_use_native_aio', # default value depends on OS + 'innodb_buffer_pool_commit', # only available on Linux + 'innodb_buffer_pool_size_max', # default value depends on OS 'innodb_log_file_buffering', # only available on Linux and Windows 'innodb_linux_aio', # existence depends on OS 'innodb_buffer_pool_load_pages_abort') # debug build only, and is only for testing diff --git a/mysys/my_largepage.c b/mysys/my_largepage.c index 22561ddc5640b..67aeff8eeca7a 100644 --- a/mysys/my_largepage.c +++ b/mysys/my_largepage.c @@ -482,13 +482,21 @@ char *my_large_virtual_alloc(size_t *size) DBUG_RETURN(ptr); } } + + my_use_large_pages= FALSE; } +# ifdef _AIX + /* On IBM AIX, my_virtual_mem_commit() relies on mprotect(2) rather than + a subsequent mmap(2) with MAP_FIXED. 
*/ ptr= mmap(NULL, *size, PROT_READ | PROT_WRITE, MAP_PRIVATE | OS_MAP_ANON, -1, 0); +# else + ptr= mmap(NULL, *size, PROT_NONE, MAP_PRIVATE | OS_MAP_ANON, -1, 0); +# endif if (ptr == MAP_FAILED) { - my_error(EE_OUTOFMEMORY, MYF(ME_BELL + ME_ERROR_LOG), size); + my_error(EE_OUTOFMEMORY, MYF(ME_BELL + ME_ERROR_LOG), *size); ptr= NULL; } diff --git a/mysys/my_virtual_mem.c b/mysys/my_virtual_mem.c index 649d8c693ff21..badaf45648134 100644 --- a/mysys/my_virtual_mem.c +++ b/mysys/my_virtual_mem.c @@ -62,7 +62,7 @@ static my_bool is_memory_committed(char *ptr, size_t size) } #endif -char *my_virtual_mem_commit(char *ptr, size_t size) +char *my_virtual_mem_commit(char *ptr, size_t size, my_bool commit) { DBUG_ASSERT(ptr); #ifdef _WIN32 @@ -103,8 +103,8 @@ char *my_virtual_mem_commit(char *ptr, size_t size) # else void *p= 0; const int flags= -# ifdef MAP_POPULATE - MAP_POPULATE | +# ifdef __linux__ + (commit ? MAP_POPULATE : 0) | # endif MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED; p= mmap(ptr, size, PROT_READ | PROT_WRITE, flags, -1, 0); @@ -128,9 +128,6 @@ void my_virtual_mem_decommit(char *ptr, size_t size) { #ifdef _WIN32 DBUG_ASSERT(is_memory_committed(ptr, size)); -# ifndef HAVE_UNACCESSIBLE_AFTER_MEM_DECOMMIT -# error "VirtualFree(MEM_DECOMMIT) will not allow subsequent reads!" -# endif if (!my_use_large_pages) { if (!VirtualFree(ptr, size, MEM_DECOMMIT)) @@ -141,19 +138,6 @@ void my_virtual_mem_decommit(char *ptr, size_t size) } } #else - const int prot= -# ifndef HAVE_UNACCESSIBLE_AFTER_MEM_DECOMMIT - /* - In InnoDB, buf_pool_t::page_guess() may deference pointers to - this, assuming that either the original contents or zeroed - contents is available. - */ - PROT_READ -# else - /* We will explicitly mark the memory unaccessible. 
*/ - PROT_NONE -# endif - ; # ifdef _AIX disclaim(ptr, size, DISCLAIM_ZEROMEM); # elif defined __linux__ || defined __osf__ @@ -172,7 +156,7 @@ void my_virtual_mem_decommit(char *ptr, size_t size) # else # warning "Do not know how to decommit memory" # endif - if (mprotect(ptr, size, prot)) + if (mprotect(ptr, size, PROT_NONE)) { my_error(EE_BADMEMORYRELEASE, MYF(ME_ERROR_LOG_ONLY), ptr, size, errno); DBUG_ASSERT(0); diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index d5d6f29479b66..2f39bf9360459 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -1374,8 +1374,8 @@ bool buf_pool_t::create() noexcept #ifdef UNIV_PFS_MEMORY PSI_MEMORY_CALL(memory_alloc)(mem_key_buf_buf_pool, actual_size, &owner); #endif -#ifdef _WIN32 - if (!my_virtual_mem_commit(memory, actual_size)) +#ifndef _AIX + if (!my_virtual_mem_commit(memory, actual_size, commit)) { my_virtual_mem_release(memory_unaligned, size_unaligned); memory= nullptr; @@ -1860,12 +1860,6 @@ inline void buf_pool_t::shrunk(size_t size, size_t reduced) noexcept ut_ad(size + reduced == size_in_bytes); size_in_bytes_requested= size; size_in_bytes= size; -# ifndef HAVE_UNACCESSIBLE_AFTER_MEM_DECOMMIT - /* Only page_guess() may read this memory, which after - my_virtual_mem_decommit() may be zeroed out or preserve its original - contents. Try to catch any unintended reads outside page_guess(). */ - MEM_UNDEFINED(memory + size, size_in_bytes_max - size); -# else for (size_t n= page_hash.pad(page_hash.n_cells), i= 0; i < n; i+= page_hash.ELEMENTS_PER_LATCH + 1) { @@ -1876,7 +1870,6 @@ inline void buf_pool_t::shrunk(size_t size, size_t reduced) noexcept guess before we invoke my_virtual_mem_decommit() below. 
*/ latch.unlock(); } -# endif my_virtual_mem_decommit(memory + size, reduced); #ifdef UNIV_PFS_MEMORY PSI_MEMORY_CALL(memory_free)(mem_key_buf_buf_pool, reduced, owner); @@ -1935,7 +1928,7 @@ ATTRIBUTE_COLD void buf_pool_t::resize(size_t size, THD *thd) noexcept if (n_blocks_removed <= 0) { - if (!my_virtual_mem_commit(memory + old_size, size - old_size)) + if (!my_virtual_mem_commit(memory + old_size, size - old_size, commit)) { mysql_mutex_unlock(&mutex); sql_print_error("InnoDB: Cannot commit innodb_buffer_pool_size=%zum;" @@ -2837,34 +2830,14 @@ uint32_t buf_pool_t::page_guess(buf_block_t *b, page_hash_latch &latch, const page_id_t id) noexcept { transactional_shared_lock_guard<page_hash_latch> g{latch}; -#ifndef HAVE_UNACCESSIBLE_AFTER_MEM_DECOMMIT - /* shrunk() and my_virtual_mem_decommit() could retain the original - contents of the virtual memory range or zero it out immediately or - with a delay. Any zeroing out may lead to a false positive for - b->page.id() == id but never for b->page.state(). At the time of - the shrunk() call, shrink() and buf_LRU_block_free_non_file_page() - should guarantee that b->page.state() is equal to - buf_page_t::NOT_USED (0) for all to-be-freed blocks. */ -#else - /* shrunk() made the memory inaccessible. */ if (UNIV_UNLIKELY(reinterpret_cast<char*>(b) >= memory + size_in_bytes)) + /* shrunk() made the memory inaccessible. */ return 0; -#endif const page_id_t block_id{b->page.id()}; -#ifndef HAVE_UNACCESSIBLE_AFTER_MEM_DECOMMIT - /* shrunk() may have invoked MEM_UNDEFINED() on this memory to be able - to catch any unintended access elsewhere in our code. */ - MEM_MAKE_DEFINED(&block_id, sizeof block_id); -#endif if (id == block_id) { uint32_t state= b->page.state(); -#ifndef HAVE_UNACCESSIBLE_AFTER_MEM_DECOMMIT - /* shrunk() may have invoked MEM_UNDEFINED() on this memory to be able - to catch any unintended access elsewhere in our code. */ - MEM_MAKE_DEFINED(&state, sizeof state); -#endif /* Ignore guesses that point to read-fixed blocks. 
We can only avoid a race condition by looking up the block via page_hash. */ if ((state >= buf_page_t::FREED && state < buf_page_t::READ_FIX) || diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 8e164843f4f1c..a2b1c88f9b9a0 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -3661,6 +3661,13 @@ static void innodb_buffer_pool_size_update(THD* thd,st_mysql_sys_var*,void*, buf_pool.resize(*static_cast<const size_t*>(save), thd); } +#ifdef __linux__ +static MYSQL_SYSVAR_BOOL(buffer_pool_commit, buf_pool.commit, + PLUGIN_VAR_NOCMDARG, + "Whether to disable Linux overcommit for innodb_buffer_pool_size", + NULL, NULL, FALSE); +#endif + static MYSQL_SYSVAR_SIZE_T(buffer_pool_size, buf_pool.size_in_bytes_requested, PLUGIN_VAR_RQCMDARG, "The size of the memory buffer InnoDB uses to cache data" @@ -3687,10 +3694,29 @@ static MYSQL_SYSVAR_SIZE_T(buffer_pool_size_auto_min, innodb_buffer_pool_extent_size); #endif +#if SIZEOF_SIZE_T < 8 || defined _AIX || defined HAVE_valgrind +/* In constrained environments, innodb_buffer_pool_size_max +will default to the initial innodb_buffer_pool_size, that is, +by default, it will not be possible to increase innodb_buffer_pool_size. + +In MemorySanitizer and possibly Valgrind memcheck, any virtual memory +allocation would be backed by one or more copies of shadow bits of the +same size that could be allocated and initialized even for dummy +mappings created by mmap(2) with PROT_NONE. We do not want significant +overhead beyond the actual innodb_buffer_pool_size. 
*/ +static constexpr size_t innodb_buffer_pool_size_max_default{0}, + innodb_buffer_pool_size_max_minimum{0}; +#else +static constexpr size_t innodb_buffer_pool_size_max_default{8ULL << 40},// 8TiB + innodb_buffer_pool_size_max_minimum{innodb_buffer_pool_extent_size}; +#endif + static MYSQL_SYSVAR_SIZE_T(buffer_pool_size_max, buf_pool.size_in_bytes_max, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "Maximum innodb_buffer_pool_size", - nullptr, nullptr, 0, 0, + nullptr, nullptr, + innodb_buffer_pool_size_max_default, + innodb_buffer_pool_size_max_minimum, size_t(-ssize_t(innodb_buffer_pool_extent_size)), innodb_buffer_pool_extent_size); @@ -3782,11 +3808,10 @@ static int innodb_init_params() min= ut_calc_align (buf_pool.blocks_in_bytes(BUF_LRU_MIN_LEN + BUF_LRU_MIN_LEN / 4), 1U << 20); - size_t innodb_buffer_pool_size= buf_pool.size_in_bytes_requested; + const size_t innodb_buffer_pool_size= buf_pool.size_in_bytes_requested; - /* With large pages, buffer pool can't grow or shrink. */ - if (!buf_pool.size_in_bytes_max || my_use_large_pages || - innodb_buffer_pool_size > buf_pool.size_in_bytes_max) + if (innodb_buffer_pool_size > buf_pool.size_in_bytes_max || + my_use_large_pages /* large_pages=ON fixes innodb_buffer_pool_size */) buf_pool.size_in_bytes_max= ut_calc_align(innodb_buffer_pool_size, innodb_buffer_pool_extent_size); @@ -19916,6 +19941,9 @@ static MYSQL_SYSVAR_BOOL(encrypt_temporary_tables, innodb_encrypt_temporary_tabl static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(autoextend_increment), +#ifdef __linux__ + MYSQL_SYSVAR(buffer_pool_commit), +#endif MYSQL_SYSVAR(buffer_pool_size), #if defined __linux__ || !defined DBUG_OFF MYSQL_SYSVAR(buffer_pool_size_auto_min), diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index de456680747b6..4a43a272a6f02 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -1797,6 +1797,10 @@ class buf_pool_t to read this for 
heuristic purposes without holding any mutex or latch */ +#ifdef __linux__ + /** whether to use MAP_POPULATE for the allocation */ + my_bool commit; +#endif /** Cleared when buf_LRU_get_free_block() fails. Set whenever the free list grows, along with a broadcast of done_free. Protected by buf_pool.mutex. */