Skip to content

Commit c02f883

Browse files
authored
Merge pull request #2 from Enmk/parsey_key_value_function-refactoring
Parsey key value function refactoring
2 parents 487296c + 89ebe61 commit c02f883

File tree

51 files changed

+1618
-1572
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

51 files changed

+1618
-1572
lines changed

base/base/find_symbols.h

Lines changed: 94 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ template <char ...chars> constexpr bool is_in(char x) { return ((x == chars) ||
4040

4141
static bool is_in(char c, const char * symbols, size_t num_chars)
4242
{
43-
for (auto i = 0u; i < num_chars; i++)
43+
for (size_t i = 0u; i < num_chars; ++i)
4444
{
4545
if (c == symbols[i])
4646
{
@@ -66,6 +66,43 @@ inline __m128i mm_is_in(__m128i bytes)
6666
__m128i eq = mm_is_in<s1, tail...>(bytes);
6767
return _mm_or_si128(eq0, eq);
6868
}
69+
70+
/// Run-time needle variant of mm_is_in.
/// For each of the 16 bytes in `bytes`, produces 0xFF in the matching lane if that byte equals
/// any of the first `num_chars` characters of `symbols`, and 0x00 otherwise.
/// With `num_chars == 0` the result is all zeroes.
inline __m128i mm_is_in(__m128i bytes, const char * symbols, size_t num_chars)
{
    __m128i matched = _mm_setzero_si128();

    const char * const symbols_end = symbols + num_chars;
    for (const char * symbol = symbols; symbol != symbols_end; ++symbol)
    {
        /// Broadcast the current symbol to all lanes, compare, and merge into the running mask.
        matched = _mm_or_si128(matched, _mm_cmpeq_epi8(bytes, _mm_set1_epi8(*symbol)));
    }

    return matched;
}
81+
82+
/// Builds the list of broadcast needles for mm_is_in_execute:
/// element `i` of the result holds `symbols[i]` replicated into all 16 byte lanes.
/// The result has exactly `num_chars` elements.
inline std::vector<__m128i> mm_is_in_prepare(const char * symbols, size_t num_chars)
{
    std::vector<__m128i> needles;
    needles.reserve(num_chars);

    const char * const symbols_end = symbols + num_chars;
    for (const char * symbol = symbols; symbol != symbols_end; ++symbol)
        needles.push_back(_mm_set1_epi8(*symbol));

    return needles;
}
94+
95+
/// Compares every byte of `bytes` against each pre-broadcast needle (see mm_is_in_prepare)
/// and ORs the per-needle comparison masks together.
/// A lane of the result is 0xFF if the corresponding byte equals at least one needle.
/// An empty `needles` list yields an all-zero mask.
inline __m128i mm_is_in_execute(__m128i bytes, const std::vector<__m128i> & needles)
{
    __m128i matched = _mm_setzero_si128();

    for (size_t i = 0; i != needles.size(); ++i)
        matched = _mm_or_si128(matched, _mm_cmpeq_epi8(bytes, needles[i]));

    return matched;
}
69106
#endif
70107

71108
template <bool positive>
@@ -112,6 +149,32 @@ inline const char * find_first_symbols_sse2(const char * const begin, const char
112149
return return_mode == ReturnMode::End ? end : nullptr;
113150
}
114151

152+
template <bool positive, ReturnMode return_mode>
153+
inline const char * find_first_symbols_sse2(const char * const begin, const char * const end, const char * symbols, size_t num_chars)
154+
{
155+
const char * pos = begin;
156+
const auto needles = mm_is_in_prepare(symbols, num_chars);
157+
158+
#if defined(__SSE2__)
159+
for (; pos + 15 < end; pos += 16)
160+
{
161+
__m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos));
162+
163+
__m128i eq = mm_is_in_execute(bytes, needles);
164+
165+
uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq)));
166+
if (bit_mask)
167+
return pos + __builtin_ctz(bit_mask);
168+
}
169+
#endif
170+
171+
for (; pos < end; ++pos)
172+
if (maybe_negate<positive>(is_in(*pos, symbols, num_chars)))
173+
return pos;
174+
175+
return return_mode == ReturnMode::End ? end : nullptr;
176+
}
177+
115178

116179
template <bool positive, ReturnMode return_mode, char... symbols>
117180
inline const char * find_last_symbols_sse2(const char * const begin, const char * const end)
@@ -192,21 +255,6 @@ inline const char * find_first_symbols_sse42(const char * const begin, const cha
192255
return return_mode == ReturnMode::End ? end : nullptr;
193256
}
194257

195-
196-
/// NOTE No SSE 4.2 implementation for find_last_symbols_or_null. Not worth to do.
197-
198-
template <bool positive, ReturnMode return_mode, char... symbols>
199-
inline const char * find_first_symbols_dispatch(const char * begin, const char * end)
200-
requires(0 <= sizeof...(symbols) && sizeof...(symbols) <= 16)
201-
{
202-
#if defined(__SSE4_2__)
203-
if (sizeof...(symbols) >= 5)
204-
return find_first_symbols_sse42<positive, return_mode, sizeof...(symbols), symbols...>(begin, end);
205-
else
206-
#endif
207-
return find_first_symbols_sse2<positive, return_mode, symbols...>(begin, end);
208-
}
209-
210258
template <bool positive, ReturnMode return_mode>
211259
inline const char * find_first_symbols_sse42(const char * const begin, const char * const end, const char * symbols, size_t num_chars)
212260
{
@@ -215,7 +263,10 @@ inline const char * find_first_symbols_sse42(const char * const begin, const cha
215263
#if defined(__SSE4_2__)
216264
constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT;
217265

218-
const __m128i set = _mm_loadu_si128(reinterpret_cast<const __m128i *>(symbols));
266+
// This is to avoid read past end of `symbols` if `num_chars < 16`.
267+
char buffer[16] = {'\0'};
268+
memcpy(buffer, symbols, num_chars);
269+
const __m128i set = _mm_loadu_si128(reinterpret_cast<const __m128i *>(buffer));
219270

220271
for (; pos + 15 < end; pos += 16)
221272
{
@@ -241,10 +292,30 @@ inline const char * find_first_symbols_sse42(const char * const begin, const cha
241292
return return_mode == ReturnMode::End ? end : nullptr;
242293
}
243294

295+
/// NOTE: There is no SSE 4.2 implementation for find_last_symbols_or_null; it is not worth doing.
296+
297+
template <bool positive, ReturnMode return_mode, char... symbols>
298+
inline const char * find_first_symbols_dispatch(const char * begin, const char * end)
299+
requires(0 <= sizeof...(symbols) && sizeof...(symbols) <= 16)
300+
{
301+
#if defined(__SSE4_2__)
302+
if (sizeof...(symbols) >= 5)
303+
return find_first_symbols_sse42<positive, return_mode, sizeof...(symbols), symbols...>(begin, end);
304+
else
305+
#endif
306+
return find_first_symbols_sse2<positive, return_mode, symbols...>(begin, end);
307+
}
308+
244309
template <bool positive, ReturnMode return_mode>
245-
auto find_first_symbols_sse42(std::string_view haystack, std::string_view symbols)
310+
inline const char * find_first_symbols_dispatch(const std::string_view haystack, const std::string_view symbols)
246311
{
247-
return find_first_symbols_sse42<positive, return_mode>(haystack.begin(), haystack.end(), symbols.begin(), symbols.size());
312+
const size_t num_chars = std::min<size_t>(symbols.size(), 16);
313+
#if defined(__SSE4_2__)
314+
if (num_chars >= 5)
315+
return find_first_symbols_sse42<positive, return_mode>(haystack.begin(), haystack.end(), symbols.begin(), num_chars);
316+
else
317+
#endif
318+
return find_first_symbols_sse2<positive, return_mode>(haystack.begin(), haystack.end(), symbols.begin(), num_chars);
248319
}
249320

250321
}
@@ -266,7 +337,7 @@ inline char * find_first_symbols(char * begin, char * end)
266337

267338
/// Run-time needle overload: returns a pointer to the first character of `haystack`
/// that occurs in `symbols`, or a pointer to the end of `haystack` if there is none
/// (ReturnMode::End). The dispatcher uses at most the first 16 characters of `symbols`.
inline const char * find_first_symbols(std::string_view haystack, std::string_view symbols)
268339
{
269-
return detail::find_first_symbols_sse42<true, detail::ReturnMode::End>(haystack, symbols);
340+
return detail::find_first_symbols_dispatch<true, detail::ReturnMode::End>(haystack, symbols);
270341
}
271342

272343
template <char... symbols>
@@ -283,7 +354,7 @@ inline char * find_first_not_symbols(char * begin, char * end)
283354

284355
/// Run-time needle overload with the match negated (positive = false): returns a pointer to
/// the first character of `haystack` that does NOT occur in `symbols`, or a pointer to the end
/// of `haystack` if there is none (ReturnMode::End).
/// The dispatcher uses at most the first 16 characters of `symbols`.
inline const char * find_first_not_symbols(std::string_view haystack, std::string_view symbols)
285356
{
286-
return detail::find_first_symbols_sse42<false, detail::ReturnMode::End>(haystack, symbols);
357+
return detail::find_first_symbols_dispatch<false, detail::ReturnMode::End>(haystack, symbols);
287358
}
288359

289360
template <char... symbols>
@@ -300,7 +371,7 @@ inline char * find_first_symbols_or_null(char * begin, char * end)
300371

301372
/// Run-time needle overload: returns a pointer to the first character of `haystack`
/// that occurs in `symbols`, or nullptr if there is none (ReturnMode::Nullptr).
/// The dispatcher uses at most the first 16 characters of `symbols`.
inline const char * find_first_symbols_or_null(std::string_view haystack, std::string_view symbols)
302373
{
303-
return detail::find_first_symbols_sse42<true, detail::ReturnMode::Nullptr>(haystack, symbols);
374+
return detail::find_first_symbols_dispatch<true, detail::ReturnMode::Nullptr>(haystack, symbols);
304375
}
305376

306377
template <char... symbols>
@@ -317,7 +388,7 @@ inline char * find_first_not_symbols_or_null(char * begin, char * end)
317388

318389
/// Run-time needle overload with the match negated (positive = false): returns a pointer to
/// the first character of `haystack` that does NOT occur in `symbols`, or nullptr if there is
/// none (ReturnMode::Nullptr).
/// The dispatcher uses at most the first 16 characters of `symbols`.
inline const char * find_first_not_symbols_or_null(std::string_view haystack, std::string_view symbols)
319390
{
320-
return detail::find_first_symbols_sse42<false, detail::ReturnMode::Nullptr>(haystack, symbols);
391+
return detail::find_first_symbols_dispatch<false, detail::ReturnMode::Nullptr>(haystack, symbols);
321392
}
322393

323394
template <char... symbols>

src/Common/tests/gtest_find_symbols.cpp

Lines changed: 139 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ void test_find_first_not(const std::string & haystack, const std::string & symbo
2323

2424
TEST(FindSymbols, SimpleTest)
2525
{
26-
std::string s = "Hello, world! Goodbye...";
26+
const std::string s = "Hello, world! Goodbye...";
2727
const char * begin = s.data();
2828
const char * end = s.data() + s.size();
2929

@@ -34,6 +34,9 @@ TEST(FindSymbols, SimpleTest)
3434
ASSERT_EQ(find_first_symbols<'H'>(begin, end), begin);
3535
ASSERT_EQ((find_first_symbols<'a', 'e'>(begin, end)), begin + 1);
3636

37+
ASSERT_EQ((find_first_symbols<'a', 'e', 'w', 'x', 'z'>(begin, end)), begin + 1);
38+
ASSERT_EQ((find_first_symbols<'p', 'q', 's', 'x', 'z'>(begin, end)), end);
39+
3740
ASSERT_EQ(find_last_symbols_or_null<'a'>(begin, end), nullptr);
3841
ASSERT_EQ(find_last_symbols_or_null<'e'>(begin, end), end - 4);
3942
ASSERT_EQ(find_last_symbols_or_null<'.'>(begin, end), end - 1);
@@ -54,6 +57,141 @@ TEST(FindSymbols, SimpleTest)
5457
}
5558
}
5659

60+
template <bool positive, detail::ReturnMode return_mode>
61+
inline const char * find_first_symbols_sse42_MY(const char * const begin, const char * const end, const char * symbols, size_t num_chars)
62+
{
63+
using namespace detail;
64+
const char * pos = begin;
65+
66+
#if defined(__SSE4_2__)
67+
constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT;
68+
69+
#if defined(__AVX512F__) || defined(__AVX512BW__) || defined(__AVX__) || defined(__AVX2__)
70+
71+
#else
72+
// This is to avoid read past end of allocated string while loading `set` from `symbols` if `num_chars < 16`.
73+
char buffer[16] = {'\0'};
74+
memcpy(buffer, symbols, num_chars);
75+
const __m128i set = _mm_loadu_si128(reinterpret_cast<const __m128i *>(buffer));
76+
#endif
77+
78+
for (; pos + 15 < end; pos += 16)
79+
{
80+
__m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos));
81+
82+
if constexpr (positive)
83+
{
84+
if (_mm_cmpestrc(set, num_chars, bytes, 16, mode))
85+
return pos + _mm_cmpestri(set, num_chars, bytes, 16, mode);
86+
}
87+
else
88+
{
89+
if (_mm_cmpestrc(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY))
90+
return pos + _mm_cmpestri(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY);
91+
}
92+
}
93+
#endif
94+
95+
for (; pos < end; ++pos)
96+
if (maybe_negate<positive>(is_in(*pos, symbols, num_chars)))
97+
return pos;
98+
99+
return return_mode == ReturnMode::End ? end : nullptr;
100+
}
101+
102+
/// Test-local shortcut: forwards to detail::find_first_symbols_dispatch with positive = true and
/// ReturnMode::End for the compile-time symbol set `symbols...` over the range [begin, end).
template <char... symbols>
103+
inline const char * find_first_symbols_MY(const char * begin, const char * end)
104+
{
105+
return detail::find_first_symbols_dispatch<true, detail::ReturnMode::End, symbols...>(begin, end);
106+
}
107+
108+
TEST(FindSymbols, RunTimeNeedle)
109+
{
110+
auto test_haystack = [](const auto & haystack, const auto & unfindable_needle) {
111+
#define TEST_HAYSTACK_AND_NEEDLE(haystack_, needle_) \
112+
do { \
113+
const auto & h = haystack_; \
114+
const auto & n = needle_; \
115+
EXPECT_EQ( \
116+
std::find_first_of(h.data(), h.data() + h.size(), n.data(), n.data() + n.size()), \
117+
find_first_symbols(h, n) \
118+
) << "haystack: \"" << h << "\" (" << static_cast<const void*>(h.data()) << ")" \
119+
<< ", needle: \"" << n << "\""; \
120+
} \
121+
while (false)
122+
123+
// can't find needle
124+
TEST_HAYSTACK_AND_NEEDLE(haystack, unfindable_needle);
125+
126+
#define TEST_WITH_MODIFIED_NEEDLE(haystack, in_needle, needle_update_statement) \
127+
do \
128+
{ \
129+
std::string needle = (in_needle); \
130+
(needle_update_statement); \
131+
TEST_HAYSTACK_AND_NEEDLE(haystack, needle); \
132+
} \
133+
while (false)
134+
135+
// findable symbol is at the beginning of the needle
136+
// Can find at first pos of haystack
137+
TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle.front() = haystack.front());
138+
// Can find at last pos of haystack
139+
TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle.front() = haystack.back());
140+
// Can find in the middle of haystack
141+
TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle.front() = haystack[haystack.size() / 2]);
142+
143+
// findable symbol is at end of the needle
144+
// Can find at first pos of haystack
145+
TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle.back() = haystack.front());
146+
// Can find at last pos of haystack
147+
TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle.back() = haystack.back());
148+
// Can find in the middle of haystack
149+
TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle.back() = haystack[haystack.size() / 2]);
150+
151+
// findable symbol is in the middle of the needle
152+
// Can find at first pos of haystack
153+
TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle[needle.size() / 2] = haystack.front());
154+
// Can find at last pos of haystack
155+
TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle[needle.size() / 2] = haystack.back());
156+
// Can find in the middle of haystack
157+
TEST_WITH_MODIFIED_NEEDLE(haystack, unfindable_needle, needle[needle.size() / 2] = haystack[haystack.size() / 2]);
158+
159+
#undef TEST_WITH_MODIFIED_NEEDLE
160+
#undef TEST_HAYSTACK_AND_NEEDLE
161+
};
162+
163+
// there are 4 major groups of cases:
164+
// haystack < 16 bytes, haystack > 16 bytes
165+
// needle < 5 bytes, needle >= 5 bytes
166+
167+
// First and last symbols of haystack should be unique
168+
const std::string long_haystack = "Hello, world! Goodbye...?";
169+
const std::string short_haystack = "Hello, world!";
170+
171+
// In sync with find_first_symbols_dispatch code: long needles receive special treatment.
172+
// as of now "long" means >= 5
173+
const std::string unfindable_long_needle = "0123456789ABCDEF";
174+
const std::string unfindable_short_needle = "0123";
175+
176+
{
177+
SCOPED_TRACE("Long haystack");
178+
test_haystack(long_haystack, unfindable_long_needle);
179+
test_haystack(long_haystack, unfindable_short_needle);
180+
}
181+
182+
{
183+
SCOPED_TRACE("Short haystack");
184+
test_haystack(short_haystack, unfindable_long_needle);
185+
test_haystack(short_haystack, unfindable_short_needle);
186+
}
187+
188+
// Check that nothing matches on big haystack,
189+
EXPECT_EQ(find_first_symbols(long_haystack, "ABCDEFIJKLMNOPQRSTUVWXYZacfghijkmnpqstuvxz"), long_haystack.data() + long_haystack.size());
190+
191+
// only the first 16 bytes of the needle are checked, so nothing is found
192+
EXPECT_EQ(find_first_symbols(long_haystack, "ABCDEFIJKLMNOPQR0helloworld"), long_haystack.data() + long_haystack.size());
193+
}
194+
57195
TEST(FindNotSymbols, AllSymbolsPresent)
58196
{
59197
std::string str_with_17_bytes = "hello world hello";

src/Functions/CMakeLists.txt

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,8 +104,7 @@ add_subdirectory(JSONPath)
104104
list (APPEND PRIVATE_LIBS clickhouse_functions_jsonpath)
105105

106106
add_subdirectory(keyvaluepair)
107-
list (APPEND OBJECT_LIBS $<TARGET_OBJECTS:clickhouse_functions_extractkeyvaluepairs_core>)
108-
list (APPEND OBJECT_LIBS $<TARGET_OBJECTS:clickhouse_functions_extractkeyvaluepairs_api>)
107+
list (APPEND OBJECT_LIBS $<TARGET_OBJECTS:clickhouse_functions_extractkeyvaluepairs>)
109108

110109
# Signed integer overflow on user-provided data inside boost::geometry - ignore.
111110
set_source_files_properties("pointInPolygon.cpp" PROPERTIES COMPILE_FLAGS -fno-sanitize=signed-integer-overflow)

src/Functions/keyvaluepair/api/ArgumentExtractor.cpp renamed to src/Functions/keyvaluepair/ArgumentExtractor.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#include "ArgumentExtractor.h"
1+
#include <Functions/keyvaluepair/ArgumentExtractor.h>
22

33
namespace DB
44
{

src/Functions/keyvaluepair/api/ArgumentExtractor.h renamed to src/Functions/keyvaluepair/ArgumentExtractor.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
#pragma once
22

3-
#include <optional>
4-
53
#include <Columns/IColumn.h>
64
#include <Core/ColumnsWithTypeAndName.h>
75

6+
#include <optional>
7+
88
namespace DB
99
{
1010

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,7 @@
1-
add_subdirectory(src)
2-
add_subdirectory(api)
1+
# Defines the single library target clickhouse_functions_extractkeyvaluepairs.
# add_headers_and_sources() comes from dbms_glob_sources.cmake; presumably it collects the
# headers and sources of the given directory into <prefix>_headers / <prefix>_sources
# (those are the variables consumed by add_library below) - verify against that module.
include("${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake")
2+
add_headers_and_sources(clickhouse_functions_extractkeyvaluepairs .)
3+
add_headers_and_sources(clickhouse_functions_extractkeyvaluepairs impl)
4+
5+
add_library(clickhouse_functions_extractkeyvaluepairs ${clickhouse_functions_extractkeyvaluepairs_sources} ${clickhouse_functions_extractkeyvaluepairs_headers})
6+
7+
target_link_libraries(clickhouse_functions_extractkeyvaluepairs PRIVATE dbms)

src/Functions/keyvaluepair/api/CMakeLists.txt

Lines changed: 0 additions & 6 deletions
This file was deleted.

0 commit comments

Comments
 (0)