From 981ee02c7c664f19b983662d618d5e6bd87d1739 Mon Sep 17 00:00:00 2001 From: "K.Takata" Date: Tue, 21 Nov 2017 20:30:00 +0900 Subject: [PATCH 1/7] Fix performance problem with /k/i and /s/i (Close k-takata/Onigmo#97) E.g. For the pattern `/----k/i`, optimization was totally turned off. Make it possible to use the characters before `k` (i.e. `----`) for optimization. https://github.com/k-takata/Onigmo/commit/9c13de8d0684ebde97e3709d7693997c81ca374b --- regcomp.c | 67 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 24 deletions(-) diff --git a/regcomp.c b/regcomp.c index e389e6f1209af5..27878405cd24b6 100644 --- a/regcomp.c +++ b/regcomp.c @@ -4217,7 +4217,7 @@ set_bm_skip(UChar* s, UChar* end, regex_t* reg, { OnigDistance i, len; int clen, flen, n, j, k; - UChar *p, buf[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM][ONIGENC_MBC_CASE_FOLD_MAXLEN]; + UChar *p, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM]; OnigEncoding enc = reg->enc; @@ -4299,7 +4299,7 @@ set_bm_skip(UChar* s, UChar* end, regex_t* reg, { OnigDistance i, len; int clen, flen, n, j, k; - UChar *p, buf[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM][ONIGENC_MBC_CASE_FOLD_MAXLEN]; + UChar *p, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM]; OnigEncoding enc = reg->enc; @@ -4307,6 +4307,34 @@ set_bm_skip(UChar* s, UChar* end, regex_t* reg, if (len < ONIG_CHAR_TABLE_SIZE) { for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = (UChar )(len + 1); + if (ignore_case) { + for (i = 0; i < len; i += clen) { + p = s + i; + n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, + p, end, items); + clen = enclen(enc, p, end); + if (p + clen > end) + clen = (int )(end - p); + + for (j = 0; j < n; j++) { + if ((items[j].code_len != 1) || (items[j].byte_len != clen)) { + /* Different length isn't supported. Stop optimization at here. */ + end = p; + goto endcheck; + } + flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf); + if (flen != clen) { + /* Different length isn't supported. Stop optimization at here. */ + end = p; + goto endcheck; + } + } + } +endcheck: + ; + } + + len = end - s; n = 0; for (i = 0; i < len; i += clen) { p = s + i; @@ -4317,17 +4345,11 @@ set_bm_skip(UChar* s, UChar* end, regex_t* reg, if (p + clen > end) clen = (int )(end - p); - for (j = 0; j < n; j++) { - if ((items[j].code_len != 1) || (items[j].byte_len != clen)) - return 1; /* different length isn't supported. */ - flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]); - if (flen != clen) - return 1; /* different length isn't supported. */ - } for (j = 0; j < clen; j++) { skip[s[i + j]] = (UChar )(len - i - j); for (k = 0; k < n; k++) { - skip[buf[k][j]] = (UChar )(len - i - j); + ONIGENC_CODE_TO_MBC(enc, items[k].code[0], buf); + skip[buf[j]] = (UChar )(len - i - j); } } } @@ -4369,7 +4391,7 @@ set_bm_skip(UChar* s, UChar* end, regex_t* reg, } # endif } - return 0; + return (int)len; } #endif /* USE_SUNDAY_QUICK_SEARCH */ @@ -5342,7 +5364,6 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) static int set_optimize_exact_info(regex_t* reg, OptExactInfo* e) { - int r; int allow_reverse; if (e->len == 0) return 0; @@ -5357,15 +5378,18 @@ set_optimize_exact_info(regex_t* reg, OptExactInfo* e) if (e->ignore_case > 0) { if (e->len >= 3 || (e->len >= 2 && allow_reverse)) { - r = set_bm_skip(reg->exact, reg->exact_end, reg, + e->len = set_bm_skip(reg->exact, reg->exact_end, reg, reg->map, &(reg->int_map), 1); - if (r == 0) { + reg->exact_end = reg->exact + e->len; + if (e->len >= 3) { reg->optimize = (allow_reverse != 0 ? ONIG_OPTIMIZE_EXACT_BM_IC : ONIG_OPTIMIZE_EXACT_BM_NOT_REV_IC); } - else { + else if (e->len > 0) { reg->optimize = ONIG_OPTIMIZE_EXACT_IC; } + else + return 0; } else { reg->optimize = ONIG_OPTIMIZE_EXACT_IC; @@ -5373,15 +5397,10 @@ set_optimize_exact_info(regex_t* reg, OptExactInfo* e) } else { if (e->len >= 3 || (e->len >= 2 && allow_reverse)) { - r = set_bm_skip(reg->exact, reg->exact_end, reg, - reg->map, &(reg->int_map), 0); - if (r == 0) { - reg->optimize = (allow_reverse != 0 - ? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV); - } - else { - reg->optimize = ONIG_OPTIMIZE_EXACT; - } + set_bm_skip(reg->exact, reg->exact_end, reg, + reg->map, &(reg->int_map), 0); + reg->optimize = (allow_reverse != 0 + ? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV); } else { reg->optimize = ONIG_OPTIMIZE_EXACT; From bcea1129f4cac556befd3aa6f0f5580a9f0b1eb5 Mon Sep 17 00:00:00 2001 From: "K.Takata" Date: Thu, 24 Jan 2019 19:05:57 +0900 Subject: [PATCH 2/7] Revert "[tune] implicit-anchor optimization" This reverts commit 282338f88a8bf0807a7a1d21b06f78abe9de8fac. It seems that the commit didn't improve the performance. Revert it to fix k-takata/Onigmo#100. https://github.com/k-takata/Onigmo/commit/cef834cb3a6e278fa252f52b704c65175a970ac0 --- regcomp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regcomp.c b/regcomp.c index 27878405cd24b6..12ad5d79a65571 100644 --- a/regcomp.c +++ b/regcomp.c @@ -5254,7 +5254,7 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) r = optimize_node_left(qn->target, &nopt, env); if (r) break; - if (/*qn->lower == 0 &&*/ IS_REPEAT_INFINITE(qn->upper)) { + if (qn->lower == 0 && IS_REPEAT_INFINITE(qn->upper)) { if (env->mmd.max == 0 && NTYPE(qn->target) == NT_CANY && qn->greedy) { if (IS_MULTILINE(env->options)) From bfbbcf34557b0aad4f5ed045df3e4e86b11c9a8c Mon Sep 17 00:00:00 2001 From: "K.Takata" Date: Fri, 25 Jan 2019 18:54:41 +0900 Subject: [PATCH 3/7] [Bug #13671] Fix that "ss" in look-behind causes syntax error Fixes k-takata/Onigmo#92. This fix was ported from oniguruma: https://github.com/kkos/oniguruma/commit/257082dac8c6019198b56324012f0bd1830ff4ba https://github.com/k-takata/Onigmo/commit/b1a5445fbeba97b3e94a733c2ce11c033453af73 --- regcomp.c | 37 ++++++++++++++++++------------- spec/ruby/language/regexp_spec.rb | 2 +- test/ruby/test_regexp.rb | 23 +++++++++++++++++++ 3 files changed, 46 insertions(+), 16 deletions(-) diff --git a/regcomp.c b/regcomp.c index 12ad5d79a65571..22dbe5f9d50b6e 100644 --- a/regcomp.c +++ b/regcomp.c @@ -3301,6 +3301,14 @@ setup_subexp_call(Node* node, ScanEnv* env) } #endif +#define IN_ALT (1<<0) +#define IN_NOT (1<<1) +#define IN_REPEAT (1<<2) +#define IN_VAR_REPEAT (1<<3) +#define IN_CALL (1<<4) +#define IN_RECCALL (1<<5) +#define IN_LOOK_BEHIND (1<<6) + /* divide different length alternatives in look-behind. (?<=A|B) ==> (?<=A)|(?<=B) (? (?s; end = sn->end; if (start >= end) return 0; + is_in_look_behind = (state & IN_LOOK_BEHIND) != 0; + r = 0; top_root = root = prev_node = snode = NULL_NODE; alt_num = 1; @@ -3630,7 +3643,7 @@ expand_case_fold_string(Node* node, regex_t* reg) len = enclen(reg->enc, p, end); varlen = is_case_fold_variable_len(n, items, len); - if (n == 0 || varlen == 0) { + if (n == 0 || varlen == 0 || is_in_look_behind) { if (IS_NULL(snode)) { if (IS_NULL(root) && IS_NOT_NULL(prev_node)) { onig_node_free(top_root); @@ -3889,13 +3902,6 @@ setup_comb_exp_check(Node* node, int state, ScanEnv* env) } #endif -#define IN_ALT (1<<0) -#define IN_NOT (1<<1) -#define IN_REPEAT (1<<2) -#define IN_VAR_REPEAT (1<<3) -#define IN_CALL (1<<4) -#define IN_RECCALL (1<<5) - /* setup_tree does the following work. 1. check empty loop. (set qn->target_empty_info) 2. expand ignore-case in char class. @@ -3937,7 +3943,7 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) case NT_STR: if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) { - r = expand_case_fold_string(node, reg); + r = expand_case_fold_string(node, reg, state); } break; @@ -4180,7 +4186,7 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) if (r < 0) return r; if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; if (NTYPE(node) != NT_ANCHOR) goto restart; - r = setup_tree(an->target, reg, state, env); + r = setup_tree(an->target, reg, (state | IN_LOOK_BEHIND), env); if (r != 0) return r; r = setup_look_behind(node, reg, env); } @@ -4193,7 +4199,8 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) if (r < 0) return r; if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; if (NTYPE(node) != NT_ANCHOR) goto restart; - r = setup_tree(an->target, reg, (state | IN_NOT), env); + r = setup_tree(an->target, reg, (state | IN_NOT | IN_LOOK_BEHIND), + env); if (r != 0) return r; r = setup_look_behind(node, reg, env); } diff --git a/spec/ruby/language/regexp_spec.rb b/spec/ruby/language/regexp_spec.rb index 0cd9584549b801..dbf341b19ea526 100644 --- a/spec/ruby/language/regexp_spec.rb +++ b/spec/ruby/language/regexp_spec.rb @@ -112,7 +112,7 @@ /foo.(?<=\d)/.match("fooA foo1").to_a.should == ["foo1"] end - ruby_bug "#13671", ""..."3.6" do # https://bugs.ruby-lang.org/issues/13671 + ruby_bug "#13671", ""..."3.5" do # https://bugs.ruby-lang.org/issues/13671 it "handles a lookbehind with ss characters" do r = Regexp.new("(? Date: Fri, 25 Jan 2019 18:56:31 +0900 Subject: [PATCH 4/7] Fix lgtm.com warnings * Multiplication result may overflow 'int' before it is converted to 'OnigDistance'. * Comparison is always true because code <= 122. * This statement makes ExprStmt unreachable. * Empty block without comment https://github.com/k-takata/Onigmo/commit/387ad616c3cb9370f99d2b11198c2135fa07030f --- enc/unicode.c | 2 +- regcomp.c | 9 +++------ regexec.c | 28 ++++++---------------------- 3 files changed, 10 insertions(+), 29 deletions(-) diff --git a/enc/unicode.c b/enc/unicode.c index cbfc6cdf589681..1c4428a3beb4b9 100644 --- a/enc/unicode.c +++ b/enc/unicode.c @@ -682,7 +682,7 @@ onigenc_unicode_case_map(OnigCaseFoldType* flagP, *pp += codepoint_length; if (code <= 'z') { /* ASCII comes first */ - if (code >= 'a' && code <= 'z') { + if (code >= 'a' /*&& code <= 'z'*/) { if (flags & ONIGENC_CASE_UPCASE) { MODIFIED; if (flags & ONIGENC_CASE_FOLD_TURKISH_AZERI && code == 'i') diff --git a/regcomp.c b/regcomp.c index 22dbe5f9d50b6e..664806f085ee0e 100644 --- a/regcomp.c +++ b/regcomp.c @@ -2803,14 +2803,11 @@ get_head_value_node(Node* node, int exact, regex_t* reg) case NT_STR: { StrNode* sn = NSTR(node); - if (sn->end <= sn->s) break; - if (exact != 0 && - !NSTRING_IS_RAW(node) && IS_IGNORECASE(reg->options)) { - } - else { + if (exact == 0 || + NSTRING_IS_RAW(node) || !IS_IGNORECASE(reg->options)) { n = node; } } @@ -5078,7 +5075,7 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) if (NSTRING_IS_DONT_GET_OPT_INFO(node)) { int n = onigenc_strlen(env->enc, sn->s, sn->end); - max = ONIGENC_MBC_MAXLEN_DIST(env->enc) * (OnigDistance)n; + max = (OnigDistance )ONIGENC_MBC_MAXLEN_DIST(env->enc) * (OnigDistance)n; } else { concat_opt_exact_info_str(&opt->exb, sn->s, sn->end, diff --git a/regexec.c b/regexec.c index b8d174ec8ed472..e5e5f948a97bb1 100644 --- a/regexec.c +++ b/regexec.c @@ -2742,7 +2742,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, /* default behavior: return first-matching result. */ goto finish; - NEXT; CASE(OP_EXACT1) MOP_IN(OP_EXACT1); DATA_ENSURE(1); @@ -3316,40 +3315,36 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (ON_STR_BEGIN(s) || !ONIGENC_IS_MBC_WORD(encode, sprev, end)) { MOP_OUT; JUMP; - } + } } goto fail; - NEXT; CASE(OP_ASCII_WORD_BEGIN) MOP_IN(OP_ASCII_WORD_BEGIN); if (DATA_ENSURE_CHECK1 && ONIGENC_IS_MBC_ASCII_WORD(encode, s, end)) { if (ON_STR_BEGIN(s) || !ONIGENC_IS_MBC_ASCII_WORD(encode, sprev, end)) { MOP_OUT; JUMP; - } + } } goto fail; - NEXT; CASE(OP_WORD_END) MOP_IN(OP_WORD_END); if (!ON_STR_BEGIN(s) && ONIGENC_IS_MBC_WORD(encode, sprev, end)) { if (ON_STR_END(s) || !ONIGENC_IS_MBC_WORD(encode, s, end)) { MOP_OUT; JUMP; - } + } } goto fail; - NEXT; CASE(OP_ASCII_WORD_END) MOP_IN(OP_ASCII_WORD_END); if (!ON_STR_BEGIN(s) && ONIGENC_IS_MBC_ASCII_WORD(encode, sprev, end)) { if (ON_STR_END(s) || !ONIGENC_IS_MBC_ASCII_WORD(encode, s, end)) { MOP_OUT; JUMP; - } + } } goto fail; - NEXT; #endif CASE(OP_BEGIN_BUF) MOP_IN(OP_BEGIN_BUF); @@ -3379,10 +3374,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, #endif && !ON_STR_END(s)) { MOP_OUT; - JUMP; + JUMP; } goto fail; - NEXT; CASE(OP_END_LINE) MOP_IN(OP_END_LINE); if (ON_STR_END(s)) { @@ -3398,10 +3392,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } else if (ONIGENC_IS_MBC_NEWLINE_EX(encode, s, str, end, option, 1)) { MOP_OUT; - JUMP; + JUMP; } goto fail; - NEXT; CASE(OP_SEMI_END_BUF) MOP_IN(OP_SEMI_END_BUF); if (ON_STR_END(s)) { @@ -3433,7 +3426,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, #endif } goto fail; - NEXT; CASE(OP_BEGIN_POSITION) MOP_IN(OP_BEGIN_POSITION); if (s != msa->gpos) @@ -3499,12 +3491,10 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, CASE(OP_BACKREF1) MOP_IN(OP_BACKREF1); mem = 1; goto backref; - NEXT; CASE(OP_BACKREF2) MOP_IN(OP_BACKREF2); mem = 2; goto backref; - NEXT; CASE(OP_BACKREFN) MOP_IN(OP_BACKREFN); GET_MEMNUM_INC(mem, p); @@ -3934,7 +3924,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, STACK_GET_REPEAT(mem, stkp); si = GET_STACK_INDEX(stkp); goto repeat_inc; - NEXT; CASE(OP_REPEAT_INC_NG) MOP_IN(OP_REPEAT_INC_NG); GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */ @@ -3970,7 +3959,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, STACK_GET_REPEAT(mem, stkp); si = GET_STACK_INDEX(stkp); goto repeat_inc_ng; - NEXT; CASE(OP_PUSH_POS) MOP_IN(OP_PUSH_POS); STACK_PUSH_POS(s, sprev, pkeep); @@ -3995,7 +3983,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, CASE(OP_FAIL_POS) MOP_IN(OP_FAIL_POS); STACK_POP_TIL_POS_NOT; goto fail; - NEXT; CASE(OP_PUSH_STOP_BT) MOP_IN(OP_PUSH_STOP_BT); STACK_PUSH_STOP_BT; @@ -4036,7 +4023,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, CASE(OP_FAIL_LOOK_BEHIND_NOT) MOP_IN(OP_FAIL_LOOK_BEHIND_NOT); STACK_POP_TIL_LOOK_BEHIND_NOT; goto fail; - NEXT; CASE(OP_PUSH_ABSENT_POS) MOP_IN(OP_PUSH_ABSENT_POS); /* Save the absent-start-pos and the original end-pos. */ @@ -4098,7 +4084,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, #endif STACK_POP_TIL_ABSENT; goto fail; - NEXT; #ifdef USE_SUBEXP_CALL CASE(OP_CALL) MOP_IN(OP_CALL); @@ -4128,7 +4113,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, CASE(OP_FINISH) goto finish; - NEXT; CASE(OP_FAIL) if (0) { From 54b963956b65f8333886e6afe4fb6d73e250148f Mon Sep 17 00:00:00 2001 From: "K.Takata" Date: Fri, 25 Jan 2019 18:58:59 +0900 Subject: [PATCH 5/7] Avoid negative character Better fix for k-takata/Onigmo#107. https://github.com/k-takata/Onigmo/commit/85393e4a63223b538529e7095255ce1153c09cff --- enc/unicode.c | 6 ++---- regenc.c | 4 ++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/enc/unicode.c b/enc/unicode.c index 1c4428a3beb4b9..07497cdbe46731 100644 --- a/enc/unicode.c +++ b/enc/unicode.c @@ -687,10 +687,8 @@ onigenc_unicode_case_map(OnigCaseFoldType* flagP, MODIFIED; if (flags & ONIGENC_CASE_FOLD_TURKISH_AZERI && code == 'i') code = I_WITH_DOT_ABOVE; - else { - code -= 'a'; - code += 'A'; - } + else + code -= 'a' - 'A'; } } else if (code >= 'A' && code <= 'Z') { diff --git a/regenc.c b/regenc.c index 0afdf22cb7bdc1..823aacc28e615b 100644 --- a/regenc.c +++ b/regenc.c @@ -984,7 +984,7 @@ onigenc_ascii_only_case_map(OnigCaseFoldType* flagP, const OnigUChar** pp, const if (code >= 'a' && code <= 'z' && (flags & ONIGENC_CASE_UPCASE)) { flags |= ONIGENC_CASE_MODIFIED; - code += 'A' - 'a'; + code -= 'a' - 'A'; } else if (code >= 'A' && code <= 'Z' && (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD))) { @@ -1013,7 +1013,7 @@ onigenc_single_byte_ascii_only_case_map(OnigCaseFoldType* flagP, const OnigUChar if (code >= 'a' && code <= 'z' && (flags & ONIGENC_CASE_UPCASE)) { flags |= ONIGENC_CASE_MODIFIED; - code += 'A' - 'a'; + code -= 'a' - 'A'; } else if (code >= 'A' && code <= 'Z' && (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD))) { From a89246834d8d2b60ece05b5ee34a012973b53df6 Mon Sep 17 00:00:00 2001 From: "K.Takata" Date: Mon, 28 Jan 2019 18:52:39 +0900 Subject: [PATCH 6/7] Fix initialization of the table for quick search This fixes k-takata/Onigmo#120. The commit k-takata/Onigmo@9c13de8d0684ebde97e3709d7693997c81ca374b was insufficient. https://github.com/k-takata/Onigmo/commit/1de602ddff140d91419e3f86dd35c81d7bd2d8e7 --- regcomp.c | 4 ++-- test/ruby/test_regexp.rb | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/regcomp.c b/regcomp.c index 664806f085ee0e..3b738b1a673292 100644 --- a/regcomp.c +++ b/regcomp.c @@ -4309,8 +4309,6 @@ set_bm_skip(UChar* s, UChar* end, regex_t* reg, len = end - s; if (len < ONIG_CHAR_TABLE_SIZE) { - for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = (UChar )(len + 1); - if (ignore_case) { for (i = 0; i < len; i += clen) { p = s + i; @@ -4339,6 +4337,8 @@ set_bm_skip(UChar* s, UChar* end, regex_t* reg, } len = end - s; + for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) + skip[i] = (UChar )(len + 1); n = 0; for (i = 0; i < len; i += clen) { p = s + i; diff --git a/test/ruby/test_regexp.rb b/test/ruby/test_regexp.rb index b69c148305f37e..2d7a67dd549c61 100644 --- a/test/ruby/test_regexp.rb +++ b/test/ruby/test_regexp.rb @@ -1743,6 +1743,10 @@ def test_conditional_expression assert_raise(RegexpError, bug12418){ Regexp.new('(0?0|(?(5)||)|(?(5)||))?') } end + def test_quick_search + assert_match_at('(?i) *TOOKY', 'Mozilla/5.0 (Linux; Android 4.0.3; TOOKY', [[34, 40]]) # Issue #120 + end + def test_ss_in_look_behind assert_match_at("(?i:ss)", "ss", [[0, 2]]) assert_match_at("(?i:ss)", "Ss", [[0, 2]]) From ba74fcd6e1079dd4c8482c56675ffa9d20fed7ac Mon Sep 17 00:00:00 2001 From: "K.Takata" Date: Tue, 29 Jan 2019 18:59:19 +0900 Subject: [PATCH 7/7] Remove old code for BMH search Remove the code for Boyer-Moore-Horspool search. Now we are using Sunday's quick search. https://github.com/k-takata/Onigmo/commit/3d9072419a1578b742a422412d004fd8a54209fd --- regcomp.c | 84 --------------------- regexec.c | 214 ------------------------------------------------------ regint.h | 1 - 3 files changed, 299 deletions(-) diff --git a/regcomp.c b/regcomp.c index 3b738b1a673292..0ecf162556b777 100644 --- a/regcomp.c +++ b/regcomp.c @@ -4213,89 +4213,6 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) return r; } -#ifndef USE_SUNDAY_QUICK_SEARCH -/* set skip map for Boyer-Moore search */ -static int -set_bm_skip(UChar* s, UChar* end, regex_t* reg, - UChar skip[], int** int_skip, int ignore_case) -{ - OnigDistance i, len; - int clen, flen, n, j, k; - UChar *p, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; - OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM]; - OnigEncoding enc = reg->enc; - - len = end - s; - if (len < ONIG_CHAR_TABLE_SIZE) { - for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = (UChar )len; - - n = 0; - for (i = 0; i < len - 1; i += clen) { - p = s + i; - if (ignore_case) - n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, - p, end, items); - clen = enclen(enc, p, end); - if (p + clen > end) - clen = (int )(end - p); - - for (j = 0; j < n; j++) { - if ((items[j].code_len != 1) || (items[j].byte_len != clen)) - return 1; /* different length isn't supported. */ - flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]); - if (flen != clen) - return 1; /* different length isn't supported. */ - } - for (j = 0; j < clen; j++) { - skip[s[i + j]] = (UChar )(len - 1 - i - j); - for (k = 0; k < n; k++) { - skip[buf[k][j]] = (UChar )(len - 1 - i - j); - } - } - } - } - else { -# if OPT_EXACT_MAXLEN < ONIG_CHAR_TABLE_SIZE - /* This should not happen. */ - return ONIGERR_TYPE_BUG; -# else - if (IS_NULL(*int_skip)) { - *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE); - if (IS_NULL(*int_skip)) return ONIGERR_MEMORY; - } - for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = (int )len; - - n = 0; - for (i = 0; i < len - 1; i += clen) { - p = s + i; - if (ignore_case) - n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, - p, end, items); - clen = enclen(enc, p, end); - if (p + clen > end) - clen = (int )(end - p); - - for (j = 0; j < n; j++) { - if ((items[j].code_len != 1) || (items[j].byte_len != clen)) - return 1; /* different length isn't supported. */ - flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]); - if (flen != clen) - return 1; /* different length isn't supported. */ - } - for (j = 0; j < clen; j++) { - (*int_skip)[s[i + j]] = (int )(len - 1 - i - j); - for (k = 0; k < n; k++) { - (*int_skip)[buf[k][j]] = (int )(len - 1 - i - j); - } - } - } -# endif - } - return 0; -} - -#else /* USE_SUNDAY_QUICK_SEARCH */ - /* set skip map for Sunday's quick search */ static int set_bm_skip(UChar* s, UChar* end, regex_t* reg, @@ -4397,7 +4314,6 @@ set_bm_skip(UChar* s, UChar* end, regex_t* reg, } return (int)len; } -#endif /* USE_SUNDAY_QUICK_SEARCH */ typedef struct { OnigDistance min; /* min byte length */ diff --git a/regexec.c b/regexec.c index e5e5f948a97bb1..eec3e236631805 100644 --- a/regexec.c +++ b/regexec.c @@ -4377,219 +4377,6 @@ slow_search_backward_ic(OnigEncoding enc, int case_fold_flag, return (UChar* )NULL; } -#ifndef USE_SUNDAY_QUICK_SEARCH -/* Boyer-Moore-Horspool search applied to a multibyte string */ -static UChar* -bm_search_notrev(regex_t* reg, const UChar* target, const UChar* target_end, - const UChar* text, const UChar* text_end, - const UChar* text_range) -{ - const UChar *s, *se, *t, *p, *end; - const UChar *tail; - ptrdiff_t skip, tlen1; - -# ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "bm_search_notrev: text: %"PRIuPTR" (%p), text_end: %"PRIuPTR" (%p), text_range: %"PRIuPTR" (%p)\n", - (uintptr_t )text, text, (uintptr_t )text_end, text_end, (uintptr_t )text_range, text_range); -# endif - - tail = target_end - 1; - tlen1 = tail - target; - end = text_range; - if (end + tlen1 > text_end) - end = text_end - tlen1; - - s = text; - - if (IS_NULL(reg->int_map)) { - while (s < end) { - p = se = s + tlen1; - t = tail; - while (*p == *t) { - if (t == target) return (UChar* )s; - p--; t--; - } - skip = reg->map[*se]; - t = s; - do { - s += enclen(reg->enc, s, end); - } while ((s - t) < skip && s < end); - } - } - else { -# if OPT_EXACT_MAXLEN >= ONIG_CHAR_TABLE_SIZE - while (s < end) { - p = se = s + tlen1; - t = tail; - while (*p == *t) { - if (t == target) return (UChar* )s; - p--; t--; - } - skip = reg->int_map[*se]; - t = s; - do { - s += enclen(reg->enc, s, end); - } while ((s - t) < skip && s < end); - } -# endif - } - - return (UChar* )NULL; -} - -/* Boyer-Moore-Horspool search */ -static UChar* -bm_search(regex_t* reg, const UChar* target, const UChar* target_end, - const UChar* text, const UChar* text_end, const UChar* text_range) -{ - const UChar *s, *t, *p, *end; - const UChar *tail; - -# ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "bm_search: text: %"PRIuPTR" (%p), text_end: %"PRIuPTR" (%p), text_range: %"PRIuPTR" (%p)\n", - (uintptr_t )text, text, (uintptr_t )text_end, text_end, (uintptr_t )text_range, text_range); -# endif - - end = text_range + (target_end - target) - 1; - if (end > text_end) - end = text_end; - - tail = target_end - 1; - s = text + (target_end - target) - 1; - if (IS_NULL(reg->int_map)) { - while (s < end) { - p = s; - t = tail; -# ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "bm_search_loop: pos: %"PRIdPTR" %s\n", - (intptr_t )(s - text), s); -# endif - while (*p == *t) { - if (t == target) return (UChar* )p; - p--; t--; - } - s += reg->map[*s]; - } - } - else { /* see int_map[] */ -# if OPT_EXACT_MAXLEN >= ONIG_CHAR_TABLE_SIZE - while (s < end) { - p = s; - t = tail; - while (*p == *t) { - if (t == target) return (UChar* )p; - p--; t--; - } - s += reg->int_map[*s]; - } -# endif - } - return (UChar* )NULL; -} - -/* Boyer-Moore-Horspool search applied to a multibyte string (ignore case) */ -static UChar* -bm_search_notrev_ic(regex_t* reg, const UChar* target, const UChar* target_end, - const UChar* text, const UChar* text_end, - const UChar* text_range) -{ - const UChar *s, *se, *t, *end; - const UChar *tail; - ptrdiff_t skip, tlen1; - OnigEncoding enc = reg->enc; - int case_fold_flag = reg->case_fold_flag; - -# ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "bm_search_notrev_ic: text: %d (%p), text_end: %d (%p), text_range: %d (%p)\n", - (int )text, text, (int )text_end, text_end, (int )text_range, text_range); -# endif - - tail = target_end - 1; - tlen1 = tail - target; - end = text_range; - if (end + tlen1 > text_end) - end = text_end - tlen1; - - s = text; - - if (IS_NULL(reg->int_map)) { - while (s < end) { - se = s + tlen1; - if (str_lower_case_match(enc, case_fold_flag, target, target_end, - s, se + 1)) - return (UChar* )s; - skip = reg->map[*se]; - t = s; - do { - s += enclen(reg->enc, s, end); - } while ((s - t) < skip && s < end); - } - } - else { -# if OPT_EXACT_MAXLEN >= ONIG_CHAR_TABLE_SIZE - while (s < end) { - se = s + tlen1; - if (str_lower_case_match(enc, case_fold_flag, target, target_end, - s, se + 1)) - return (UChar* )s; - skip = reg->int_map[*se]; - t = s; - do { - s += enclen(reg->enc, s, end); - } while ((s - t) < skip && s < end); - } -# endif - } - - return (UChar* )NULL; -} - -/* Boyer-Moore-Horspool search (ignore case) */ -static UChar* -bm_search_ic(regex_t* reg, const UChar* target, const UChar* target_end, - const UChar* text, const UChar* text_end, const UChar* text_range) -{ - const UChar *s, *p, *end; - const UChar *tail; - OnigEncoding enc = reg->enc; - int case_fold_flag = reg->case_fold_flag; - -# ifdef ONIG_DEBUG_SEARCH - fprintf(stderr, "bm_search_ic: text: %d (%p), text_end: %d (%p), text_range: %d (%p)\n", - (int )text, text, (int )text_end, text_end, (int )text_range, text_range); -# endif - - end = text_range + (target_end - target) - 1; - if (end > text_end) - end = text_end; - - tail = target_end - 1; - s = text + (target_end - target) - 1; - if (IS_NULL(reg->int_map)) { - while (s < end) { - p = s - (target_end - target) + 1; - if (str_lower_case_match(enc, case_fold_flag, target, target_end, - p, s + 1)) - return (UChar* )p; - s += reg->map[*s]; - } - } - else { /* see int_map[] */ -# if OPT_EXACT_MAXLEN >= ONIG_CHAR_TABLE_SIZE - while (s < end) { - p = s - (target_end - target) + 1; - if (str_lower_case_match(enc, case_fold_flag, target, target_end, - p, s + 1)) - return (UChar* )p; - s += reg->int_map[*s]; - } -# endif - } - return (UChar* )NULL; -} - -#else /* USE_SUNDAY_QUICK_SEARCH */ - /* Sunday's quick search applied to a multibyte string */ static UChar* bm_search_notrev(regex_t* reg, const UChar* target, const UChar* target_end, @@ -4808,7 +4595,6 @@ bm_search_ic(regex_t* reg, const UChar* target, const UChar* target_end, } return (UChar* )NULL; } -#endif /* USE_SUNDAY_QUICK_SEARCH */ #ifdef USE_INT_MAP_BACKWARD static int diff --git a/regint.h b/regint.h index 75abfba235790c..9924e5f62ab5f3 100644 --- a/regint.h +++ b/regint.h @@ -86,7 +86,6 @@ /* #define USE_OP_PUSH_OR_JUMP_EXACT */ #define USE_QTFR_PEEK_NEXT #define USE_ST_LIBRARY -#define USE_SUNDAY_QUICK_SEARCH #define INIT_MATCH_STACK_SIZE 160 #define DEFAULT_MATCH_STACK_LIMIT_SIZE 0 /* unlimited */