Skip to content

Commit a98c8bf

Browse files
authored
MONGOCRYPT-755 Implement StrEncode (#928)
1 parent 90476d5 commit a98c8bf

11 files changed

+1389
-0
lines changed

CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,8 @@ set (MONGOCRYPT_SOURCES
120120
src/mc-range-encoding.c
121121
src/mc-rangeopts.c
122122
src/mc-reader.c
123+
src/mc-str-encode-string-sets.c
124+
src/mc-text-search-str-encode.c
123125
src/mc-tokens.c
124126
src/mc-writer.c
125127
src/mongocrypt-binary.c
@@ -474,6 +476,7 @@ set (TEST_MONGOCRYPT_SOURCES
474476
test/test-mc-range-mincover.c
475477
test/test-mc-rangeopts.c
476478
test/test-mc-reader.c
479+
test/test-mc-text-search-str-encode.c
477480
test/test-mc-tokens.c
478481
test/test-mc-range-encoding.c
479482
test/test-mc-writer.c

src/mc-fle2-encryption-placeholder-private.h

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,61 @@ bool mc_FLE2RangeInsertSpec_parse(mc_FLE2RangeInsertSpec_t *out,
119119
bool use_range_v2,
120120
mongocrypt_status_t *status);
121121

122+
// Note: For the substring/suffix/prefix insert specs, all lengths are in terms of number of UTF-8 codepoints, not
123+
// number of bytes.
124+
// Parameters for substring indexing. Every field is a length counted in codepoints, not bytes.
typedef struct {
125+
// mlen is the maximum string length that can be indexed.
126+
uint32_t mlen;
127+
// lb is the lower bound on the length of substrings to be indexed.
128+
uint32_t lb;
129+
// ub is the upper bound on the length of substrings to be indexed.
130+
uint32_t ub;
131+
} mc_FLE2SubstringInsertSpec_t;
132+
133+
// Parameters for suffix indexing. Both fields are lengths counted in codepoints, not bytes.
typedef struct {
134+
// lb is the lower bound on the length of suffixes to be indexed.
135+
uint32_t lb;
136+
// ub is the upper bound on the length of suffixes to be indexed.
137+
uint32_t ub;
138+
} mc_FLE2SuffixInsertSpec_t;
139+
140+
// Parameters for prefix indexing. Both fields are lengths counted in codepoints, not bytes.
typedef struct {
141+
// lb is the lower bound on the length of prefixes to be indexed.
142+
uint32_t lb;
143+
// ub is the upper bound on the length of prefixes to be indexed.
144+
uint32_t ub;
145+
} mc_FLE2PrefixInsertSpec_t;
146+
147+
// Insert spec for a text-search value: the string to encrypt, up to three optional index specs
// (substring, suffix, prefix), and the two folding flags.
typedef struct {
148+
// v is the value to encrypt. Use len for its extent.
// NOTE(review): NUL-termination and ownership of v are not visible in this header - confirm.
149+
const char *v;
150+
// len is the length of v in bytes (not in codepoints).
151+
uint32_t len;
152+
153+
// substr is the spec for substring indexing.
// NOTE(review): set presumably tells whether value was supplied - confirm against the parser.
154+
struct {
155+
mc_FLE2SubstringInsertSpec_t value;
156+
bool set;
157+
} substr;
158+
159+
// suffix is the spec for suffix indexing; same value/set layout as substr.
160+
struct {
161+
mc_FLE2SuffixInsertSpec_t value;
162+
bool set;
163+
} suffix;
164+
165+
// prefix is the spec for prefix indexing; same value/set layout as substr.
166+
struct {
167+
mc_FLE2PrefixInsertSpec_t value;
168+
bool set;
169+
} prefix;
170+
171+
// casef indicates whether case folding is enabled.
172+
bool casef;
173+
// diacf indicates whether diacritic folding is enabled.
174+
bool diacf;
175+
} mc_FLE2TextSearchInsertSpec_t;
176+
122177
/** FLE2EncryptionPlaceholder implements Encryption BinData (subtype 6)
123178
* sub-subtype 0, the intent-to-encrypt mapping. Contains a value to encrypt and
124179
* a description of how it should be encrypted.
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
/*
2+
* Copyright 2024-present MongoDB, Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#ifndef MONGOCRYPT_STR_ENCODE_STRING_SETS_PRIVATE_H
18+
#define MONGOCRYPT_STR_ENCODE_STRING_SETS_PRIVATE_H
19+
20+
#include "mongocrypt-buffer-private.h"
21+
#include "mongocrypt.h"
22+
23+
// Represents a valid Unicode string with the bad character 0xFF appended to the end. This is the base string on
24+
// which substring trees are built. Stores all the valid code points in the string, plus one code point for 0xFF.
25+
// Exposed for testing.
26+
typedef struct {
27+
// NOTE(review): presumably holds the copied string bytes followed by the 0xFF byte - confirm in the .c file.
_mongocrypt_buffer_t buf;
28+
// NOTE(review): presumably the byte offset into buf of each code point - confirm in the .c file.
uint32_t *codepoint_offsets;
29+
// NOTE(review): presumably the number of code points, counting the trailing 0xFF as one - confirm.
uint32_t codepoint_len;
30+
} mc_utf8_string_with_bad_char_t;
31+
32+
// Create a new string by copying buf into its data and appending the bad character.
// NOTE(review): len is presumably the byte length of buf, and the result is presumably released with
// mc_utf8_string_with_bad_char_destroy - confirm both.
33+
mc_utf8_string_with_bad_char_t *mc_utf8_string_with_bad_char_from_buffer(const char *buf, uint32_t len);
34+
35+
// Destroy utf8. NOTE(review): whether NULL is accepted is not visible in this header - confirm.
void mc_utf8_string_with_bad_char_destroy(mc_utf8_string_with_bad_char_t *utf8);
36+
37+
// Set of affixes of a shared base string. Does not do any duplicate prevention.
// Opaque here: only the struct tag is declared in this header.
38+
typedef struct _mc_affix_set_t mc_affix_set_t;
39+
40+
// Create an affix set from the base string and the number of entries (which must be known a priori).
// NOTE(review): base_string is taken by const pointer, so it presumably must outlive the set - confirm.
41+
mc_affix_set_t *mc_affix_set_new(const mc_utf8_string_with_bad_char_t *base_string, uint32_t n_indices);
42+
43+
// Destroy an affix set. NOTE(review): whether NULL is accepted is not visible in this header - confirm.
void mc_affix_set_destroy(mc_affix_set_t *set);
44+
45+
// Insert an affix into the set. base_start_idx and base_end_idx are codepoint indices (not byte offsets), and
46+
// base_end_idx is exclusive. Returns true if inserted, false otherwise.
47+
bool mc_affix_set_insert(mc_affix_set_t *set, uint32_t base_start_idx, uint32_t base_end_idx);
48+
49+
// Insert the base string into the set count times. Treated as a special case, since this is the only affix that
50+
// will appear multiple times. Returns true if inserted, false otherwise.
51+
bool mc_affix_set_insert_base_string(mc_affix_set_t *set, uint32_t count);
52+
53+
// Iterator over an affix set. Set it up with mc_affix_set_iter_init and advance it with mc_affix_set_iter_next.
54+
typedef struct {
55+
// The set being iterated.
mc_affix_set_t *set;
56+
// NOTE(review): presumably the index of the next entry to return - confirm in the .c file.
uint32_t cur_idx;
57+
} mc_affix_set_iter_t;
58+
59+
// Point the iterator it at the first affix of the given set.
60+
void mc_affix_set_iter_init(mc_affix_set_iter_t *it, mc_affix_set_t *set);
61+
62+
// Get the next affix (via str), its length in bytes (via byte_len), and its count (via count). Returns false if the
63+
// set does not have a next element, true otherwise. NOTE(review): NULL out-pointers may or may not be accepted.
64+
bool mc_affix_set_iter_next(mc_affix_set_iter_t *it, const char **str, uint32_t *byte_len, uint32_t *count);
65+
66+
// Set of substrings of a shared base string. Prevents duplicates.
// Opaque here: only the struct tag is declared in this header.
67+
typedef struct _mc_substring_set_t mc_substring_set_t;
68+
69+
// Create a substring set over base_string.
// NOTE(review): base_string is taken by const pointer, so it presumably must outlive the set - confirm.
mc_substring_set_t *mc_substring_set_new(const mc_utf8_string_with_bad_char_t *base_string);
70+
71+
// Destroy a substring set. NOTE(review): whether NULL is accepted is not visible in this header - confirm.
void mc_substring_set_destroy(mc_substring_set_t *set);
72+
73+
// Insert the base string into the set count times. Treated as a special case, since this is the only substring that
74+
// will appear multiple times. Always inserts successfully, hence the void return.
// NOTE(review): the function name says "fake string" while this comment says "base string" - confirm they match.
75+
void mc_substring_set_increment_fake_string(mc_substring_set_t *set, uint32_t count);
76+
77+
// Insert a substring into the set. base_start_idx and base_end_idx are codepoint indices (not byte offsets), and
78+
// base_end_idx is exclusive. Returns true if inserted, false otherwise.
79+
bool mc_substring_set_insert(mc_substring_set_t *set, uint32_t base_start_idx, uint32_t base_end_idx);
80+
81+
// Iterator over a substring set. Set it up with mc_substring_set_iter_init and advance it with
// mc_substring_set_iter_next.
82+
typedef struct {
83+
// The set being iterated.
mc_substring_set_t *set;
84+
// Untyped position pointer; the node type it refers to is not exposed in this header.
void *cur_node;
85+
// NOTE(review): presumably an index that, together with cur_node, locates the next element - confirm.
uint32_t cur_idx;
86+
} mc_substring_set_iter_t;
87+
88+
// Point the iterator it at the first substring of the given set.
89+
void mc_substring_set_iter_init(mc_substring_set_iter_t *it, mc_substring_set_t *set);
90+
91+
// Get the next substring (via str), its length in bytes (via byte_len), and its count (via count). Returns false if
92+
// the set does not have a next element, true otherwise. NOTE(review): NULL out-pointers may or may not be accepted.
93+
bool mc_substring_set_iter_next(mc_substring_set_iter_t *it, const char **str, uint32_t *byte_len, uint32_t *count);
94+
95+
#endif

0 commit comments

Comments
 (0)