2424
2525// 16MiB - maximum length in bytes of a string to be encoded.
2626#define MAX_ENCODE_BYTE_LEN 16777216
27+ // Number of bytes which are added to the base string before encryption.
28+ #define OVERHEAD_BYTES 5
2729
2830static mc_affix_set_t * generate_prefix_or_suffix_tree (const mc_utf8_string_with_bad_char_t * base_str ,
29- uint32_t unfolded_codepoint_len ,
31+ uint32_t unfolded_byte_len ,
3032 uint32_t lb ,
3133 uint32_t ub ,
3234 bool is_prefix ) {
3335 BSON_ASSERT_PARAM (base_str );
34- // 16 * ceil(unfolded codepoint len / 16)
35- uint32_t cbclen = 16 * (uint32_t )((unfolded_codepoint_len + 15 ) / 16 );
36- if (cbclen < lb ) {
36+ // We encrypt (unfolded string + 5 bytes of extra BSON info) with a 16-byte block cipher.
37+ uint32_t encrypted_len = 16 * (uint32_t )((unfolded_byte_len + OVERHEAD_BYTES + 15 ) / 16 );
38+ // Max len of a string that has this encrypted len.
39+ uint32_t padded_len = encrypted_len - OVERHEAD_BYTES ;
40+ if (padded_len < lb ) {
3741 // No valid substrings, return empty tree
3842 return NULL ;
3943 }
4044
4145 // Total number of substrings
42- uint32_t msize = BSON_MIN (cbclen , ub ) - lb + 1 ;
46+ uint32_t msize = BSON_MIN (padded_len , ub ) - lb + 1 ;
4347 uint32_t folded_codepoint_len = base_str -> codepoint_len - 1 ; // remove one codepoint for 0xFF
4448 uint32_t real_max_len = BSON_MIN (folded_codepoint_len , ub );
4549 // Number of actual substrings, excluding padding
@@ -67,19 +71,19 @@ static mc_affix_set_t *generate_prefix_or_suffix_tree(const mc_utf8_string_with_
6771}
6872
6973static mc_affix_set_t * generate_suffix_tree (const mc_utf8_string_with_bad_char_t * base_str ,
70- uint32_t unfolded_codepoint_len ,
74+ uint32_t unfolded_byte_len ,
7175 const mc_FLE2SuffixInsertSpec_t * spec ) {
7276 BSON_ASSERT_PARAM (base_str );
7377 BSON_ASSERT_PARAM (spec );
74- return generate_prefix_or_suffix_tree (base_str , unfolded_codepoint_len , spec -> lb , spec -> ub , false);
78+ return generate_prefix_or_suffix_tree (base_str , unfolded_byte_len , spec -> lb , spec -> ub , false);
7579}
7680
7781static mc_affix_set_t * generate_prefix_tree (const mc_utf8_string_with_bad_char_t * base_str ,
78- uint32_t unfolded_codepoint_len ,
82+ uint32_t unfolded_byte_len ,
7983 const mc_FLE2PrefixInsertSpec_t * spec ) {
8084 BSON_ASSERT_PARAM (base_str );
8185 BSON_ASSERT_PARAM (spec );
82- return generate_prefix_or_suffix_tree (base_str , unfolded_codepoint_len , spec -> lb , spec -> ub , true);
86+ return generate_prefix_or_suffix_tree (base_str , unfolded_byte_len , spec -> lb , spec -> ub , true);
8387}
8488
8589static uint32_t calc_number_of_substrings (uint32_t strlen , uint32_t lb , uint32_t ub ) {
@@ -97,13 +101,15 @@ static uint32_t calc_number_of_substrings(uint32_t strlen, uint32_t lb, uint32_t
97101}
98102
99103static mc_substring_set_t * generate_substring_tree (const mc_utf8_string_with_bad_char_t * base_str ,
100- uint32_t unfolded_codepoint_len ,
104+ uint32_t unfolded_byte_len ,
101105 const mc_FLE2SubstringInsertSpec_t * spec ) {
102106 BSON_ASSERT_PARAM (base_str );
103107 BSON_ASSERT_PARAM (spec );
104- // 16 * ceil(unfolded len / 16)
105- uint32_t cbclen = 16 * (uint32_t )((unfolded_codepoint_len + 15 ) / 16 );
106- if (unfolded_codepoint_len > spec -> mlen || cbclen < spec -> lb ) {
108+ // We encrypt (unfolded string + 5 bytes of extra BSON info) with a 16-byte block cipher.
109+ uint32_t encrypted_len = 16 * (uint32_t )((unfolded_byte_len + OVERHEAD_BYTES + 15 ) / 16 );
110+ // Max len of a string that has this encrypted len.
111+ uint32_t padded_len = encrypted_len - OVERHEAD_BYTES ;
112+ if (padded_len < spec -> lb ) {
107113 // No valid substrings, return empty tree
108114 return NULL ;
109115 }
@@ -112,30 +118,30 @@ static mc_substring_set_t *generate_substring_tree(const mc_utf8_string_with_bad
112118 // justifies why that calculation and this calculation are equivalent.
113119 // At this point, it is established that:
114120 // beta <= mlen
115- // lb <= cbclen
121+ // lb <= padded_len
116122 // lb <= ub <= mlen
117123 //
118124 // So, the following formula for msize in the OST paper:
119125 // maxkgram_1 = sum_(j=lb, ub, (mlen - j + 1))
120- // maxkgram_2 = sum_(j=lb, min(ub, cbclen ), (cbclen - j + 1))
126+ // maxkgram_2 = sum_(j=lb, min(ub, padded_len ), (padded_len - j + 1))
121127 // msize = min(maxkgram_1, maxkgram_2)
122128 // can be simplified to:
123- // msize = sum_(j=lb, min(ub, cbclen ), (min(mlen, cbclen ) - j + 1))
129+ // msize = sum_(j=lb, min(ub, padded_len ), (min(mlen, padded_len ) - j + 1))
124130 //
125- // because if cbclen <= ub, then it follows that cbclen <= ub <= mlen, and so
131+ // because if padded_len <= ub, then it follows that padded_len <= ub <= mlen, and so
126132 // maxkgram_1 = sum_(j=lb, ub, (mlen - j + 1)) # as above
127- // maxkgram_2 = sum_(j=lb, cbclen , (cbclen - j + 1)) # less or equal to maxkgram_1
133+ // maxkgram_2 = sum_(j=lb, padded_len , (padded_len - j + 1)) # less or equal to maxkgram_1
128134 // msize = maxkgram_2
129- // and if cbclen > ub, then it follows that:
135+ // and if padded_len > ub, then it follows that:
130136 // maxkgram_1 = sum_(j=lb, ub, (mlen - j + 1)) # as above
131- // maxkgram_2 = sum_(j=lb, ub, (cbclen - j + 1)) # same sum bounds as maxkgram_1
132- // msize = sum_(j=lb, ub, (min(mlen, cbclen ) - j + 1))
137+ // maxkgram_2 = sum_(j=lb, ub, (padded_len - j + 1)) # same sum bounds as maxkgram_1
138+ // msize = sum_(j=lb, ub, (min(mlen, padded_len ) - j + 1))
133139 // in both cases, msize can be rewritten as:
134- // msize = sum_(j=lb, min(ub, cbclen ), (min(mlen, cbclen ) - j + 1))
140+ // msize = sum_(j=lb, min(ub, padded_len ), (min(mlen, padded_len ) - j + 1))
135141
136142 uint32_t folded_codepoint_len = base_str -> codepoint_len - 1 ;
137- // If mlen < cbclen , we only need to pad to mlen
138- uint32_t padded_len = BSON_MIN (spec -> mlen , cbclen );
143+ // If mlen < padded_len , we only need to pad to mlen
144+ padded_len = BSON_MIN (spec -> mlen , padded_len );
139145 // Total number of substrings -- i.e. the number of valid substrings IF the string spanned the full padded length
140146 uint32_t msize = calc_number_of_substrings (padded_len , spec -> lb , spec -> ub );
141147 uint32_t n_real_substrings = 0 ;
@@ -185,11 +191,6 @@ mc_str_encode_sets_t *mc_text_search_str_encode(const mc_FLE2TextSearchInsertSpe
185191 CLIENT_ERR ("StrEncode: String passed in was not valid UTF-8" );
186192 return NULL ;
187193 }
188- uint32_t unfolded_codepoint_len = mc_get_utf8_codepoint_length (spec -> v , spec -> len );
189- if (unfolded_codepoint_len == 0 ) {
190- // Empty string: We set unfolded length to 1 so that we generate fake tokens.
191- unfolded_codepoint_len = 1 ;
192- }
193194
194195 mc_utf8_string_with_bad_char_t * base_string ;
195196 if (spec -> casef || spec -> diacf ) {
@@ -213,12 +214,13 @@ mc_str_encode_sets_t *mc_text_search_str_encode(const mc_FLE2TextSearchInsertSpe
213214 // Base string is the folded string plus the 0xFF character
214215 sets -> base_string = base_string ;
215216 if (spec -> suffix .set ) {
216- sets -> suffix_set = generate_suffix_tree (sets -> base_string , unfolded_codepoint_len , & spec -> suffix .value );
217+ sets -> suffix_set = generate_suffix_tree (sets -> base_string , spec -> len , & spec -> suffix .value );
217218 }
218219 if (spec -> prefix .set ) {
219- sets -> prefix_set = generate_prefix_tree (sets -> base_string , unfolded_codepoint_len , & spec -> prefix .value );
220+ sets -> prefix_set = generate_prefix_tree (sets -> base_string , spec -> len , & spec -> prefix .value );
220221 }
221222 if (spec -> substr .set ) {
223+ uint32_t unfolded_codepoint_len = mc_get_utf8_codepoint_length (spec -> v , spec -> len );
222224 if (unfolded_codepoint_len > spec -> substr .value .mlen ) {
223225 CLIENT_ERR ("StrEncode: String passed in was longer than the maximum length for substring indexing -- "
224226 "String len: %u, max len: %u" ,
@@ -227,7 +229,7 @@ mc_str_encode_sets_t *mc_text_search_str_encode(const mc_FLE2TextSearchInsertSpe
227229 mc_str_encode_sets_destroy (sets );
228230 return NULL ;
229231 }
230- sets -> substring_set = generate_substring_tree (sets -> base_string , unfolded_codepoint_len , & spec -> substr .value );
232+ sets -> substring_set = generate_substring_tree (sets -> base_string , spec -> len , & spec -> substr .value );
231233 }
232234 // Exact string is always equal to the base string up until the bad character
233235 _mongocrypt_buffer_from_data (& sets -> exact , sets -> base_string -> buf .data , (uint32_t )sets -> base_string -> buf .len - 1 );
0 commit comments