Skip to content

Commit d6d5a68

Browse files
committed
Split quotes by utf8 characters rather than individual char
1 parent 1b56d19 commit d6d5a68

File tree

1 file changed

+45
-42
lines changed

1 file changed

+45
-42
lines changed

conditioner.hpp

Lines changed: 45 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -1678,6 +1678,19 @@ struct LLMEmbedder : public Conditioner {
16781678
}
16791679
}
16801680

1681+
// Returns the byte length (1-4) of the UTF-8 sequence whose lead byte is `c`.
// Only the lead byte is inspected; the caller is expected to consume that many
// bytes. Continuation bytes (10xxxxxx) and invalid lead bytes (0xF8-0xFF) fall
// back to 1 so the caller always makes forward progress (resync on malformed
// input rather than looping or over-reading).
static constexpr size_t get_utf8_char_len(char c) noexcept {
    const unsigned char uc = static_cast<unsigned char>(c);
    if ((uc & 0x80) == 0x00)
        return 1;  // 0xxxxxxx: single-byte ASCII
    if ((uc & 0xE0) == 0xC0)
        return 2;  // 110xxxxx: 2-byte sequence
    if ((uc & 0xF0) == 0xE0)
        return 3;  // 1110xxxx: 3-byte sequence (most CJK)
    if ((uc & 0xF8) == 0xF0)
        return 4;  // 11110xxx: 4-byte sequence (emoji, etc.)
    return 1;      // continuation or invalid lead byte: advance one to resync
}
1693+
16811694
std::tuple<std::vector<int>, std::vector<float>> tokenize(
16821695
std::string text,
16831696
std::pair<int, int> attn_range,
@@ -1697,16 +1710,6 @@ struct LLMEmbedder : public Conditioner {
16971710
}
16981711
parsed_attention.emplace_back(text.substr(attn_range.second), 1.f);
16991712

1700-
// {
1701-
// std::stringstream ss;
1702-
// ss << '[';
1703-
// for (const auto& item : parsed_attention) {
1704-
// ss << "['" << item.first << "', " << item.second << "], ";
1705-
// }
1706-
// ss << ']';
1707-
// LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
1708-
// }
1709-
17101713
std::vector<int> tokens;
17111714
std::vector<float> weights;
17121715

@@ -1715,46 +1718,47 @@ struct LLMEmbedder : public Conditioner {
17151718
float curr_weight = item.second;
17161719

17171720
if (spell_quotes) {
1718-
std::vector<std::string> parts;
1721+
std::string buffer;
17191722
bool in_quote = false;
1720-
std::string current_part;
17211723

1722-
for (char c : curr_text) {
1723-
if (c == '"') {
1724-
if (!current_part.empty()) {
1725-
parts.push_back(current_part);
1726-
current_part.clear();
1724+
size_t i = 0;
1725+
while (i < curr_text.size()) {
1726+
// utf8 character can be 1-4 char
1727+
size_t char_len = get_utf8_char_len(curr_text[i]);
1728+
1729+
// Safety check to prevent reading past end of string
1730+
if (i + char_len > curr_text.size()) {
1731+
char_len = curr_text.size() - i;
1732+
}
1733+
std::string uchar = curr_text.substr(i, char_len);
1734+
i += char_len;
1735+
1736+
if (uchar == "\"") {
1737+
buffer += uchar;
1738+
// If we were accumulating normal text, flush it now
1739+
if (!in_quote) {
1740+
std::vector<int> part_tokens = tokenizer->tokenize(buffer, nullptr);
1741+
tokens.insert(tokens.end(), part_tokens.begin(), part_tokens.end());
1742+
weights.insert(weights.end(), part_tokens.size(), curr_weight);
1743+
buffer.clear();
17271744
}
17281745
in_quote = !in_quote;
17291746
} else {
1730-
current_part += c;
1731-
if (in_quote && current_part.size() == 1) {
1732-
parts.push_back(current_part);
1733-
current_part.clear();
1734-
}
1735-
}
1736-
}
1737-
if (!current_part.empty()) {
1738-
parts.push_back(current_part);
1739-
}
1740-
1741-
for (const auto& part : parts) {
1742-
if (part.empty())
1743-
continue;
1744-
if (part[0] == '"' && part.back() == '"') {
1745-
std::string quoted_content = part.substr(1, part.size() - 2);
1746-
for (char ch : quoted_content) {
1747-
std::string char_str(1, ch);
1748-
std::vector<int> char_tokens = tokenizer->tokenize(char_str, nullptr);
1747+
if (in_quote) {
1748+
std::vector<int> char_tokens = tokenizer->tokenize(uchar, nullptr);
17491749
tokens.insert(tokens.end(), char_tokens.begin(), char_tokens.end());
17501750
weights.insert(weights.end(), char_tokens.size(), curr_weight);
1751+
} else {
1752+
buffer += uchar;
17511753
}
1752-
} else {
1753-
std::vector<int> part_tokens = tokenizer->tokenize(part, nullptr);
1754-
tokens.insert(tokens.end(), part_tokens.begin(), part_tokens.end());
1755-
weights.insert(weights.end(), part_tokens.size(), curr_weight);
17561754
}
17571755
}
1756+
1757+
if (!buffer.empty()) {
1758+
std::vector<int> part_tokens = tokenizer->tokenize(buffer, nullptr);
1759+
tokens.insert(tokens.end(), part_tokens.begin(), part_tokens.end());
1760+
weights.insert(weights.end(), part_tokens.size(), curr_weight);
1761+
}
17581762
} else {
17591763
std::vector<int> curr_tokens = tokenizer->tokenize(curr_text, nullptr);
17601764
tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
@@ -1782,14 +1786,13 @@ struct LLMEmbedder : public Conditioner {
17821786
LOG_INFO("LongCatEditPipeline");
17831787
prompt_template_encode_start_idx = 67;
17841788
// prompt_template_encode_end_idx = 5;
1785-
int image_embed_idx = 36 + 6;
1789+
int image_embed_idx = 36 + 6;
17861790

17871791
int min_pixels = 384 * 384;
17881792
int max_pixels = 560 * 560;
17891793
std::string placeholder = "<|image_pad|>";
17901794
std::string img_prompt;
17911795

1792-
17931796
// Only one image is officially supported by the model, not sure how it handles multiple images
17941797
for (int i = 0; i < conditioner_params.ref_images.size(); i++) {
17951798
sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(*conditioner_params.ref_images[i]);

0 commit comments

Comments
 (0)