Skip to content

Commit 7c8f0dc

Browse files
committed
support longcat-image-edit
1 parent 148120b commit 7c8f0dc

File tree

1 file changed

+126
-54
lines changed

1 file changed

+126
-54
lines changed

conditioner.hpp

Lines changed: 126 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1720,7 +1720,7 @@ struct LLMEmbedder : public Conditioner {
17201720
std::string current_part;
17211721

17221722
for (char c : curr_text) {
1723-
if (c == '\'') {
1723+
if (c == '"') {
17241724
if (!current_part.empty()) {
17251725
parts.push_back(current_part);
17261726
current_part.clear();
@@ -1741,7 +1741,7 @@ struct LLMEmbedder : public Conditioner {
17411741
for (const auto& part : parts) {
17421742
if (part.empty())
17431743
continue;
1744-
if (part[0] == '\'' && part.back() == '\'') {
1744+
if (part[0] == '"' && part.back() == '"') {
17451745
std::string quoted_content = part.substr(1, part.size() - 2);
17461746
for (char ch : quoted_content) {
17471747
std::string char_str(1, ch);
@@ -1778,68 +1778,140 @@ struct LLMEmbedder : public Conditioner {
17781778
bool spell_quotes = false;
17791779
std::set<int> out_layers;
17801780
if (llm->enable_vision && conditioner_params.ref_images.size() > 0) {
1781-
LOG_INFO("QwenImageEditPlusPipeline");
1782-
prompt_template_encode_start_idx = 64;
1783-
int image_embed_idx = 64 + 6;
1784-
1785-
int min_pixels = 384 * 384;
1786-
int max_pixels = 560 * 560;
1787-
std::string placeholder = "<|image_pad|>";
1788-
std::string img_prompt;
1789-
1790-
for (int i = 0; i < conditioner_params.ref_images.size(); i++) {
1791-
sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(*conditioner_params.ref_images[i]);
1792-
double factor = llm->params.vision.patch_size * llm->params.vision.spatial_merge_size;
1793-
int height = image.height;
1794-
int width = image.width;
1795-
int h_bar = static_cast<int>(std::round(height / factor)) * factor;
1796-
int w_bar = static_cast<int>(std::round(width / factor)) * factor;
1797-
1798-
if (static_cast<double>(h_bar) * w_bar > max_pixels) {
1799-
double beta = std::sqrt((height * width) / static_cast<double>(max_pixels));
1800-
h_bar = std::max(static_cast<int>(factor),
1801-
static_cast<int>(std::floor(height / beta / factor)) * static_cast<int>(factor));
1802-
w_bar = std::max(static_cast<int>(factor),
1803-
static_cast<int>(std::floor(width / beta / factor)) * static_cast<int>(factor));
1804-
} else if (static_cast<double>(h_bar) * w_bar < min_pixels) {
1805-
double beta = std::sqrt(static_cast<double>(min_pixels) / (height * width));
1806-
h_bar = static_cast<int>(std::ceil(height * beta / factor)) * static_cast<int>(factor);
1807-
w_bar = static_cast<int>(std::ceil(width * beta / factor)) * static_cast<int>(factor);
1781+
if (sd_version_is_longcat(version)) {
1782+
LOG_INFO("LongCatEditPipeline");
1783+
prompt_template_encode_start_idx = 67;
1784+
// prompt_template_encode_end_idx = 5;
1785+
int image_embed_idx = 36 + 6;
1786+
1787+
int min_pixels = 384 * 384;
1788+
int max_pixels = 560 * 560;
1789+
std::string placeholder = "<|image_pad|>";
1790+
std::string img_prompt;
1791+
1792+
1793+
// Only one image is officially supported by the model; it is unclear how it handles multiple images
1794+
for (int i = 0; i < conditioner_params.ref_images.size(); i++) {
1795+
sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(*conditioner_params.ref_images[i]);
1796+
double factor = llm->params.vision.patch_size * llm->params.vision.spatial_merge_size;
1797+
int height = image.height;
1798+
int width = image.width;
1799+
int h_bar = static_cast<int>(std::round(height / factor)) * factor;
1800+
int w_bar = static_cast<int>(std::round(width / factor)) * factor;
1801+
1802+
if (static_cast<double>(h_bar) * w_bar > max_pixels) {
1803+
double beta = std::sqrt((height * width) / static_cast<double>(max_pixels));
1804+
h_bar = std::max(static_cast<int>(factor),
1805+
static_cast<int>(std::floor(height / beta / factor)) * static_cast<int>(factor));
1806+
w_bar = std::max(static_cast<int>(factor),
1807+
static_cast<int>(std::floor(width / beta / factor)) * static_cast<int>(factor));
1808+
} else if (static_cast<double>(h_bar) * w_bar < min_pixels) {
1809+
double beta = std::sqrt(static_cast<double>(min_pixels) / (height * width));
1810+
h_bar = static_cast<int>(std::ceil(height * beta / factor)) * static_cast<int>(factor);
1811+
w_bar = static_cast<int>(std::ceil(width * beta / factor)) * static_cast<int>(factor);
1812+
}
1813+
1814+
LOG_DEBUG("resize conditioner ref image %d from %dx%d to %dx%d", i, image.height, image.width, h_bar, w_bar);
1815+
1816+
sd_image_f32_t resized_image = clip_preprocess(image, w_bar, h_bar);
1817+
free(image.data);
1818+
image.data = nullptr;
1819+
1820+
ggml_tensor* image_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, resized_image.width, resized_image.height, 3, 1);
1821+
sd_image_f32_to_ggml_tensor(resized_image, image_tensor, false);
1822+
free(resized_image.data);
1823+
resized_image.data = nullptr;
1824+
1825+
ggml_tensor* image_embed = nullptr;
1826+
llm->encode_image(n_threads, image_tensor, &image_embed, work_ctx);
1827+
image_embeds.emplace_back(image_embed_idx, image_embed);
1828+
image_embed_idx += 1 + image_embed->ne[1] + 6;
1829+
1830+
img_prompt += "<|vision_start|>";
1831+
int64_t num_image_tokens = image_embed->ne[1];
1832+
img_prompt.reserve(num_image_tokens * placeholder.size());
1833+
for (int j = 0; j < num_image_tokens; j++) {
1834+
img_prompt += placeholder;
1835+
}
1836+
img_prompt += "<|vision_end|>";
18081837
}
18091838

1810-
LOG_DEBUG("resize conditioner ref image %d from %dx%d to %dx%d", i, image.height, image.width, h_bar, w_bar);
1839+
max_length = 512;
1840+
pad = true;
1841+
spell_quotes = true;
1842+
prompt = "<|im_start|>system\nAs an image editing expert, first analyze the content and attributes of the input image(s). Then, based on the user's editing instructions, clearly and precisely determine how to modify the given image(s), ensuring that only the specified parts are altered and all other aspects remain consistent with the original(s).<|im_end|>\n<|im_start|>user\n";
1843+
prompt += img_prompt;
1844+
1845+
prompt_attn_range.first = static_cast<int>(prompt.size());
1846+
prompt += conditioner_params.text;
1847+
prompt_attn_range.second = static_cast<int>(prompt.size());
1848+
1849+
prompt += "<|im_end|>\n<|im_start|>assistant\n";
1850+
1851+
} else {
1852+
LOG_INFO("QwenImageEditPlusPipeline");
1853+
prompt_template_encode_start_idx = 64;
1854+
int image_embed_idx = 64 + 6;
1855+
1856+
int min_pixels = 384 * 384;
1857+
int max_pixels = 560 * 560;
1858+
std::string placeholder = "<|image_pad|>";
1859+
std::string img_prompt;
1860+
1861+
for (int i = 0; i < conditioner_params.ref_images.size(); i++) {
1862+
sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(*conditioner_params.ref_images[i]);
1863+
double factor = llm->params.vision.patch_size * llm->params.vision.spatial_merge_size;
1864+
int height = image.height;
1865+
int width = image.width;
1866+
int h_bar = static_cast<int>(std::round(height / factor)) * factor;
1867+
int w_bar = static_cast<int>(std::round(width / factor)) * factor;
1868+
1869+
if (static_cast<double>(h_bar) * w_bar > max_pixels) {
1870+
double beta = std::sqrt((height * width) / static_cast<double>(max_pixels));
1871+
h_bar = std::max(static_cast<int>(factor),
1872+
static_cast<int>(std::floor(height / beta / factor)) * static_cast<int>(factor));
1873+
w_bar = std::max(static_cast<int>(factor),
1874+
static_cast<int>(std::floor(width / beta / factor)) * static_cast<int>(factor));
1875+
} else if (static_cast<double>(h_bar) * w_bar < min_pixels) {
1876+
double beta = std::sqrt(static_cast<double>(min_pixels) / (height * width));
1877+
h_bar = static_cast<int>(std::ceil(height * beta / factor)) * static_cast<int>(factor);
1878+
w_bar = static_cast<int>(std::ceil(width * beta / factor)) * static_cast<int>(factor);
1879+
}
1880+
1881+
LOG_DEBUG("resize conditioner ref image %d from %dx%d to %dx%d", i, image.height, image.width, h_bar, w_bar);
18111882

1812-
sd_image_f32_t resized_image = clip_preprocess(image, w_bar, h_bar);
1813-
free(image.data);
1814-
image.data = nullptr;
1883+
sd_image_f32_t resized_image = clip_preprocess(image, w_bar, h_bar);
1884+
free(image.data);
1885+
image.data = nullptr;
18151886

1816-
ggml_tensor* image_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, resized_image.width, resized_image.height, 3, 1);
1817-
sd_image_f32_to_ggml_tensor(resized_image, image_tensor, false);
1818-
free(resized_image.data);
1819-
resized_image.data = nullptr;
1887+
ggml_tensor* image_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, resized_image.width, resized_image.height, 3, 1);
1888+
sd_image_f32_to_ggml_tensor(resized_image, image_tensor, false);
1889+
free(resized_image.data);
1890+
resized_image.data = nullptr;
18201891

1821-
ggml_tensor* image_embed = nullptr;
1822-
llm->encode_image(n_threads, image_tensor, &image_embed, work_ctx);
1823-
image_embeds.emplace_back(image_embed_idx, image_embed);
1824-
image_embed_idx += 1 + image_embed->ne[1] + 6;
1892+
ggml_tensor* image_embed = nullptr;
1893+
llm->encode_image(n_threads, image_tensor, &image_embed, work_ctx);
1894+
image_embeds.emplace_back(image_embed_idx, image_embed);
1895+
image_embed_idx += 1 + image_embed->ne[1] + 6;
18251896

1826-
img_prompt += "Picture " + std::to_string(i + 1) + ": <|vision_start|>"; // [24669, 220, index, 25, 220, 151652]
1827-
int64_t num_image_tokens = image_embed->ne[1];
1828-
img_prompt.reserve(num_image_tokens * placeholder.size());
1829-
for (int j = 0; j < num_image_tokens; j++) {
1830-
img_prompt += placeholder;
1897+
img_prompt += "Picture " + std::to_string(i + 1) + ": <|vision_start|>"; // [24669, 220, index, 25, 220, 151652]
1898+
int64_t num_image_tokens = image_embed->ne[1];
1899+
img_prompt.reserve(num_image_tokens * placeholder.size());
1900+
for (int j = 0; j < num_image_tokens; j++) {
1901+
img_prompt += placeholder;
1902+
}
1903+
img_prompt += "<|vision_end|>";
18311904
}
1832-
img_prompt += "<|vision_end|>";
1833-
}
18341905

1835-
prompt = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n";
1836-
prompt += img_prompt;
1906+
prompt = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n";
1907+
prompt += img_prompt;
18371908

1838-
prompt_attn_range.first = static_cast<int>(prompt.size());
1839-
prompt += conditioner_params.text;
1840-
prompt_attn_range.second = static_cast<int>(prompt.size());
1909+
prompt_attn_range.first = static_cast<int>(prompt.size());
1910+
prompt += conditioner_params.text;
1911+
prompt_attn_range.second = static_cast<int>(prompt.size());
18411912

1842-
prompt += "<|im_end|>\n<|im_start|>assistant\n";
1913+
prompt += "<|im_end|>\n<|im_start|>assistant\n";
1914+
}
18431915
} else if (sd_version_is_flux2(version)) {
18441916
prompt_template_encode_start_idx = 0;
18451917
out_layers = {10, 20, 30};

0 commit comments

Comments
 (0)