Refactor guidance params in lib: bundle SLG and APG settings into sd_slg_params_t / sd_apg_params_t

stduhpf · stduhpf · commit 102a9eabafb1 · 2025-02-12T01:04:40.000+01:00
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
@@ -948,11 +948,12 @@ int main(int argc, const char* argv[]) {
                           params.style_ratio,
                           params.normalize_input,
                           params.input_id_images_path.c_str(),
-                          params.skip_layers.data(),
-                          params.skip_layers.size(),
-                          params.slg_scale,
-                          params.skip_layer_start,
-                          params.skip_layer_end);
+                          sd_slg_params_t{params.skip_layers.data(),
+                                          params.skip_layers.size(),
+                                          params.slg_scale,
+                                          params.skip_layer_start,
+                                          params.skip_layer_end},
+                          sd_apg_params_t{1, 0, 0});
     } else {
         sd_image_t input_image = {(uint32_t)params.width,
                                   (uint32_t)params.height,
@@ -1016,11 +1017,12 @@ int main(int argc, const char* argv[]) {
                               params.style_ratio,
                               params.normalize_input,
                               params.input_id_images_path.c_str(),
-                              params.skip_layers.data(),
-                              params.skip_layers.size(),
-                              params.slg_scale,
-                              params.skip_layer_start,
-                              params.skip_layer_end);
+                              sd_slg_params_t{params.skip_layers.data(),
+                                              params.skip_layers.size(),
+                                              params.slg_scale,
+                                              params.skip_layer_start,
+                                              params.skip_layer_end},
+                              sd_apg_params_t{1, 0, 0});
         }
     }
 
@@ -1059,19 +1061,19 @@ int main(int argc, const char* argv[]) {
 
     std::string dummy_name, ext, lc_ext;
     bool is_jpg;
-    size_t last = params.output_path.find_last_of(".");
+    size_t last      = params.output_path.find_last_of(".");
     size_t last_path = std::min(params.output_path.find_last_of("/"),
                                 params.output_path.find_last_of("\\"));
-    if (last != std::string::npos // filename has extension
-    && (last_path == std::string::npos || last > last_path)) {
+    if (last != std::string::npos  // filename has extension
+        && (last_path == std::string::npos || last > last_path)) {
         dummy_name = params.output_path.substr(0, last);
         ext = lc_ext = params.output_path.substr(last);
         std::transform(ext.begin(), ext.end(), lc_ext.begin(), ::tolower);
         is_jpg = lc_ext == ".jpg" || lc_ext == ".jpeg" || lc_ext == ".jpe";
     } else {
         dummy_name = params.output_path;
         ext = lc_ext = "";
-        is_jpg = false;
+        is_jpg       = false;
     }
     // appending ".png" to absent or unknown extension
     if (!is_jpg && lc_ext != ".png") {
@@ -1083,7 +1085,7 @@ int main(int argc, const char* argv[]) {
             continue;
         }
         std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ext : dummy_name + ext;
-        if(is_jpg) {
+        if (is_jpg) {
             stbi_write_jpg(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel,
                            results[i].data, 90, get_image_params(params, params.seed + i).c_str());
             printf("save result JPEG image to '%s'\n", final_image_path.c_str());
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
@@ -796,11 +796,11 @@ class StableDiffusionGGML {
                         const std::vector<float>& sigmas,
                         int start_merge_step,
                         SDCondition id_cond,
-                        std::vector<int> skip_layers = {},
-                        float slg_scale              = 0,
-                        float skip_layer_start       = 0.01,
-                        float skip_layer_end         = 0.2,
-                        ggml_tensor* noise_mask      = nullptr) {
+                        sd_slg_params_t slg_params = {NULL, 0, 0, 0, 0},
+                        sd_apg_params_t apg_params = {1, 0, 0},
+                        ggml_tensor* noise_mask  = nullptr) {
+        std::vector<int> skip_layers(slg_params.skip_layers, slg_params.skip_layers + slg_params.skip_layers_count);
+
         LOG_DEBUG("Sample");
         struct ggml_init_params params;
         size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]);
@@ -823,7 +823,7 @@ class StableDiffusionGGML {
         struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, noise);
 
         bool has_unconditioned = cfg_scale != 1.0 && uncond.c_crossattn != NULL;
-        bool has_skiplayer     = slg_scale != 0.0 && skip_layers.size() > 0;
+        bool has_skiplayer     = slg_params.scale != 0.0 && skip_layers.size() > 0;
 
         // denoise wrapper
         struct ggml_tensor* out_cond   = ggml_dup_tensor(work_ctx, x);
@@ -843,13 +843,8 @@ class StableDiffusionGGML {
         }
         struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x);
 
-        // TODO do not hardcode
-        float apg_eta           = .08f;
-        float apg_momentum      = -.5f;
-        float apg_norm_treshold = 15.0f;
-
         std::vector<float> apg_momentum_buffer;
-        if (apg_momentum != 0)
+        if (apg_params.momentum != 0)
             apg_momentum_buffer.resize((size_t)ggml_nelements(denoised));
 
         auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* {
@@ -932,7 +927,7 @@ class StableDiffusionGGML {
             }
 
             int step_count         = sigmas.size();
-            bool is_skiplayer_step = has_skiplayer && step > (int)(skip_layer_start * step_count) && step < (int)(skip_layer_end * step_count);
+            bool is_skiplayer_step = has_skiplayer && step > (int)(slg_params.skip_layer_start * step_count) && step < (int)(slg_params.skip_layer_end * step_count);
             float* skip_layer_data = NULL;
             if (is_skiplayer_step) {
                 LOG_DEBUG("Skipping layers at step %d\n", step);
@@ -966,37 +961,37 @@ class StableDiffusionGGML {
             float dot              = 0;
             for (int i = 0; i < ne_elements; i++) {
                 float delta = positive_data[i] - negative_data[i];
-                if (apg_momentum != 0) {
-                    delta += apg_momentum * apg_momentum_buffer[i];
+                if (apg_params.momentum != 0) {
+                    delta += apg_params.momentum * apg_momentum_buffer[i];
                     apg_momentum_buffer[i] = delta;
                 }
-                if (apg_norm_treshold > 0) {
+                if (apg_params.norm_treshold > 0) {
                     diff_norm += delta * delta;
                 }
-                if (apg_eta != 1.0f) {
+                if (apg_params.eta != 1.0f) {
                     cond_norm_sq += positive_data[i] * positive_data[i];
                     dot += positive_data[i] * delta;
                 }
                 deltas[i] = delta;
             }
-            if (apg_norm_treshold > 0) {
+            if (apg_params.norm_treshold > 0) {
                 diff_norm        = std::sqrtf(diff_norm);
-                apg_scale_factor = std::min(1.0f, apg_norm_treshold / diff_norm);
+                apg_scale_factor = std::min(1.0f, apg_params.norm_treshold / diff_norm);
             }
-            if (apg_eta != 1.0f) {
+            if (apg_params.eta != 1.0f) {
                 dot *= apg_scale_factor;
                 // pre-normalize (avoids one square root and ne_elements extra divs)
                 dot /= cond_norm_sq;
             }
 
             for (int i = 0; i < ne_elements; i++) {
                 deltas[i] *= apg_scale_factor;
-                if (apg_eta != 1.0f) {
+                if (apg_params.eta != 1.0f) {
                     float apg_parallel   = dot * positive_data[i];
                     float apg_orthogonal = deltas[i] - apg_parallel;
 
                     // tweak deltas
-                    deltas[i] = apg_orthogonal + apg_eta * apg_parallel;
+                    deltas[i] = apg_orthogonal + apg_params.eta * apg_parallel;
                 }
             }
 
@@ -1015,7 +1010,7 @@ class StableDiffusionGGML {
                     }
                 }
                 if (is_skiplayer_step) {
-                    latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_scale;
+                    latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_params.scale;
                 }
                 // v = latent_result, eps = latent_result
                 // denoised = (v * c_out + input * c_skip) or (input + eps * c_out)
@@ -1260,11 +1255,9 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                            float style_ratio,
                            bool normalize_input,
                            std::string input_id_images_path,
-                           std::vector<int> skip_layers = {},
-                           float slg_scale              = 0,
-                           float skip_layer_start       = 0.01,
-                           float skip_layer_end         = 0.2,
-                           ggml_tensor* masked_image    = NULL) {
+                           sd_slg_params_t slg_params,
+                           sd_apg_params_t apg_params,
+                           ggml_tensor* masked_image = NULL) {
     if (seed < 0) {
         // Generally, when using the provided command line, the seed is always >0.
         // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
@@ -1516,10 +1509,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                                                      sigmas,
                                                      start_merge_step,
                                                      id_cond,
-                                                     skip_layers,
-                                                     slg_scale,
-                                                     skip_layer_start,
-                                                     skip_layer_end,
+                                                     slg_params,
+                                                     apg_params,
                                                      noise_mask);
 
         // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
@@ -1588,12 +1579,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
                     float style_ratio,
                     bool normalize_input,
                     const char* input_id_images_path_c_str,
-                    int* skip_layers         = NULL,
-                    size_t skip_layers_count = 0,
-                    float slg_scale          = 0,
-                    float skip_layer_start   = 0.01,
-                    float skip_layer_end     = 0.2) {
-    std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
+                    sd_slg_params_t slg_params,
+                    sd_apg_params_t apg_params) {
     LOG_DEBUG("txt2img %dx%d", width, height);
     if (sd_ctx == NULL) {
         return NULL;
@@ -1666,10 +1653,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
                                                style_ratio,
                                                normalize_input,
                                                input_id_images_path_c_str,
-                                               skip_layers_vec,
-                                               slg_scale,
-                                               skip_layer_start,
-                                               skip_layer_end);
+                                               slg_params,
+                                               apg_params);
 
     size_t t1 = ggml_time_ms();
 
@@ -1698,12 +1683,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
                     float style_ratio,
                     bool normalize_input,
                     const char* input_id_images_path_c_str,
-                    int* skip_layers         = NULL,
-                    size_t skip_layers_count = 0,
-                    float slg_scale          = 0,
-                    float skip_layer_start   = 0.01,
-                    float skip_layer_end     = 0.2) {
-    std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
+                    sd_slg_params_t slg_params,
+                    sd_apg_params_t apg_params) {
     LOG_DEBUG("img2img %dx%d", width, height);
     if (sd_ctx == NULL) {
         return NULL;
@@ -1844,10 +1825,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
                                                style_ratio,
                                                normalize_input,
                                                input_id_images_path_c_str,
-                                               skip_layers_vec,
-                                               slg_scale,
-                                               skip_layer_start,
-                                               skip_layer_end,
+                                               slg_params,
+                                               apg_params,
                                                masked_image);
 
     size_t t2 = ggml_time_ms();
diff --git a/stable-diffusion.h b/stable-diffusion.h
@@ -122,6 +122,20 @@ typedef struct {
     uint8_t* data;
 } sd_image_t;
 
+typedef struct {  // APG guidance settings passed by value to txt2img/img2img; {1, 0, 0} is the default used by the sampler and the CLI
+    float eta;  // weight applied to the part of the guidance delta parallel to the positive (presumably conditioned) prediction; exactly 1.0 skips the projection step
+    float momentum;  // multiplier for the delta stored from earlier denoise calls, added to the current delta; 0 means no momentum buffer is allocated
+    float norm_treshold;  // applied only when > 0: the guidance delta is scaled by min(1, norm_treshold / L2 norm of the delta)
+} sd_apg_params_t;
+
+typedef struct {  // skip-layer guidance settings passed by value to txt2img/img2img
+    int* skip_layers;  // start of an array of skip_layers_count ints (presumably layer indices — confirm); copied into a vector by the sampler, NULL is used together with a count of 0
+    size_t skip_layers_count;  // number of ints read from skip_layers; 0 disables skip-layer guidance
+    float scale;  // multiplier for the (positive - skip-layer) term added to the latent result; 0 disables skip-layer guidance
+    float skip_layer_start;  // fraction of the step count; guidance runs only on steps strictly greater than (int)(skip_layer_start * step_count)
+    float skip_layer_end;  // fraction of the step count; guidance runs only on steps strictly less than (int)(skip_layer_end * step_count)
+} sd_slg_params_t;
+
 typedef struct sd_ctx_t sd_ctx_t;
 
 SD_API sd_ctx_t* new_sd_ctx(const char* model_path,
@@ -166,11 +180,8 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
                            float style_strength,
                            bool normalize_input,
                            const char* input_id_images_path,
-                           int* skip_layers,
-                           size_t skip_layers_count,
-                           float slg_scale,
-                           float skip_layer_start,
-                           float skip_layer_end);
+                           sd_slg_params_t slg_params,
+                           sd_apg_params_t apg_params);
 
 SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
                            sd_image_t init_image,
@@ -192,11 +203,8 @@ SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
                            float style_strength,
                            bool normalize_input,
                            const char* input_id_images_path,
-                           int* skip_layers,
-                           size_t skip_layers_count,
-                           float slg_scale,
-                           float skip_layer_start,
-                           float skip_layer_end);
+                           sd_slg_params_t slg_params,
+                           sd_apg_params_t apg_params);
 
 SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
                            sd_image_t init_image,