diff --git a/CMakeLists.txt b/CMakeLists.txt
index 782a893e4..a3ec60db1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,6 +32,7 @@ option(SD_SYCL "sd: sycl backend" OFF)
 option(SD_MUSA "sd: musa backend" OFF)
 option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
 option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
+option(SD_SHOW_REMAINING_TIME "sd: show remaining and average sampling time" OFF)
 #option(SD_BUILD_SERVER "sd: build server example" ON)
 
 if(SD_CUDA)
@@ -93,6 +94,11 @@ else()
     add_library(${SD_LIB} STATIC ${SD_LIB_SOURCES})
 endif()
 
+if (SD_SHOW_REMAINING_TIME)
+    message("-- Display remaining and average sampling time")
+    add_definitions(-DSD_SHOW_REMAINING_TIME)
+endif ()
+
 if(SD_SYCL)
     message("-- Use SYCL as backend stable-diffusion")
     set(GGML_SYCL ON)
diff --git a/denoiser.hpp b/denoiser.hpp
index 66799109d..263b574e9 100644
--- a/denoiser.hpp
+++ b/denoiser.hpp
@@ -468,6 +468,20 @@ struct FluxFlowDenoiser : public Denoiser {
 
 typedef std::function<ggml_tensor*(ggml_tensor*, float, int)> denoise_cb_t;
 
+static inline void show_step(int i0, int im, int64_t* t0) {
+#ifdef SD_SHOW_REMAINING_TIME
+    int i = i0 + 1;
+    float t1 = (ggml_time_us() - *t0) / 1000000.f / i;
+    pretty_progress(i, im, t1, t1 * (im - i));
+// LOG_INFO("step %d sampling completed taking %.2fs", i, (t1 - *t0) * 1.0f / 1000000 / i);
+#else // SD_SHOW_REMAINING_TIME
+    int64_t t1 = ggml_time_us();
+    pretty_progress(i0 + 1, im, (t1 - *t0) / 1000000.f);
+// LOG_INFO("step %d sampling completed taking %.2fs", i0 + 1, (t1 - *t0) * 1.0f / 1000000);
+    *t0 = t1;
+#endif // SD_SHOW_REMAINING_TIME
+}
+
 // k diffusion reverse ODE: dx = (x - D(x;\sigma)) / \sigma dt; \sigma(t) = t
 static void sample_k_diffusion(sample_method_t method,
                                denoise_cb_t model,
@@ -477,6 +491,8 @@ static void sample_k_diffusion(sample_method_t method,
                                std::shared_ptr<RNG> rng,
                                float eta) {
     size_t steps = sigmas.size() - 1;
+    int64_t t0 = ggml_time_us();
+
     // sample_euler_ancestral
     switch (method) {
         case EULER_A: {
@@ -530,6 +546,7 @@ static void sample_k_diffusion(sample_method_t method,
                         }
                     }
                 }
+                show_step(i, steps, &t0);
             }
         } break;
         case EULER:  // Implemented without any sigma churn
@@ -563,6 +580,7 @@ static void sample_k_diffusion(sample_method_t method,
                         vec_x[j] = vec_x[j] + vec_d[j] * dt;
                     }
                 }
+                show_step(i, steps, &t0);
             }
         } break;
         case HEUN: {
@@ -613,6 +631,7 @@ static void sample_k_diffusion(sample_method_t method,
                         vec_x[j] = vec_x[j] + vec_d[j] * dt;
                     }
                 }
+                show_step(i, steps, &t0);
             }
         } break;
         case DPM2: {
@@ -664,6 +683,7 @@ static void sample_k_diffusion(sample_method_t method,
                         vec_x[j] = vec_x[j] + d2 * dt_2;
                     }
                 }
+                show_step(i, steps, &t0);
             }
         } break;
 
@@ -738,6 +758,7 @@ static void sample_k_diffusion(sample_method_t method,
                         }
                     }
                 }
+                show_step(i, steps, &t0);
             }
         } break;
         case DPMPP2M:  // DPM++ (2M) from Karras et al (2022)
@@ -777,6 +798,7 @@ static void sample_k_diffusion(sample_method_t method,
                 for (int j = 0; j < ggml_nelements(x); j++) {
                     vec_old_denoised[j] = vec_denoised[j];
                 }
+                show_step(i, steps, &t0);
             }
         } break;
         case DPMPP2Mv2:  // Modified DPM++ (2M) from https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457
@@ -820,6 +842,7 @@ static void sample_k_diffusion(sample_method_t method,
                 for (int j = 0; j < ggml_nelements(x); j++) {
                     vec_old_denoised[j] = vec_denoised[j];
                 }
+                show_step(i, steps, &t0);
             }
         } break;
         case IPNDM:  // iPNDM sampler from https://github.com/zju-pi/diff-sampler/tree/main/diff-solvers-main
@@ -895,6 +918,7 @@ static void sample_k_diffusion(sample_method_t method,
                 } else {
                     buffer_model.push_back(d_cur);
                 }
+                show_step(i, steps, &t0);
             }
         } break;
         case IPNDM_V:  // iPNDM_v sampler from https://github.com/zju-pi/diff-sampler/tree/main/diff-solvers-main
@@ -969,6 +993,7 @@ static void sample_k_diffusion(sample_method_t method,
 
                 // Prepare the next d tensor
                 d_cur = ggml_dup_tensor(work_ctx, x_next);
+                show_step(i, steps, &t0);
             }
         } break;
         case LCM:  // Latent Consistency Models
@@ -1004,6 +1029,7 @@ static void sample_k_diffusion(sample_method_t method,
                         }
                     }
                 }
+                show_step(i, steps, &t0);
             }
         } break;
         case DDIM_TRAILING:  // Denoising Diffusion Implicit Models
@@ -1198,6 +1224,7 @@ static void sample_k_diffusion(sample_method_t method,
                 // needs to be prescaled again, since k-diffusion's
                 // model() differes from the bare U-net F_theta by the
                 // factor c_in.
+                show_step(i, steps, &t0);
             }
         } break;
         case TCD:  // Strategic Stochastic Sampling (Algorithm 4) in
@@ -1372,6 +1399,7 @@ static void sample_k_diffusion(sample_method_t method,
                                    vec_noise[j];
                     }
                 }
+                show_step(i, steps, &t0);
             }
         } break;
 
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index a2d33bca2..0c21d0076 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -851,7 +851,6 @@ class StableDiffusionGGML {
             if (step == 1) {
                 pretty_progress(0, (int)steps, 0);
             }
-            int64_t t0 = ggml_time_us();
 
             std::vector<float> scaling = denoiser->get_scalings(sigma);
             GGML_ASSERT(scaling.size() == 3);
@@ -970,11 +969,6 @@ class StableDiffusionGGML {
                 // denoised = (v * c_out + input * c_skip) or (input + eps * c_out)
                 vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip;
             }
-            int64_t t1 = ggml_time_us();
-            if (step > 0) {
-                pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f);
-                // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
-            }
             if (noise_mask != nullptr) {
                 for (int64_t x = 0; x < denoised->ne[0]; x++) {
                     for (int64_t y = 0; y < denoised->ne[1]; y++) {
diff --git a/util.cpp b/util.cpp
index da11a14d6..459e133a4 100644
--- a/util.cpp
+++ b/util.cpp
@@ -370,6 +370,49 @@ void pretty_progress(int step, int steps, float time) {
     }
 }
 
+#ifdef SD_SHOW_REMAINING_TIME
+void pretty_progress(int step, int steps, float time, float left) {
+    if (sd_progress_cb) {
+        sd_progress_cb(step, steps, time, sd_progress_cb_data);
+        return;
+    }
+    if (step == 0) {
+        return;
+    }
+    std::string progress = "  |";
+    int max_progress     = 50;
+    int32_t current      = (int32_t)(step * 1.f * max_progress / steps);
+    for (int i = 0; i < 50; i++) {
+        if (i > current) {
+            progress += " ";
+        } else if (i == current && i != max_progress - 1) {
+            progress += ">";
+        } else {
+            progress += "=";
+        }
+    }
+    progress += "|";
+    printf(time > 1.0f ? "\r%s %i/%i - %.2fs/it" : "\r%s %i/%i - %.2fit/s\033[K",
+           progress.c_str(), step, steps,
+           time > 1.0f || time == 0 ? time : (1.0f / time));
+    if (left >= 60.0f) {
+        /* same number of spaces and backspaces */
+        printf(", %.0fm %.2fs left         \b\b\b\b\b\b\b\b\b",
+               /* min appears faster than mul+div for me, 19.31s vs 19.34s average */
+               floor(left / 60.0f), std::min(59.99f, fmod(left, 60.0f)));
+               //floor(left / 60.0f), floor(fmod(left, 60.0f) * 100.0f) / 100.0f);
+    } else if (left > 0) {
+        printf(", %.2fs left               \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", left);
+    } else {
+        printf("                           \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b");
+    }
+    fflush(stdout);  // for linux
+    if (step == steps) {
+        printf("\n");
+    }
+}
+#endif // SD_SHOW_REMAINING_TIME
+
 std::string ltrim(const std::string& s) {
     auto it = std::find_if(s.begin(), s.end(), [](int ch) {
         return !std::isspace(ch);
@@ -699,4 +742,4 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
     }
 
     return res;
-}
\ No newline at end of file
+}
diff --git a/util.h b/util.h
index 14fa812e5..344159e0b 100644
--- a/util.h
+++ b/util.h
@@ -47,6 +47,9 @@ sd_image_f32_t clip_preprocess(sd_image_f32_t image, int size);
 std::string path_join(const std::string& p1, const std::string& p2);
 std::vector<std::string> splitString(const std::string& str, char delimiter);
 void pretty_progress(int step, int steps, float time);
+#ifdef SD_SHOW_REMAINING_TIME
+void pretty_progress(int step, int steps, float time, float left);
+#endif // SD_SHOW_REMAINING_TIME
 
 void log_printf(sd_log_level_t level, const char* file, int line, const char* format, ...);
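
Note: the remaining-time figure that show_step() reports when SD_SHOW_REMAINING_TIME is enabled is simply the running average of seconds per completed step multiplied by the number of steps still to go. A minimal standalone C++ sketch of that arithmetic, with made-up timing numbers and no dependency on stable-diffusion.cpp:

#include <cstdint>
#include <cstdio>

int main() {
    // Pretend 12 of 20 sampling steps are done and ~30.6 s have elapsed
    // (the kind of delta a pair of ggml_time_us() calls would give, in microseconds).
    int done           = 12;
    int total          = 20;
    int64_t elapsed_us = 30600000;

    float avg_s  = elapsed_us / 1000000.f / done;  // average seconds per step -> 2.55
    float left_s = avg_s * (total - done);         // estimated seconds remaining -> 20.40
    printf("%.2fs/it, ~%.2fs left\n", avg_s, left_s);
    return 0;
}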