diff --git a/CMakeLists.txt b/CMakeLists.txt
index 782a893e4..a3ec60db1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,6 +32,7 @@ option(SD_SYCL                       "sd: sycl backend" OFF)
 option(SD_MUSA                       "sd: musa backend" OFF)
 option(SD_FAST_SOFTMAX               "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
 option(SD_BUILD_SHARED_LIBS          "sd: build shared libs" OFF)
+option(SD_SHOW_REMAINING_TIME        "sd: show remaining and average sampling time" OFF)
 #option(SD_BUILD_SERVER               "sd: build server example"                           ON)
 
 if(SD_CUDA)
@@ -93,6 +94,11 @@ else()
     add_library(${SD_LIB} STATIC ${SD_LIB_SOURCES})
 endif()
 
+if (SD_SHOW_REMAINING_TIME)
+    message("-- Display remaining and average sampling time")
+    add_definitions(-DSD_SHOW_REMAINING_TIME)
+endif ()
+
 if(SD_SYCL)
     message("-- Use SYCL as backend stable-diffusion")
     set(GGML_SYCL ON)
diff --git a/denoiser.hpp b/denoiser.hpp
index 66799109d..263b574e9 100644
--- a/denoiser.hpp
+++ b/denoiser.hpp
@@ -468,6 +468,20 @@ struct FluxFlowDenoiser : public Denoiser {
 
 typedef std::function<ggml_tensor*(ggml_tensor*, float, int)> denoise_cb_t;
 
+// Reports progress once sampler step i0 (0-based) of im has finished; *t0 is a ggml_time_us() stamp.
+static inline void show_step(int i0, int im, int64_t* t0) {
+    const int done = i0 + 1;
+#ifdef SD_SHOW_REMAINING_TIME
+    // *t0 is left untouched here, so this is the mean seconds per step since it was taken.
+    const float per_step = (ggml_time_us() - *t0) / 1000000.f / done;
+    pretty_progress(done, im, per_step, per_step * (im - done));
+#else  // SD_SHOW_REMAINING_TIME
+    const int64_t now = ggml_time_us();
+    pretty_progress(done, im, (now - *t0) / 1000000.f);
+    *t0 = now;  // advance the stamp so the next call measures only its own step
+#endif  // SD_SHOW_REMAINING_TIME
+}
+
 // k diffusion reverse ODE: dx = (x - D(x;\sigma)) / \sigma dt; \sigma(t) = t
 static void sample_k_diffusion(sample_method_t method,
                                denoise_cb_t model,
@@ -477,6 +491,8 @@ static void sample_k_diffusion(sample_method_t method,
                                std::shared_ptr<RNG> rng,
                                float eta) {
     size_t steps = sigmas.size() - 1;
+    int64_t t0 = ggml_time_us();
+
     // sample_euler_ancestral
     switch (method) {
         case EULER_A: {
@@ -530,6 +546,7 @@ static void sample_k_diffusion(sample_method_t method,
                         }
                     }
                 }
+                show_step(i, steps, &t0);
             }
         } break;
         case EULER:  // Implemented without any sigma churn
@@ -563,6 +580,7 @@ static void sample_k_diffusion(sample_method_t method,
                         vec_x[j] = vec_x[j] + vec_d[j] * dt;
                     }
                 }
+                show_step(i, steps, &t0);
             }
         } break;
         case HEUN: {
@@ -613,6 +631,7 @@ static void sample_k_diffusion(sample_method_t method,
                         vec_x[j] = vec_x[j] + vec_d[j] * dt;
                     }
                 }
+                show_step(i, steps, &t0);
             }
         } break;
         case DPM2: {
@@ -664,6 +683,7 @@ static void sample_k_diffusion(sample_method_t method,
                         vec_x[j] = vec_x[j] + d2 * dt_2;
                     }
                 }
+                show_step(i, steps, &t0);
             }
 
         } break;
@@ -738,6 +758,7 @@ static void sample_k_diffusion(sample_method_t method,
                         }
                     }
                 }
+                show_step(i, steps, &t0);
             }
         } break;
         case DPMPP2M:  // DPM++ (2M) from Karras et al (2022)
@@ -777,6 +798,7 @@ static void sample_k_diffusion(sample_method_t method,
                 for (int j = 0; j < ggml_nelements(x); j++) {
                     vec_old_denoised[j] = vec_denoised[j];
                 }
+                show_step(i, steps, &t0);
             }
         } break;
         case DPMPP2Mv2:  // Modified DPM++ (2M) from https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457
@@ -820,6 +842,7 @@ static void sample_k_diffusion(sample_method_t method,
                 for (int j = 0; j < ggml_nelements(x); j++) {
                     vec_old_denoised[j] = vec_denoised[j];
                 }
+                show_step(i, steps, &t0);
             }
         } break;
         case IPNDM:  // iPNDM sampler from https://github.com/zju-pi/diff-sampler/tree/main/diff-solvers-main
@@ -895,6 +918,7 @@ static void sample_k_diffusion(sample_method_t method,
                 } else {
                     buffer_model.push_back(d_cur);
                 }
+                show_step(i, steps, &t0);
             }
         } break;
         case IPNDM_V:  // iPNDM_v sampler from https://github.com/zju-pi/diff-sampler/tree/main/diff-solvers-main
@@ -969,6 +993,7 @@ static void sample_k_diffusion(sample_method_t method,
 
                 // Prepare the next d tensor
                 d_cur = ggml_dup_tensor(work_ctx, x_next);
+                show_step(i, steps, &t0);
             }
         } break;
         case LCM:  // Latent Consistency Models
@@ -1004,6 +1029,7 @@ static void sample_k_diffusion(sample_method_t method,
                         }
                     }
                 }
+                show_step(i, steps, &t0);
             }
         } break;
         case DDIM_TRAILING:  // Denoising Diffusion Implicit Models
@@ -1198,6 +1224,7 @@ static void sample_k_diffusion(sample_method_t method,
                 // needs to be prescaled again, since k-diffusion's
                 // model() differes from the bare U-net F_theta by the
                 // factor c_in.
+                show_step(i, steps, &t0);
             }
         } break;
         case TCD:  // Strategic Stochastic Sampling (Algorithm 4) in
@@ -1372,6 +1399,7 @@ static void sample_k_diffusion(sample_method_t method,
                             vec_noise[j];
                     }
                 }
+                show_step(i, steps, &t0);
             }
         } break;
 
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index a2d33bca2..0c21d0076 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -851,7 +851,6 @@ class StableDiffusionGGML {
             if (step == 1) {
                 pretty_progress(0, (int)steps, 0);
             }
-            int64_t t0 = ggml_time_us();
 
             std::vector<float> scaling = denoiser->get_scalings(sigma);
             GGML_ASSERT(scaling.size() == 3);
@@ -970,11 +969,6 @@ class StableDiffusionGGML {
                 // denoised = (v * c_out + input * c_skip) or (input + eps * c_out)
                 vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip;
             }
-            int64_t t1 = ggml_time_us();
-            if (step > 0) {
-                pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f);
-                // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
-            }
             if (noise_mask != nullptr) {
                 for (int64_t x = 0; x < denoised->ne[0]; x++) {
                     for (int64_t y = 0; y < denoised->ne[1]; y++) {
diff --git a/util.cpp b/util.cpp
index da11a14d6..459e133a4 100644
--- a/util.cpp
+++ b/util.cpp
@@ -370,6 +370,49 @@ void pretty_progress(int step, int steps, float time) {
     }
 }
 
+#ifdef SD_SHOW_REMAINING_TIME
+// Prints the progress bar, per-step timing and time left; `time` and `left` are in seconds.
+// A registered sd_progress_cb gets (step, steps, time) instead and nothing is printed.
+void pretty_progress(int step, int steps, float time, float left) {
+    if (sd_progress_cb) {
+        sd_progress_cb(step, steps, time, sd_progress_cb_data);
+        return;
+    }
+    if (step == 0) {
+        return;
+    }
+    const int max_progress = 50;
+    const int32_t current  = (int32_t)(step * 1.f * max_progress / steps);
+    std::string progress   = "  |";
+    for (int i = 0; i < max_progress; i++) {
+        // '=' for completed cells, '>' at the head (except in the last cell), ' ' beyond it.
+        progress += i > current ? " " : (i == current && i != max_progress - 1 ? ">" : "=");
+    }
+    progress += "|";
+    // Slow runs report s/it; fast runs report it/s (time == 0 is printed as-is to avoid 1/0).
+    printf(time > 1.0f ? "\r%s %i/%i - %.2fs/it" : "\r%s %i/%i - %.2fit/s\033[K",
+           progress.c_str(), step, steps,
+           time > 1.0f || time == 0 ? time : (1.0f / time));
+    // Each suffix pads with N spaces then N backspaces to wipe leftovers of a longer line.
+    if (left >= 60.0f) {
+        // Stay float whichever fmod overload is visible; cap so "%.2f" never shows "60.00s".
+        float seconds = (float)fmod(left, 60.0f);
+        if (seconds > 59.99f) {
+            seconds = 59.99f;
+        }
+        printf(", %.0fm %.2fs left         \b\b\b\b\b\b\b\b\b", floor(left / 60.0f), seconds);
+    } else if (left > 0) {
+        printf(", %.2fs left               \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", left);
+    } else {
+        printf("                           \b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b");
+    }
+    fflush(stdout);  // for linux
+    if (step == steps) {
+        printf("\n");
+    }
+}
+#endif  // SD_SHOW_REMAINING_TIME
+
 std::string ltrim(const std::string& s) {
     auto it = std::find_if(s.begin(), s.end(), [](int ch) {
         return !std::isspace(ch);
@@ -699,4 +742,4 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
     }
 
     return res;
-}
\ No newline at end of file
+}
diff --git a/util.h b/util.h
index 14fa812e5..344159e0b 100644
--- a/util.h
+++ b/util.h
@@ -47,6 +47,9 @@ sd_image_f32_t clip_preprocess(sd_image_f32_t image, int size);
 std::string path_join(const std::string& p1, const std::string& p2);
 std::vector<std::string> splitString(const std::string& str, char delimiter);
 void pretty_progress(int step, int steps, float time);
+#ifdef SD_SHOW_REMAINING_TIME
+void pretty_progress(int step, int steps, float time, float left);
+#endif  // SD_SHOW_REMAINING_TIME
 
 void log_printf(sd_log_level_t level, const char* file, int line, const char* format, ...);