From 7927e1b9cd5703bbc3d8ef956f402c2c3ad71b90 Mon Sep 17 00:00:00 2001
From: Vinay Damodaran
Date: Fri, 11 Apr 2025 22:08:56 -0700
Subject: [PATCH] Add warmup client option

---
 README.md                       | 5 +++++
 docs/openapi.json               | 4 ++++
 docs/source/en/cli_arguments.md | 5 +++++
 router/src/lib.rs               | 5 ++++-
 router/src/main.rs              | 6 ++++++
 5 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 520ad8d0..6f6472e7 100644
--- a/README.md
+++ b/README.md
@@ -215,6 +215,11 @@ Options:
           Unused for gRPC servers
 
           [env: AUTO_TRUNCATE=]
+
+      --warmup-model
+          Send a dummy request to the model before server start-up
+
+          [env: WARMUP_MODEL=]
 
       --default-prompt-name <DEFAULT_PROMPT_NAME>
           The name of the prompt that should be used by default for encoding. If not set, no prompt will be applied.
diff --git a/docs/openapi.json b/docs/openapi.json
index 35000a1c..7fccdd50 100644
--- a/docs/openapi.json
+++ b/docs/openapi.json
@@ -1058,6 +1058,7 @@
           "max_batch_tokens",
           "max_client_batch_size",
           "auto_truncate",
+          "warmup_model",
           "tokenization_workers",
           "version"
         ],
@@ -1129,6 +1130,9 @@
             "type": "string",
             "description": "Router Info",
             "example": "0.5.0"
+          },
+          "warmup_model": {
+            "type": "boolean"
           }
         }
       },
diff --git a/docs/source/en/cli_arguments.md b/docs/source/en/cli_arguments.md
index 0882893c..7ed38fa1 100644
--- a/docs/source/en/cli_arguments.md
+++ b/docs/source/en/cli_arguments.md
@@ -106,6 +106,11 @@ Options:
           Unused for gRPC servers
 
           [env: AUTO_TRUNCATE=]
+
+      --warmup-model
+          Send a dummy request to the model before server start-up
+
+          [env: WARMUP_MODEL=]
 
       --default-prompt-name <DEFAULT_PROMPT_NAME>
           The name of the prompt that should be used by default for encoding. If not set, no prompt will be applied.
diff --git a/router/src/lib.rs b/router/src/lib.rs
index 49e0581d..d116b587 100644
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@@ -52,6 +52,7 @@ pub async fn run(
     max_batch_requests: Option<usize>,
     max_client_batch_size: usize,
     auto_truncate: bool,
+    warmup_model: bool,
     default_prompt: Option<String>,
     default_prompt_name: Option<String>,
     hf_token: Option<String>,
@@ -248,7 +249,7 @@ pub async fn run(
         .await
         .context("Model backend is not healthy")?;
 
-    if !backend.padded_model {
+    if !backend.padded_model || warmup_model {
         tracing::info!("Warming up model");
         backend
             .warmup(max_input_length, max_batch_tokens, max_batch_requests)
@@ -288,6 +289,7 @@ pub async fn run(
         max_batch_requests,
         max_client_batch_size,
         auto_truncate,
+        warmup_model,
         version: env!("CARGO_PKG_VERSION"),
         sha: option_env!("VERGEN_GIT_SHA"),
         docker_label: option_env!("DOCKER_LABEL"),
@@ -510,6 +512,7 @@ pub struct Info {
     #[cfg_attr(feature = "http", schema(example = "32"))]
     pub max_client_batch_size: usize,
     pub auto_truncate: bool,
+    pub warmup_model: bool,
     #[cfg_attr(feature = "http", schema(example = "4"))]
     pub tokenization_workers: usize,
     /// Router Info
diff --git a/router/src/main.rs b/router/src/main.rs
index e4a902d6..241189ea 100644
--- a/router/src/main.rs
+++ b/router/src/main.rs
@@ -79,6 +79,11 @@ struct Args {
     #[clap(long, env)]
     auto_truncate: bool,
 
+    /// Send a dummy request to the model before server start-up
+    ///
+    #[clap(long, env)]
+    warmup_model: bool,
+
     /// The name of the prompt that should be used by default for encoding. If not set, no prompt
     /// will be applied.
     ///
@@ -216,6 +221,7 @@ async fn main() -> Result<()> {
         args.max_batch_requests,
         args.max_client_batch_size,
         args.auto_truncate,
+        args.warmup_model,
         args.default_prompt,
         args.default_prompt_name,
         token,
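
Usage note (not part of the patch): a minimal sketch of how the new option would be enabled, assuming the `text-embeddings-router` binary documented in the repository README; the model id and port below are illustrative, and only `--warmup-model` / `WARMUP_MODEL` come from this change. With the flag set, the `!backend.padded_model || warmup_model` condition above makes the router run its warmup pass even for padded models, which previously skipped it.

```shell
# Hypothetical invocation: enable the start-up warmup request via the new CLI flag
text-embeddings-router --model-id BAAI/bge-base-en-v1.5 --port 8080 --warmup-model

# Equivalent via the environment variable bound by #[clap(long, env)]
WARMUP_MODEL=true text-embeddings-router --model-id BAAI/bge-base-en-v1.5 --port 8080
```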