Skip to content

Commit ee15883

Browse files
ggerganov and orca-zhang
authored and committed
speculative : update default params (ggml-org#11954)
* speculative : update default params
* speculative : do not discard the last drafted token
1 parent e5ea58c commit ee15883

File tree

4 files changed

+9
-9
lines changed

4 files changed

+9
-9
lines changed

common/common.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -178,10 +178,10 @@ struct common_params_speculative {
178178

179179
int32_t n_ctx = 0; // draft context size
180180
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
181-
int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
181+
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
182182
int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
183183
float p_split = 0.1f; // speculative decoding split probability
184-
float p_min = 0.9f; // minimum speculative decoding probability (greedy)
184+
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
185185

186186
struct cpu_params cpuparams;
187187
struct cpu_params cpuparams_batch;

common/speculative.cpp

+5-5
Original file line numberDiff line numberDiff line change
@@ -252,11 +252,6 @@ llama_tokens common_speculative_gen_draft(
252252
// add drafted token for each sequence
253253
const llama_token id = cur_p->data[0].id;
254254

255-
// only collect very high-confidence draft tokens
256-
if (cur_p->data[0].p < params.p_min) {
257-
break;
258-
}
259-
260255
common_sampler_accept(smpl, id, true);
261256

262257
result.push_back(id);
@@ -265,6 +260,11 @@ llama_tokens common_speculative_gen_draft(
265260
break;
266261
}
267262

263+
// only collect very high-confidence draft tokens
264+
if (cur_p->data[0].p < params.p_min) {
265+
break;
266+
}
267+
268268
common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
269269

270270
// evaluate the drafted tokens on the draft model

common/speculative.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ struct common_speculative_params {
99
int n_draft = 16; // max drafted tokens
1010
int n_reuse = 256;
1111

12-
float p_min = 0.9f; // min probability required to accept a token in the draft
12+
float p_min = 0.75f; // min probability required to accept a token in the draft
1313
};
1414

1515
struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);

examples/server/server.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -274,7 +274,7 @@ struct server_task {
274274
params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);
275275

276276
params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min);
277-
params.speculative.n_min = std::max(params.speculative.n_min, 2);
277+
params.speculative.n_min = std::max(params.speculative.n_min, 0);
278278
params.speculative.n_max = std::max(params.speculative.n_max, 0);
279279

280280
// Use OpenAI API logprobs only if n_probs wasn't provided

0 commit comments

Comments (0)