python3 vllm/benchmarks/benchmark_throughput.py \
    --enable-lora \
    --lora-path yard1/llama-2-7b-sql-lora-test
```
390
---
## Example - Structured Output Benchmark

Benchmark the performance of structured output generation (JSON, grammar, regex).

### Server Setup

```bash
vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
```

### JSON Schema Benchmark

```bash
python3 benchmarks/benchmark_serving_structured_output.py \
    --backend vllm \
    --model NousResearch/Hermes-3-Llama-3.1-8B \
    --dataset json \
    --structured-output-ratio 1.0 \
    --request-rate 10 \
    --num-prompts 1000
```
413
### Grammar-based Generation Benchmark

```bash
python3 benchmarks/benchmark_serving_structured_output.py \
    --backend vllm \
    --model NousResearch/Hermes-3-Llama-3.1-8B \
    --dataset grammar \
    --structure-type grammar \
    --request-rate 10 \
    --num-prompts 1000
```

### Regex-based Generation Benchmark

```bash
python3 benchmarks/benchmark_serving_structured_output.py \
    --backend vllm \
    --model NousResearch/Hermes-3-Llama-3.1-8B \
    --dataset regex \
    --request-rate 10 \
    --num-prompts 1000
```

### Choice-based Generation Benchmark

```bash
python3 benchmarks/benchmark_serving_structured_output.py \
    --backend vllm \
    --model NousResearch/Hermes-3-Llama-3.1-8B \
    --dataset choice \
    --request-rate 10 \
    --num-prompts 1000
```

### XGrammar Benchmark Dataset

```bash
python3 benchmarks/benchmark_serving_structured_output.py \
    --backend vllm \
    --model NousResearch/Hermes-3-Llama-3.1-8B \
    --dataset xgrammar_bench \
    --request-rate 10 \
    --num-prompts 1000
```
458
---
## Example - Long Document QA Throughput Benchmark

Benchmark the performance of long document question-answering with prefix caching.

### Basic Long Document QA Test

```bash
python3 benchmarks/benchmark_long_document_qa_throughput.py \
    --model meta-llama/Llama-2-7b-chat-hf \
    --enable-prefix-caching \
    --num-documents 16 \
    --document-length 2000 \
    --output-len 50 \
    --repeat-count 5
```

### Different Repeat Modes

```bash
# Random mode (default) - shuffle prompts randomly
python3 benchmarks/benchmark_long_document_qa_throughput.py \
    --model meta-llama/Llama-2-7b-chat-hf \
    --enable-prefix-caching \
    --num-documents 8 \
    --document-length 3000 \
    --repeat-count 3 \
    --repeat-mode random

# Tile mode - repeat entire prompt list in sequence
python3 benchmarks/benchmark_long_document_qa_throughput.py \
    --model meta-llama/Llama-2-7b-chat-hf \
    --enable-prefix-caching \
    --num-documents 8 \
    --document-length 3000 \
    --repeat-count 3 \
    --repeat-mode tile

# Interleave mode - repeat each prompt consecutively
python3 benchmarks/benchmark_long_document_qa_throughput.py \
    --model meta-llama/Llama-2-7b-chat-hf \
    --enable-prefix-caching \
    --num-documents 8 \
    --document-length 3000 \
    --repeat-count 3 \
    --repeat-mode interleave
```
506
---
## Example - Prefix Caching Benchmark

Benchmark the efficiency of automatic prefix caching.

### Fixed Prompt with Prefix Caching

```bash
python3 benchmarks/benchmark_prefix_caching.py \
    --model meta-llama/Llama-2-7b-chat-hf \
    --enable-prefix-caching \
    --num-prompts 1 \
    --repeat-count 100 \
    --input-length-range 128:256
```

### ShareGPT Dataset with Prefix Caching

```bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

python3 benchmarks/benchmark_prefix_caching.py \
    --model meta-llama/Llama-2-7b-chat-hf \
    --dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
    --enable-prefix-caching \
    --num-prompts 20 \
    --repeat-count 5 \
    --input-length-range 128:256
```
537
---
## Example - Request Prioritization Benchmark

Benchmark the performance of request prioritization in vLLM.

### Basic Prioritization Test

```bash
python3 benchmarks/benchmark_prioritization.py \
    --model meta-llama/Llama-2-7b-chat-hf \
    --input-len 128 \
    --output-len 64 \
    --num-prompts 100 \
    --scheduling-policy priority
```

### Multiple Sequences per Prompt

```bash
python3 benchmarks/benchmark_prioritization.py \
    --model meta-llama/Llama-2-7b-chat-hf \
    --input-len 128 \
    --output-len 64 \
    --num-prompts 100 \
    --scheduling-policy priority \
    --n 2
```