@@ -14,7 +14,6 @@
 if TYPE_CHECKING:
     import numpy as np
     import numpy.typing as npt
-    import torch
     import xgrammar as xgr

     from vllm.v1.request import Request
@@ -27,14 +26,18 @@
 class StructuredOutputManager:

     def __init__(self, vllm_config: VllmConfig):
-        tokenizer_group = init_tokenizer_from_configs(
-            model_config=vllm_config.model_config,
-            scheduler_config=vllm_config.scheduler_config,
-            parallel_config=vllm_config.parallel_config,
-            lora_config=vllm_config.lora_config)  # type: ignore[arg-type]
-        tokenizer_group.ping()
         self.vocab_size = vllm_config.model_config.get_vocab_size()
         self.vllm_config = vllm_config
+        self.init_complete = False
+
+    def _delayed_init(self):
+        """Initialization delayed until we know it is needed."""
+        tokenizer_group = init_tokenizer_from_configs(
+            model_config=self.vllm_config.model_config,
+            scheduler_config=self.vllm_config.scheduler_config,
+            parallel_config=self.vllm_config.parallel_config,
+            lora_config=self.vllm_config.lora_config)  # type: ignore[arg-type]
+        tokenizer_group.ping()

         tokenizer = tokenizer_group.get_lora_tokenizer(None)
         tokenizer_info = xgr.TokenizerInfo.from_huggingface(
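The hunk above is the heart of the change: tokenizer setup moves out of `__init__` into `_delayed_init()`, guarded by an `init_complete` flag. A minimal sketch of the same lazy-initialization pattern in isolation (the `LazyManager` class and its `json` stand-in dependency are illustrative, not part of vLLM):

```python
class LazyManager:
    """Defers expensive setup until the first request that needs it."""

    def __init__(self) -> None:
        # Construction stays cheap: no heavy imports, no tokenizer I/O.
        self.init_complete = False
        self._backend = None

    def _delayed_init(self) -> None:
        """Initialization delayed until we know it is needed."""
        # The heavy import lives here, so a server that never receives a
        # structured-output request never pays for it.
        import json  # stand-in for a heavy dependency such as xgrammar
        self._backend = json
        self.init_complete = True

    def encode(self, obj) -> str:
        # The first call pays the one-time cost; later calls skip it.
        if not self.init_complete:
            self._delayed_init()
        return self._backend.dumps(obj)


m = LazyManager()
print(m.encode({"ok": True}))  # triggers _delayed_init on first use
```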
@@ -47,12 +50,21 @@ def __init__(self, vllm_config: VllmConfig):
         # compilation, so we set it to half the number of CPUs.
         max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
         self.executor = ThreadPoolExecutor(max_workers=max_workers)
-        self._grammar_bitmask: Optional[torch.Tensor] = None
+        self._grammar_bitmask = xgr.allocate_token_bitmask(
+            self.vllm_config.scheduler_config.max_num_seqs, self.vocab_size)
+
+        self.init_complete = True

     def grammar_init(self, request: Request) -> None:
         if request.structured_output_request is None:
             return

+        # The first time this is called, we need to finish initializing
+        # xgrammar. We defer it to avoid the xgrammar import and
+        # initialization cost if structured output is never used.
+        if not self.init_complete:
+            self._delayed_init()
+
         grammar: Future[Grammar] = self.executor.submit(
             self._async_create_grammar, request)
         request.structured_output_request.grammar = grammar  # type: ignore[assignment]
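With initialization deferred, the bitmask is allocated eagerly inside `_delayed_init` at its maximum size, instead of on first use. A hedged sketch of what `xgr.allocate_token_bitmask` returns, assuming xgrammar and its torch dependency are installed (the sizes here are illustrative):

```python
import xgrammar as xgr

max_num_seqs = 8     # stands in for scheduler_config.max_num_seqs
vocab_size = 32000   # stands in for model_config.get_vocab_size()

# Returns a torch.int32 tensor of shape (max_num_seqs, ceil(vocab_size / 32)):
# each row packs one vocabulary-wide bitmask, one row per request slot.
bitmask = xgr.allocate_token_bitmask(max_num_seqs, vocab_size)
print(bitmask.shape, bitmask.dtype)  # torch.Size([8, 1000]) torch.int32
```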
@@ -100,11 +112,6 @@ def grammar_bitmask(
         if not structured_output_request_ids:
             return None

-        if self._grammar_bitmask is None:
-            self._grammar_bitmask = xgr.allocate_token_bitmask(
-                self.vllm_config.scheduler_config.max_num_seqs,
-                self.vocab_size)
-
         # Fill the bitmask using the index of each request equal to its
         # position in the batch. Resize the bitmask down to the size of
         # the batch.
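Since `_delayed_init` now owns the allocation, the lazy check in `grammar_bitmask` is dropped and the method only fills rows of the preallocated mask. A sketch of that fill step using xgrammar's public API, assuming a Hugging Face tokenizer is available (the `gpt2` model and batch size of 2 are illustrative):

```python
import xgrammar as xgr
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
info = xgr.TokenizerInfo.from_huggingface(tokenizer)
compiler = xgr.GrammarCompiler(info)
compiled = compiler.compile_builtin_json_grammar()

# One bitmask row per request slot, as in the manager above.
bitmask = xgr.allocate_token_bitmask(2, info.vocab_size)

# Each request fills the row whose index equals its position in the batch.
matchers = [xgr.GrammarMatcher(compiled) for _ in range(2)]
for i, matcher in enumerate(matchers):
    matcher.fill_next_token_bitmask(bitmask, index=i)
```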