diff --git a/examples/huggingface/pytorch/text-generation/quantization/requirements_GPU.txt b/examples/huggingface/pytorch/text-generation/quantization/requirements_GPU.txt
index e8a3879a086..c363f19061e 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/requirements_GPU.txt
+++ b/examples/huggingface/pytorch/text-generation/quantization/requirements_GPU.txt
@@ -5,7 +5,7 @@ protobuf
 sentencepiece != 0.1.92
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
 torch==2.1.0a0
-transformers
+transformers==4.35
 optimum-intel
 bitsandbytes #baichuan
 transformers_stream_generator
diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py
index b18d7f3888d..54f5b4f25d8 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py
+++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py
@@ -139,11 +139,7 @@
 user_model = None
 
 # tokenizer
-if config.model_type == "llama":
-    from transformers import LlamaTokenizer
-    tokenizer = LlamaTokenizer.from_pretrained(args.model)
-else:
-    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
+tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
 
 quantization_config = None
 if args.woq:
@@ -253,7 +249,9 @@
         dtype=amp_dtype if amp_enabled else None,
     ):
         for i in range(num_iter + num_warmup):
-            with torch.autograd.profiler_legacy.profile(enabled=args.do_profiling, use_xpu=(args.device=="xpu"), record_shapes=False) as prof:
+            # workaround for Windows
+            # with torch.autograd.profiler_legacy.profile(enabled=args.do_profiling, use_xpu=(args.device=="xpu"), record_shapes=False) as prof:
+            if True:
                 input_ids = tokenizer(
                     prompt, return_tensors="pt").input_ids.to(args.device)
                 tic = time.time()
diff --git a/intel_extension_for_transformers/transformers/llm/quantization/utils.py b/intel_extension_for_transformers/transformers/llm/quantization/utils.py
index bbf38d7fdd7..d1a77e4c9cb 100644
--- a/intel_extension_for_transformers/transformers/llm/quantization/utils.py
+++ b/intel_extension_for_transformers/transformers/llm/quantization/utils.py
@@ -109,9 +109,7 @@ def replace_linear(
     empty_weights=False,
 ):
     if modules_to_not_convert is None:
-        # output_layer is chatglm last layer name
-        # embed_out is dolly_v2 last layer name
-        modules_to_not_convert = ["lm_head", "output_layer", "embed_out"]
+        modules_to_not_convert = []
     if quantization_config.llm_int8_skip_modules:
         modules_to_not_convert = modules_to_not_convert.extend(
             quantization_config.llm_int8_skip_modules
@@ -517,17 +515,6 @@ def default_calib_func(model):
                     },
                 },
             },
-            op_name_dict={
-                ".*lm_head": {  # re.match
-                    "weight": {"dtype": "fp32"},
-                },
-                ".*output_layer": {  # re.match
-                    "weight": {"dtype": "fp32"},
-                },
-                ".*embed_out": {  # re.match
-                    "weight": {"dtype": "fp32"},
-                },
-            },
             recipes=recipes,
         )
         # TEQ: set calib_func=None, use default training func as calib_func
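A possible alternative to the "if True:" placeholder in run_generation_gpu_woq.py, sketched here only as an illustration and not part of this diff: select the profiling context up front so the loop body always sits under a real with statement. The names args.do_profiling and args.device are taken from the surrounding script, the profiler call mirrors the line the diff comments out, and contextlib.nullcontext() is the standard-library no-op context manager; whether the legacy profiler with use_xpu is usable on a given platform is an assumption.

import contextlib
import torch

def profiling_context(args):
    # Return the legacy XPU profiler when profiling is requested; otherwise a
    # no-op context manager, so the caller can always use a with statement.
    if args.do_profiling:
        return torch.autograd.profiler_legacy.profile(
            enabled=True,
            use_xpu=(args.device == "xpu"),
            record_shapes=False,
        )
    return contextlib.nullcontext()

Usage would then be "with profiling_context(args) as prof:" around the generation loop body, with any reporting that reads prof guarded by "if args.do_profiling:", since nullcontext() binds None.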