From aef611299613001a527ce5c368f8a90fa57f4a0c Mon Sep 17 00:00:00 2001
From: Cheng Penghui
Date: Mon, 15 Apr 2024 02:46:06 +0000
Subject: [PATCH 1/5] Removed fallback for lm_head op

Signed-off-by: Cheng Penghui
---
 .../transformers/llm/quantization/utils.py | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/intel_extension_for_transformers/transformers/llm/quantization/utils.py b/intel_extension_for_transformers/transformers/llm/quantization/utils.py
index bbf38d7fdd7..a1911fc325b 100644
--- a/intel_extension_for_transformers/transformers/llm/quantization/utils.py
+++ b/intel_extension_for_transformers/transformers/llm/quantization/utils.py
@@ -108,10 +108,6 @@ def replace_linear(
     device="cpu",
     empty_weights=False,
 ):
-    if modules_to_not_convert is None:
-        # output_layer is chatglm last layer name
-        # embed_out is dolly_v2 last layer name
-        modules_to_not_convert = ["lm_head", "output_layer", "embed_out"]
     if quantization_config.llm_int8_skip_modules:
         modules_to_not_convert = modules_to_not_convert.extend(
             quantization_config.llm_int8_skip_modules
@@ -517,17 +513,6 @@ def default_calib_func(model):
                 },
             },
         },
-        op_name_dict={
-            ".*lm_head": {  # re.match
-                "weight": {"dtype": "fp32"},
-            },
-            ".*output_layer": {  # re.match
-                "weight": {"dtype": "fp32"},
-            },
-            ".*embed_out": {  # re.match
-                "weight": {"dtype": "fp32"},
-            },
-        },
         recipes=recipes,
     )
     # TEQ: set calib_func=None, use default training func as calib_func

From 0cbaa5066579641eb30bb36af3c3128db3ef1821 Mon Sep 17 00:00:00 2001
From: Cheng Penghui
Date: Mon, 15 Apr 2024 07:04:44 +0000
Subject: [PATCH 2/5] Fixed load issue

Signed-off-by: Cheng Penghui
---
 .../transformers/llm/quantization/utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/intel_extension_for_transformers/transformers/llm/quantization/utils.py b/intel_extension_for_transformers/transformers/llm/quantization/utils.py
index a1911fc325b..d1a77e4c9cb 100644
--- a/intel_extension_for_transformers/transformers/llm/quantization/utils.py
+++ b/intel_extension_for_transformers/transformers/llm/quantization/utils.py
@@ -108,6 +108,8 @@ def replace_linear(
     device="cpu",
     empty_weights=False,
 ):
+    if modules_to_not_convert is None:
+        modules_to_not_convert = []
     if quantization_config.llm_int8_skip_modules:
         modules_to_not_convert = modules_to_not_convert.extend(
             quantization_config.llm_int8_skip_modules

From 78a7fc1959838d779a900e656b5921f79ff14849 Mon Sep 17 00:00:00 2001
From: Cheng Penghui
Date: Mon, 15 Apr 2024 08:31:15 +0000
Subject: [PATCH 3/5] update_script

---
 .../text-generation/quantization/run_generation_gpu_woq.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py
index b18d7f3888d..6f6f546d137 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py
+++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py
@@ -139,11 +139,7 @@
 user_model = None
 
 # tokenizer
-if config.model_type == "llama":
-    from transformers import LlamaTokenizer
-    tokenizer = LlamaTokenizer.from_pretrained(args.model)
-else:
-    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
+tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
 
 quantization_config = None
 if args.woq:

From 57a65ec42e0a37c07f35c1d4984eb011906ef932 Mon Sep 17 00:00:00 2001
From: "Meng, Hengyu"
Date: Tue, 16 Apr 2024 19:13:25 +0800
Subject: [PATCH 4/5] Update run_generation_gpu_woq.py

Signed-off-by: Meng, Hengyu
---
 .../text-generation/quantization/run_generation_gpu_woq.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py b/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py
index 6f6f546d137..54f5b4f25d8 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py
+++ b/examples/huggingface/pytorch/text-generation/quantization/run_generation_gpu_woq.py
@@ -249,7 +249,9 @@
         dtype=amp_dtype if amp_enabled else None,
     ):
         for i in range(num_iter + num_warmup):
-            with torch.autograd.profiler_legacy.profile(enabled=args.do_profiling, use_xpu=(args.device=="xpu"), record_shapes=False) as prof:
+            # workaround for Windows
+            # with torch.autograd.profiler_legacy.profile(enabled=args.do_profiling, use_xpu=(args.device=="xpu"), record_shapes=False) as prof:
+            if True:
                 input_ids = tokenizer(
                     prompt, return_tensors="pt").input_ids.to(args.device)
                 tic = time.time()

From 1f5c68dce50393fd10e67c09a69d8d761fa20ac3 Mon Sep 17 00:00:00 2001
From: "Meng, Hengyu"
Date: Tue, 16 Apr 2024 19:24:16 +0800
Subject: [PATCH 5/5] Update requirements_GPU.txt

Signed-off-by: Meng, Hengyu
---
 .../pytorch/text-generation/quantization/requirements_GPU.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/huggingface/pytorch/text-generation/quantization/requirements_GPU.txt b/examples/huggingface/pytorch/text-generation/quantization/requirements_GPU.txt
index e8a3879a086..c363f19061e 100644
--- a/examples/huggingface/pytorch/text-generation/quantization/requirements_GPU.txt
+++ b/examples/huggingface/pytorch/text-generation/quantization/requirements_GPU.txt
@@ -5,7 +5,7 @@ protobuf
 sentencepiece != 0.1.92
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
 torch==2.1.0a0
-transformers
+transformers==4.35
 optimum-intel
 bitsandbytes #baichuan
 transformers_stream_generator