
Commit 4ad96a0

fix codellama result issue (wrong prompt format)
Parent: a8c913f

File tree

3 files changed: +11 -2 lines

generate_response.py
intermedia_analyze.py
scripts/script_stepwise_phase123.bat


generate_response.py (+1, -1)
@@ -783,7 +783,7 @@ def generate_response(model, msgs, topn, temperature, args, open_source_model, t
         response_list.append(get_completion_starcoder('', user_input, open_source_model, tokenizer, args))
         return response_list
     elif 'Llama' in args.model or 'deepseek' in args.model or 'CodeQwen' in args.model:
-        user_input = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True, return_tensors="pt")
+        user_input = tokenizer.apply_chat_template(msgs, tokenize=False) if 'Llama' in args.model else tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True, return_tensors="pt")
         for i in range(topn):
             if 'two-shot' in args.model:
                 response_list.append(get_completion_codellama_instruct_nl_to_pl(CODELLAMA_NL_2_PL_HUMANEVAL, user_input_without_prompt, open_source_model, tokenizer, args))
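
For context, here is a minimal standalone sketch (not from the repository) of the two prompt renderings this change switches between, using Hugging Face's apply_chat_template; the model name and message content below are placeholders:

# Sketch only: compares the CodeLlama rendering (no generation prompt) with the
# rendering still used for deepseek / CodeQwen. Model and message are placeholders.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")
msgs = [{"role": "user", "content": "Write a function that reverses a string."}]

# CodeLlama path after the fix: render the chat template as-is.
llama_prompt = tokenizer.apply_chat_template(msgs, tokenize=False)

# deepseek / CodeQwen path (unchanged): also ask the template to append its
# generation prompt, i.e. the tokens that cue the assistant's reply where the
# template defines them; return_tensors is kept only to mirror the original call.
other_prompt = tokenizer.apply_chat_template(
    msgs, tokenize=False, add_generation_prompt=True, return_tensors="pt"
)

print(repr(llama_prompt))
print(repr(other_prompt))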

intermedia_analyze.py (+1, -1)
@@ -144,7 +144,7 @@ def analyze_process_HumanEval(log_file, original_prompt_file, topn):
     # TODO(jwu): we can read from HumanEval.jsonl instead of HumanEval_new.jsonl, and delete all about HumanEval_new.
     # Update: no need because HumanEval_new has 'test_case' field, which is essential.
     # with open('HumanEval/HumanEval.jsonl', 'r') as f:
-    with open('HumanEval/HumanEval_new.jsonl', 'r') as f:
+    with open('Benchmark/HumanEval_new.jsonl', 'r') as f:
         for line in f.readlines():
             problem_list.append(json.loads(line))
     # added by JW. not needed since it's just loading HumanEval problems.
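
The only change here is the path: the HumanEval_new.jsonl problem file is now read from Benchmark/ instead of HumanEval/. For reference, a minimal standalone sketch of the loading pattern in this hunk (the path follows the new line; everything else is plain JSONL boilerplate):

import json

problem_list = []
# Each line of the JSONL file is one HumanEval problem; per the TODO above,
# HumanEval_new.jsonl is used because its records carry a 'test_case' field.
with open('Benchmark/HumanEval_new.jsonl', 'r') as f:
    for line in f.readlines():
        problem_list.append(json.loads(line))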

scripts/script_stepwise_phase123.bat (+9)
@@ -39,12 +39,21 @@ for %%i in (!string_of_strings!) do (
 ) else if "%2"=="3" (
     rem # extract code and run test cases and other metrics for each problem. input: file in log/ output: file in log/record/
     python intermedia_analyze.py -f log/manualRemove_dataset_HumanEvalComm_model_%%i_topn_1_temperature_1.0.log_3 -n 1
+) else if "%2"=="3-1" (
+    rem # extract code and run test cases and other metrics for each problem. input: file in log/ output: file in log/record/
+    python intermedia_analyze.py -f log/manualRemove_dataset_HumanEval_model_%%i_topn_1_temperature_1.0.log_1 -n 1
 ) else if "%2"=="4" (
     rem # compute more metrics for each problem, such as test pass rate, question quality rate, comm. rate, etc. input: file in ./log/record/ output: file in ./result_data/
     python syntactic_similarity_OER.py -e manualRemove_dataset_HumanEvalComm -m %%i -t 1 -o R1 -n 1 -s 3
+) else if "%2"=="4-1" (
+    rem # compute more metrics for each problem, such as test pass rate, question quality rate, comm. rate, etc. input: file in ./log/record/ output: file in ./result_data/
+    python syntactic_similarity_OER.py -e manualRemove_dataset_HumanEval -m %%i -t 1 -o R1 -n 1 -s 3
 ) else if "%2"=="5" (
     rem # aggregate and display metrics for all problems
     python measurement_summary_draw_heatmap.py -e manualRemove -d HumanEvalComm -m %%i -t 1 -n 1
+) else if "%2"=="5-1" (
+    rem # aggregate and display metrics for all problems
+    python measurement_summary_draw_heatmap.py -e manualRemove -d HumanEval -m %%i -t 1 -n 1
 ) else if "%2"=="6" (
     python measurement_summary_draw_heatmap.py -e manualRemove -d HumanEvalComm -m %%i -t 1 -n 1 -pt prompt1a
     python measurement_summary_draw_heatmap.py -e manualRemove -d HumanEvalComm -m %%i -t 1 -n 1 -pt prompt1c
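
The three new "-1" branches mirror stages 3, 4, and 5 but run against the plain HumanEval dataset instead of HumanEvalComm. As a rough Python equivalent of what one loop iteration would execute for these branches (a sketch only: the model name stands in for the %%i loop variable, and the parts of the script that parse %1/%2 are not shown in this hunk):

import subprocess

# Placeholder for the %%i loop variable; any model name from the script's list.
model = "CodeLlama-13b-Instruct-hf"

# Stage 3-1: extract code and run test cases on the HumanEval log.
subprocess.run(["python", "intermedia_analyze.py",
                "-f", f"log/manualRemove_dataset_HumanEval_model_{model}_topn_1_temperature_1.0.log_1",
                "-n", "1"], check=True)

# Stage 4-1: compute per-problem metrics (test pass rate, question quality rate, etc.).
subprocess.run(["python", "syntactic_similarity_OER.py",
                "-e", "manualRemove_dataset_HumanEval",
                "-m", model, "-t", "1", "-o", "R1", "-n", "1", "-s", "3"], check=True)

# Stage 5-1: aggregate and display metrics for all problems.
subprocess.run(["python", "measurement_summary_draw_heatmap.py",
                "-e", "manualRemove", "-d", "HumanEval",
                "-m", model, "-t", "1", "-n", "1"], check=True)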

0 commit comments