
Commit 4ad96a0

fix codellama result issue (wrong prompt format)
Parent: a8c913f

File tree

3 files changed: +11 -2 lines

generate_response.py
intermedia_analyze.py
scripts/script_stepwise_phase123.bat


generate_response.py (+1, -1)
@@ -783,7 +783,7 @@ def generate_response(model, msgs, topn, temperature, args, open_source_model, t
         response_list.append(get_completion_starcoder('', user_input, open_source_model, tokenizer, args))
         return response_list
     elif 'Llama' in args.model or 'deepseek' in args.model or 'CodeQwen' in args.model:
-        user_input = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True, return_tensors="pt")
+        user_input = tokenizer.apply_chat_template(msgs, tokenize=False) if 'Llama' in args.model else tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True, return_tensors="pt")
         for i in range(topn):
             if 'two-shot' in args.model:
                 response_list.append(get_completion_codellama_instruct_nl_to_pl(CODELLAMA_NL_2_PL_HUMANEVAL, user_input_without_prompt, open_source_model, tokenizer, args))
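
For context, here is a minimal standalone sketch (not from the repository) of the two prompt renderings this change switches between, using Hugging Face's apply_chat_template; the model name and message content below are placeholders:

# Sketch only: compares the CodeLlama rendering (no generation prompt) with the
# rendering still used for deepseek / CodeQwen. Model and message are placeholders.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")
msgs = [{"role": "user", "content": "Write a function that reverses a string."}]

# CodeLlama path after the fix: render the chat template as-is.
llama_prompt = tokenizer.apply_chat_template(msgs, tokenize=False)

# deepseek / CodeQwen path (unchanged): also ask the template to append its
# generation prompt, i.e. the tokens that cue the assistant's reply where the
# template defines them; return_tensors is kept only to mirror the original call.
other_prompt = tokenizer.apply_chat_template(
    msgs, tokenize=False, add_generation_prompt=True, return_tensors="pt"
)

print(repr(llama_prompt))
print(repr(other_prompt))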

intermedia_analyze.py (+1, -1)
@@ -144,7 +144,7 @@ def analyze_process_HumanEval(log_file, original_prompt_file, topn):
     # TODO(jwu): we can read from HumanEval.jsonl instead of HumanEval_new.jsonl, and delete all about HumanEval_new.
     # Update: no need because HumanEval_new has 'test_case' field, which is essential.
     # with open('HumanEval/HumanEval.jsonl', 'r') as f:
-    with open('HumanEval/HumanEval_new.jsonl', 'r') as f:
+    with open('Benchmark/HumanEval_new.jsonl', 'r') as f:
         for line in f.readlines():
             problem_list.append(json.loads(line))
     # added by JW. not needed since it's just loading HumanEval problems.
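
The only change here is the path: the HumanEval_new.jsonl problem file is now read from Benchmark/ instead of HumanEval/. For reference, a minimal standalone sketch of the loading pattern in this hunk (the path follows the new line; everything else is plain JSONL boilerplate):

import json

problem_list = []
# Each line of the JSONL file is one HumanEval problem; per the TODO above,
# HumanEval_new.jsonl is used because its records carry a 'test_case' field.
with open('Benchmark/HumanEval_new.jsonl', 'r') as f:
    for line in f.readlines():
        problem_list.append(json.loads(line))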

scripts/script_stepwise_phase123.bat (+9)
@@ -39,12 +39,21 @@ for %%i in (!string_of_strings!) do (
 ) else if "%2"=="3" (
     rem # extract code and run test cases and other metrics for each problem. input: file in log/ output: file in log/record/
     python intermedia_analyze.py -f log/manualRemove_dataset_HumanEvalComm_model_%%i_topn_1_temperature_1.0.log_3 -n 1
+) else if "%2"=="3-1" (
+    rem # extract code and run test cases and other metrics for each problem. input: file in log/ output: file in log/record/
+    python intermedia_analyze.py -f log/manualRemove_dataset_HumanEval_model_%%i_topn_1_temperature_1.0.log_1 -n 1
 ) else if "%2"=="4" (
     rem # compute more metrics for each problem, such as test pass rate, question quality rate, comm. rate, etc. input: file in ./log/record/ output: file in ./result_data/
     python syntactic_similarity_OER.py -e manualRemove_dataset_HumanEvalComm -m %%i -t 1 -o R1 -n 1 -s 3
+) else if "%2"=="4-1" (
+    rem # compute more metrics for each problem, such as test pass rate, question quality rate, comm. rate, etc. input: file in ./log/record/ output: file in ./result_data/
+    python syntactic_similarity_OER.py -e manualRemove_dataset_HumanEval -m %%i -t 1 -o R1 -n 1 -s 3
 ) else if "%2"=="5" (
     rem # aggregate and display metrics for all problems
     python measurement_summary_draw_heatmap.py -e manualRemove -d HumanEvalComm -m %%i -t 1 -n 1
+) else if "%2"=="5-1" (
+    rem # aggregate and display metrics for all problems
+    python measurement_summary_draw_heatmap.py -e manualRemove -d HumanEval -m %%i -t 1 -n 1
 ) else if "%2"=="6" (
     python measurement_summary_draw_heatmap.py -e manualRemove -d HumanEvalComm -m %%i -t 1 -n 1 -pt prompt1a
     python measurement_summary_draw_heatmap.py -e manualRemove -d HumanEvalComm -m %%i -t 1 -n 1 -pt prompt1c
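
The three new "-1" branches mirror stages 3, 4, and 5 but run against the plain HumanEval dataset instead of HumanEvalComm. As a rough Python equivalent of what one loop iteration would execute for these branches (a sketch only: the model name stands in for the %%i loop variable, and the parts of the script that parse %1/%2 are not shown in this hunk):

import subprocess

# Placeholder for the %%i loop variable; any model name from the script's list.
model = "CodeLlama-13b-Instruct-hf"

# Stage 3-1: extract code and run test cases on the HumanEval log.
subprocess.run(["python", "intermedia_analyze.py",
                "-f", f"log/manualRemove_dataset_HumanEval_model_{model}_topn_1_temperature_1.0.log_1",
                "-n", "1"], check=True)

# Stage 4-1: compute per-problem metrics (test pass rate, question quality rate, etc.).
subprocess.run(["python", "syntactic_similarity_OER.py",
                "-e", "manualRemove_dataset_HumanEval",
                "-m", model, "-t", "1", "-o", "R1", "-n", "1", "-s", "3"], check=True)

# Stage 5-1: aggregate and display metrics for all problems.
subprocess.run(["python", "measurement_summary_draw_heatmap.py",
                "-e", "manualRemove", "-d", "HumanEval",
                "-m", model, "-t", "1", "-n", "1"], check=True)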

0 commit comments