|
#%%
#%%
import os

import openai
from methods.methods import *
import time
from tqdm import tqdm
import copy

### LOAD EVALUATION AND QRELS
evaluation_path = './trec/treccast/'
qrels_path = './trec/qrels/'
all_qrels = load_all_qrels(qrels_path).reset_index(drop=True)
qrels19 = all_qrels[all_qrels.year == 2019]
eval_19 = pd.read_csv('./trec/evaluation2019.csv')
eval_20 = pd.read_csv('./trec/evaluation2020.csv')
eval_21 = pd.read_csv('./trec/evaluation2021.csv')

# SECURITY: the original revision hard-coded five OpenAI secret keys in this
# file. Keys committed to source control are compromised and must be revoked.
# Read the key from the environment instead; fall back to whatever the
# `methods` import may already have configured.
openai.api_key = os.environ.get('OPENAI_API_KEY', openai.api_key)
import re

# Ordered (pattern, replacement) cleanup rules applied by sub_().
# Order matters: quoted variants (e.g. '"Current question:') must run before
# their unquoted counterparts so the leading '"' is removed too, and
# '.*answer: ' must run before 'Answer:.*'.  All patterns are raw strings
# (the original used plain literals such as '\(.*\)', which are invalid
# escape sequences and warn on Python 3.12+), and are compiled once at import
# time instead of on every call.  Two no-op empty-pattern substitutions from
# the original were dropped; everything else is kept in the original order,
# including exact duplicates (a repeated global substitution is a no-op, so
# keeping them preserves behaviour byte-for-byte).
_CLEANUP_RULES = [(re.compile(p), r) for p, r in (
    (r'\n- .*', ''),
    (r'De-contextualized rewrite under the multi-turn information-seeking dialog context:', ''),
    (r'Response:.*', ''),
    (r'\nCurrent question.*\n.*', ''),
    (r'Previous question:.*\nRewritten.*', ''),
    (r'\t.*sorry.*', ''),
    (r'"Earlier.*\.', ''),
    (r'Earlier, we.*\.', ''),
    (r'Keywords added:.*', ''),
    (r'"keywords:.*', ''),
    (r'Response:.*', ''),
    (r'User: .*', ''),
    (r'AI assistant:.*', ''),
    (r'Response: .*', ''),
    (r'"Current question:.*', ''),
    (r'Current question:.*', ''),
    (r'"Context: ', ''),
    (r'Context: ', ''),
    (r'"Reformulated question:', ''),
    (r'Reformulated question:', ''),
    (r'Reformulated question : ', ''),
    (r'Reformulated question: ', ''),
    (r'"Rephrased question: ', ''),
    (r'Rephrased question: ', ''),
    (r'"Request for conversational system: .*\n\nRewritten request: "', ''),
    (r'Request for conversational system:.*\n\nRewritten request: "', ''),
    (r'Previous keywords: ', ''),
    (r'"From the previous question:.*', ''),
    (r'From the previous question:.*', ''),
    (r'"Previous context:.*', ''),
    (r'Previous context:.*', ''),
    (r'"Keywords: ', ''),
    (r'Keywords: ', ''),
    (r'\n\n', ''),
    (r'"Search keywords: ', ''),
    (r'Search keywords: ', ''),
    (r'Prompt: ', ''),
    (r'Prompt for search engine: ', ''),
    (r'Search Engine Prompt: ', ''),
    (r"I'm sorry, but your current question", ''),
    (r'lacks sufficient context .*', ''),
    (r'Query for a search engine: ', ''),
    (r'Search prompt: ', ''),
    (r'Search engine prompt: ', ''),
    (r'Search engine prompt:', ''),
    (r'Rewritten question: ', ''),
    (r'Rewritten question:', ''),
    (r'Request for a retrieval system: ', ''),
    (r'Request for a retrieval system:', ''),
    (r'"Request for clarification: ', ''),
    (r'Request for clarification:', ''),
    (r'Request: ', ''),
    (r'"Request for clarification: ', ''),
    (r'.*answer: ', ''),
    (r'Answer:.*', ''),
    (r'Question: ', ''),
    (r'"OP:.*', ''),
    (r'Request for retrieval system: ', ''),
    (r'Revised question: ', ''),
    # NOTE: the unescaped '?' quantifies the final 'n' (matches
    # '...questio' or '...question'); kept as-is to preserve behaviour.
    (r'Could you please clarify your question?', ''),
    (r'Building on the previous questions:', ''),
    (r'Building on the previous questions', ''),
    (r'Building on previously asked questions ', ''),
    (r'Reformulated question in a multi-turn information-seeking dialog context: ', ''),
    (r'Rewritten: ', ''),
    (r'Reformulated: ', ''),
    (r'Revised: ', ''),
    (r'\(.*\)', ''),          # drop parenthesised asides (greedy)
    (r'"', ''),
    (r'"\n', ''),
    (r'\?.*', '?'),           # truncate everything after the first '?'
    (r"I'm sorry, but .*", ''),
)]


def sub_(x):
    """Strip chat-model boilerplate from a rewritten query.

    Applies every rule in ``_CLEANUP_RULES`` in order (prefixes like
    'Rewritten question:', apology phrases, quoted wrappers, parenthesised
    asides, trailing text after the first '?') and returns the cleaned string.
    """
    for pattern, replacement in _CLEANUP_RULES:
        x = pattern.sub(replacement, x)
    return x
| 101 | + |
| 102 | + |
def sustitube(x):
    """Remove parenthesised asides (parentheses included) from *x*.

    The pattern is greedy, so with several groups everything from the first
    '(' to the last ')' is removed in one span.  (Regex is now a raw string;
    the original plain literal ``"\\(.*\\)"`` is an invalid escape sequence
    that warns on Python 3.12+.)
    """
    return re.sub(r"\(.*\)", "", x)
| 106 | + |
| 107 | + |
def chatgpt(messages: list, model='gpt-3.5-turbo'):
    """Send a chat history to the OpenAI ChatCompletion endpoint.

    Returns the text content of the first returned choice.
    NOTE(review): uses the legacy openai<1.0 `ChatCompletion` interface.
    """
    completion = openai.ChatCompletion.create(model=model, messages=messages)
    first_choice = completion.choices[0]
    return first_choice['message']['content']
| 113 | + |
| 114 | + |
# Few-shot example data: manually rewritten utterances from TREC CAsT 2020
# and 2019, used by chatgpt_for_df() to build in-context examples.
# NOTE(review): absolute /data4/... paths are machine-specific — confirm they
# exist before running.
_2020 = create_df_from_json(pd.read_json('/data4/guidorocchietti/GPT_clean/trec/treccast/2020_manual_evaluation_topics_v1.0.json'))
# 'number' is formatted '<conv_id>_<turn>'; split it into two string columns.
_2020['conv_id'] = [x.split('_')[0] for x in _2020.number]
_2020['turn'] = [x.split('_')[1] for x in _2020.number]
qrels20 =load_qrels('/data4/guidorocchietti/GPT_clean/trec/qrels/2020qrels.txt')

# 2019 evaluation joined with its manual rewrites on qid.
_2019 = pd.read_csv('/data4/guidorocchietti/GPT_clean/trec/evaluation2019.csv')
manual_2019 = pd.read_csv('/data4/guidorocchietti/GPT_clean/trec/treccast/test_manual_utterance.tsv', sep = '\t', names=['qid','manual_rewritten_utterance'])
_2019 = _2019.merge(manual_2019,on='qid')
# %%
# Dead code: an earlier prompt-building experiment kept as a module-level
# string literal — it is never executed. Candidate for deletion.
'''
system_text = 'In a multi-turn dialog system, rewrite the given sentence to be self-explanatory. Use elements of the previous sentences to generate better sentences.'
messages= [{"role": "system", "content": system_text}]
example = []
conv = _2020[_2020.conv.isin(['81','82'])]

for convid in conv.conv.unique():
    part = conv[conv.conv == convid]
    for turn in part.turn:
        if int(turn)<7:
            if convid =='82' and turn =='1':
                example.append({"role": "user", "content": 'New conversation.'})
            example.append({"role": "user", "content": part[(part.turn == turn)].raw_utterance.iloc[0]})
            example.append({"role": "assistant", "content": part[(part.turn == turn)].manual_rewritten_utterance.iloc[0]})

messages += example
'''
| 141 | + |
def create_messages(system_text, command, current_utterance, previous_utterances=None, previous_outputs=None, previous_in_current=False, prompt_in_input=False):
    """Assemble a ChatCompletion ``messages`` list for one rewriting request.

    Parameters
    ----------
    system_text : str
        The instruction prompt. Sent as a system message, unless
        ``prompt_in_input`` is True, in which case it is prepended to the
        final user message instead.
    command : str
        Prefix placed before each utterance sent to the model.
    current_utterance : str
        The utterance to be rewritten.
    previous_utterances, previous_outputs : list[str] | None
        Conversation history (inputs and the model's earlier rewrites).
    previous_in_current : bool
        When True, the raw history is also folded into the final user
        message as a 'Previous context:' prefix.
    prompt_in_input : bool
        When True, no system message is emitted.

    Returns
    -------
    list[dict] : role/content message dicts ready for the API.
    """
    # None defaults instead of the original's mutable-default-argument lists.
    previous_utterances = [] if previous_utterances is None else previous_utterances
    previous_outputs = [] if previous_outputs is None else previous_outputs

    messages = [] if prompt_in_input else [{"role": "system", "content": system_text}]

    if previous_in_current:
        new_current = 'Previous context:' if previous_utterances else ''
        for prev_in, prev_out in zip(previous_utterances, previous_outputs):
            messages.append({"role": "user", "content": prev_in})
            messages.append({"role": "assistant", "content": prev_out})
            new_current += f"{prev_in} "
        new_current += command + current_utterance
        if prompt_in_input:
            new_current = system_text + new_current
        messages.append({"role": "user", "content": new_current})
    else:
        for prev_in, prev_out in zip(previous_utterances, previous_outputs):
            messages.append({"role": "user", "content": command + prev_in})
            messages.append({"role": "assistant", "content": prev_out})
        # The original had an if/else here whose two branches were identical;
        # collapsed into one statement.
        messages.append({"role": "user", "content": system_text + " " + command + current_utterance})
    return messages
| 169 | + |
| 170 | + |
def chatgpt_for_df(evaluation, system_text = 'In a multi-turn dialog system, rewrite the given sentence to be self-explanatory following the pattern of the previous interactions.', year=2019):
    """Rewrite every utterance in `evaluation` with ChatGPT, per conversation.

    Parameters
    ----------
    evaluation : DataFrame with columns conv_id, qid, raw_utterance.
    system_text : str
        System prompt; also prepended to every current utterance.
    year : int
        2019 or 2020; selects which held-out conversations supply the
        few-shot examples (2019 uses CAsT-2020 convs '81'/'82', 2020 uses
        CAsT-2019 convs 31/32).

    Returns
    -------
    (rewrites, prompts) : tuple of DataFrames — `rewrites` has columns
    qid/query/current_input, `prompts` records the exact message list sent
    for every qid (useful for auditing).
    """
    rate_limit_per_minute = 20
    delay = 60.0 / rate_limit_per_minute  # self-imposed API pacing

    # --- build few-shot examples from the *other* year's manual rewrites ---
    if year == 2019:
        example_pool = _2020[_2020.conv_id.isin(['81', '82'])]
        conv_id_stop = '82'
    elif year == 2020:
        example_pool = _2019[_2019.conv_id.isin([31, 32])]
        conv_id_stop = 32
    # NOTE(review): any other `year` raises NameError below — same as original.

    starting_message = [{"role": "system", "content": system_text}]
    for convid in example_pool.conv_id.unique():
        part = example_pool[example_pool.conv_id == convid]
        for turn in part.turn:
            if int(turn) < 9:
                # Mark the boundary between the two example conversations.
                # NOTE(review): `turn == '1'` assumes string turns; on the
                # 2020 path `_2019.turn` may be numeric — confirm.
                if convid == conv_id_stop and turn == '1':
                    starting_message.append({"role": "user", "content": 'New conversation.'})
                starting_message.append({"role": "user", "content": part[(part.turn == turn)].raw_utterance.iloc[0]})
                starting_message.append({"role": "assistant", "content": part[(part.turn == turn)].manual_rewritten_utterance.iloc[0]})

    dictionary = {'qid': {}, 'query': {}, 'current_input': {}}
    prompts = {'qid': {}, 'prompt': {}}
    ind = 0
    for conv_id in tqdm(evaluation.conv_id.unique()):
        conv = evaluation[evaluation.conv_id == conv_id]
        previous_utterances = []
        previous_outputs = []
        for qid in conv.qid:
            response = None
            # Up to 4 attempts with linearly increasing back-off
            # (delay*1 .. delay*4). Replaces the original's four copy-pasted
            # nested try/except blocks, the outermost of which used a bare
            # `except:`.
            for attempt in range(1, 5):
                time.sleep(delay * attempt)
                try:
                    current_utterance = conv[conv.qid == qid].raw_utterance.iloc[0]
                    message = copy.copy(starting_message)
                    for prev_in, prev_out in zip(previous_utterances, previous_outputs):
                        message.append({"role": "user", "content": prev_in})
                        # Earlier model output is cleaned of parenthesised
                        # asides before being replayed as history.
                        message.append({"role": "assistant", "content": sustitube(prev_out)})
                    message += [{"role": "user", "content": system_text + current_utterance}]
                    prompts['qid'][ind] = qid
                    prompts['prompt'][ind] = message
                    response = chatgpt(message)
                    break
                except Exception as exc:
                    print(exc)
            if response is None:
                # All retries failed: skip this qid (ind is not advanced,
                # matching the original behaviour).
                print('end of cycle, missing qid : ', qid)
                continue
            previous_utterances.append(current_utterance)
            previous_outputs.append(response)
            dictionary['qid'][ind] = qid
            dictionary['query'][ind] = response
            dictionary['current_input'][ind] = message[-1]['content']
            ind += 1
        # The original appended 'New conversation.' to a throwaway local list
        # here; it never reached the API, so it is intentionally dropped.

    return pd.DataFrame(dictionary), pd.DataFrame(prompts)
| 295 | + |
# %%

# Driver: for every candidate prompt in top5_prompts.csv, rewrite the 2019
# evaluation set and persist both the rewrites and the exact prompts used.
# NOTE(review): `os` and `pd` are never imported explicitly in this file —
# presumably supplied by `from methods.methods import *`; confirm.
prompts = pd.read_csv('./top5_prompts.csv',sep=';')
for i in range(len(prompts)):

    name = prompts['id'].iloc[i]
    text = prompts['text'].iloc[i]
    # Resume-friendly: skip prompts whose 2019 output file already exists.
    if not(os.path.isfile(f'/data4/guidorocchietti/GPT_clean/ultima_prova/rewritings/rewritten/{name}_2019.tsv')):
        print('Name : ',name)
        print('Text : ',text)

        output, prompt = chatgpt_for_df(eval_19,system_text=text,year=2019)
        # Rewrites as TREC-style qid/query TSV; full prompts kept for audit.
        output[['qid','query']].to_csv(f'/data4/guidorocchietti/GPT_clean/ultima_prova/rewritings/rewritten/{name}_2019.tsv', sep = '\t', index=False)
        prompt.to_csv(f'/data4/guidorocchietti/GPT_clean/ultima_prova/rewritings/prompts/{name}_2019.csv')
    #output, prompt = chatgpt_for_df(eval_20,system_text=text,year=2020)
    #output[['qid','query']].to_csv(f'/data4/guidorocchietti/GPT_clean/ultima_prova/rewritings/rewritten/{name}_2020.tsv', sep = '\t')
    #prompt.to_csv(f'/data4/guidorocchietti/GPT_clean/ultima_prova/rewritings/prompts/{name}_2020.csv')
#%%
#output, prompt = chatgpt_for_df(eval_20)
#output[['qid','query']].to_csv('/data4/guidorocchietti/GPT_clean/ultima_prova/rewritings/rewritten/Example_in_history_2020.tsv', sep = '\t', index=False)
#prompt.to_csv('/data4/guidorocchietti/GPT_clean/ultima_prova/rewritings/prompts/Example_in_history_2020.csv')
# %%
0 commit comments