Skip to content

Commit 72aa233

Browse files
committed
Update
1 parent a671c9a commit 72aa233

9 files changed

+66680
-80
lines changed

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
/database
22
.sql
33
.sqlite
4-
eval/data/database
4+
eval/data/database
5+
.DS_Store

data/check_tokens.py

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import json


def estimate_token_count(text):
    """Estimate the token count of ``text``.

    Uses the rough heuristic of ~3.6 characters per token, rounded to the
    nearest integer via ``round`` (note: the original comment claimed
    "rounding up", but the code has always rounded to nearest).
    """
    return round(len(text) / 3.6)


def summarize_token_counts(data):
    """Compute token-count statistics over the ``'text'`` field of ``data``.

    Args:
        data: a non-empty list of dicts, each with a ``'text'`` string key.

    Returns:
        A tuple ``(longest_token_count, average_token_count, longest_text)``.

    Raises:
        ValueError: if ``data`` is empty (instead of a ZeroDivisionError).
    """
    if not data:
        raise ValueError("dataset is empty")

    longest_token_count = 0
    total_token_count = 0
    longest_text = ""

    for item in data:
        # NOTE(review): the original comment called this the 'train_instruct'
        # field, but the key actually read is 'text'.
        text = item['text']

        token_count = estimate_token_count(text)
        total_token_count += token_count

        # Track the single longest entry seen so far.
        if token_count > longest_token_count:
            longest_token_count = token_count
            longest_text = text

    average_token_count = total_token_count / len(data)
    return longest_token_count, average_token_count, longest_text


def main():
    """Load the skeleton training set and print token-count statistics."""
    # Load the json data
    with open('train_sql_skeleton.json') as f:
        data = json.load(f)

    longest_token_count, average_token_count, longest_text = summarize_token_counts(data)

    print(f"The longest token count for 'train_instruct' in the dataset is {longest_token_count}.")
    print(f"The average token count for 'train_instruct' in the dataset is {average_token_count}.")
    print(f"The longest text for 'train_instruct' in the dataset is {longest_text}.")


if __name__ == "__main__":
    main()

data/train_sql.json

+28,002
Large diffs are not rendered by default.

data/train_sql_skeleton.json

+28,002
Large diffs are not rendered by default.

data/validation_sql.json

+5,172
Large diffs are not rendered by default.

data/validation_sql_skeleton.json

+5,172
Large diffs are not rendered by default.

eval/generate_predict_eval.ipynb

+9-17
Original file line numberDiff line numberDiff line change
@@ -10,47 +10,39 @@
1010
},
1111
{
1212
"cell_type": "code",
13-
"execution_count": 10,
13+
"execution_count": 3,
1414
"metadata": {},
1515
"outputs": [
1616
{
1717
"name": "stderr",
1818
"output_type": "stream",
1919
"text": [
20-
"Found cached dataset json (/Users/richardroberson/.cache/huggingface/datasets/richardr1126___json/richardr1126--spider-natsql-context-validation-b246ae2fc7e9e5cb/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)\n"
20+
"Found cached dataset json (/Users/richardroberson/.cache/huggingface/datasets/richardr1126___json/richardr1126--spider-context-validation-8fba68e4e3727374/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)\n"
2121
]
2222
},
2323
{
2424
"data": {
2525
"application/vnd.jupyter.widget-view+json": {
26-
"model_id": "642df2eb63a3414782afd0b0074db10a",
26+
"model_id": "3f604c3cd41a4f3ba02e0405f22bb98d",
2727
"version_major": 2,
2828
"version_minor": 0
2929
},
3030
"text/plain": [
31-
"Generating responses: 0%| | 0/498 [00:00<?, ?it/s]"
31+
"Generating responses: 0%| | 0/1027 [00:00<?, ?it/s]"
3232
]
3333
},
3434
"metadata": {},
3535
"output_type": "display_data"
3636
},
3737
{
38-
"ename": "JSONDecodeError",
39-
"evalue": "Expecting value: line 1 column 1 (char 0)",
38+
"ename": "KeyError",
39+
"evalue": "'results'",
4040
"output_type": "error",
4141
"traceback": [
4242
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
43-
"\u001b[0;31mJSONDecodeError\u001b[0m Traceback (most recent call last)",
44-
"File \u001b[0;32m~/miniforge3/envs/llm/lib/python3.11/site-packages/requests/models.py:971\u001b[0m, in \u001b[0;36mResponse.json\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 970\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 971\u001b[0m \u001b[39mreturn\u001b[39;00m complexjson\u001b[39m.\u001b[39;49mloads(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mtext, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 972\u001b[0m \u001b[39mexcept\u001b[39;00m JSONDecodeError \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m 973\u001b[0m \u001b[39m# Catch JSON-related errors and raise as requests.JSONDecodeError\u001b[39;00m\n\u001b[1;32m 974\u001b[0m \u001b[39m# This aliases json.JSONDecodeError and simplejson.JSONDecodeError\u001b[39;00m\n",
45-
"File \u001b[0;32m~/miniforge3/envs/llm/lib/python3.11/json/__init__.py:346\u001b[0m, in \u001b[0;36mloads\u001b[0;34m(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[0m\n\u001b[1;32m 343\u001b[0m \u001b[39mif\u001b[39;00m (\u001b[39mcls\u001b[39m \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m object_hook \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m\n\u001b[1;32m 344\u001b[0m parse_int \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m parse_float \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m\n\u001b[1;32m 345\u001b[0m parse_constant \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m object_pairs_hook \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m \u001b[39mnot\u001b[39;00m kw):\n\u001b[0;32m--> 346\u001b[0m \u001b[39mreturn\u001b[39;00m _default_decoder\u001b[39m.\u001b[39;49mdecode(s)\n\u001b[1;32m 347\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mcls\u001b[39m \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n",
46-
"File \u001b[0;32m~/miniforge3/envs/llm/lib/python3.11/json/decoder.py:337\u001b[0m, in \u001b[0;36mJSONDecoder.decode\u001b[0;34m(self, s, _w)\u001b[0m\n\u001b[1;32m 333\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"Return the Python representation of ``s`` (a ``str`` instance\u001b[39;00m\n\u001b[1;32m 334\u001b[0m \u001b[39mcontaining a JSON document).\u001b[39;00m\n\u001b[1;32m 335\u001b[0m \n\u001b[1;32m 336\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[0;32m--> 337\u001b[0m obj, end \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mraw_decode(s, idx\u001b[39m=\u001b[39;49m_w(s, \u001b[39m0\u001b[39;49m)\u001b[39m.\u001b[39;49mend())\n\u001b[1;32m 338\u001b[0m end \u001b[39m=\u001b[39m _w(s, end)\u001b[39m.\u001b[39mend()\n",
47-
"File \u001b[0;32m~/miniforge3/envs/llm/lib/python3.11/json/decoder.py:355\u001b[0m, in \u001b[0;36mJSONDecoder.raw_decode\u001b[0;34m(self, s, idx)\u001b[0m\n\u001b[1;32m 354\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mStopIteration\u001b[39;00m \u001b[39mas\u001b[39;00m err:\n\u001b[0;32m--> 355\u001b[0m \u001b[39mraise\u001b[39;00m JSONDecodeError(\u001b[39m\"\u001b[39m\u001b[39mExpecting value\u001b[39m\u001b[39m\"\u001b[39m, s, err\u001b[39m.\u001b[39mvalue) \u001b[39mfrom\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m 356\u001b[0m \u001b[39mreturn\u001b[39;00m obj, end\n",
48-
"\u001b[0;31mJSONDecodeError\u001b[0m: Expecting value: line 1 column 1 (char 0)",
49-
"\nDuring handling of the above exception, another exception occurred:\n",
50-
"\u001b[0;31mJSONDecodeError\u001b[0m Traceback (most recent call last)",
51-
"Cell \u001b[0;32mIn[10], line 30\u001b[0m\n\u001b[1;32m 27\u001b[0m headers \u001b[39m=\u001b[39m {\u001b[39m\"\u001b[39m\u001b[39mContent-Type\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39m\"\u001b[39m\u001b[39mapplication/json\u001b[39m\u001b[39m\"\u001b[39m}\n\u001b[1;32m 29\u001b[0m response \u001b[39m=\u001b[39m requests\u001b[39m.\u001b[39mpost(url, json\u001b[39m=\u001b[39mpayload, headers\u001b[39m=\u001b[39mheaders)\n\u001b[0;32m---> 30\u001b[0m response_text \u001b[39m=\u001b[39m response\u001b[39m.\u001b[39;49mjson()[\u001b[39m\"\u001b[39m\u001b[39mresults\u001b[39m\u001b[39m\"\u001b[39m][\u001b[39m0\u001b[39m][\u001b[39m\"\u001b[39m\u001b[39mtext\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[1;32m 31\u001b[0m response_text \u001b[39m=\u001b[39m response_text\u001b[39m.\u001b[39mreplace(\u001b[39m\"\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 33\u001b[0m \u001b[39m# append the result to 'results.txt'\u001b[39;00m\n",
52-
"File \u001b[0;32m~/miniforge3/envs/llm/lib/python3.11/site-packages/requests/models.py:975\u001b[0m, in \u001b[0;36mResponse.json\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 971\u001b[0m \u001b[39mreturn\u001b[39;00m complexjson\u001b[39m.\u001b[39mloads(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mtext, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[1;32m 972\u001b[0m \u001b[39mexcept\u001b[39;00m JSONDecodeError \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m 973\u001b[0m \u001b[39m# Catch JSON-related errors and raise as requests.JSONDecodeError\u001b[39;00m\n\u001b[1;32m 974\u001b[0m \u001b[39m# This aliases json.JSONDecodeError and simplejson.JSONDecodeError\u001b[39;00m\n\u001b[0;32m--> 975\u001b[0m \u001b[39mraise\u001b[39;00m RequestsJSONDecodeError(e\u001b[39m.\u001b[39mmsg, e\u001b[39m.\u001b[39mdoc, e\u001b[39m.\u001b[39mpos)\n",
53-
"\u001b[0;31mJSONDecodeError\u001b[0m: Expecting value: line 1 column 1 (char 0)"
43+
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
44+
"Cell \u001b[0;32mIn[3], line 30\u001b[0m\n\u001b[1;32m 27\u001b[0m headers \u001b[39m=\u001b[39m {\u001b[39m\"\u001b[39m\u001b[39mContent-Type\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39m\"\u001b[39m\u001b[39mapplication/json\u001b[39m\u001b[39m\"\u001b[39m}\n\u001b[1;32m 29\u001b[0m response \u001b[39m=\u001b[39m requests\u001b[39m.\u001b[39mpost(url, json\u001b[39m=\u001b[39mpayload, headers\u001b[39m=\u001b[39mheaders)\n\u001b[0;32m---> 30\u001b[0m response_text \u001b[39m=\u001b[39m response\u001b[39m.\u001b[39;49mjson()[\u001b[39m\"\u001b[39;49m\u001b[39mresults\u001b[39;49m\u001b[39m\"\u001b[39;49m][\u001b[39m0\u001b[39m][\u001b[39m\"\u001b[39m\u001b[39mtext\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[1;32m 31\u001b[0m response_text \u001b[39m=\u001b[39m response_text\u001b[39m.\u001b[39mreplace(\u001b[39m\"\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 33\u001b[0m \u001b[39m# append the result to 'results.txt'\u001b[39;00m\n",
45+
"\u001b[0;31mKeyError\u001b[0m: 'results'"
5446
]
5547
}
5648
],
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
1-
import json
2-
31
import json
42
import argparse
53

6-
def process_dataset(input_dataset_path, output_dataset_path, mode):
4+
def process_dataset(input_dataset_path, output_dataset_path, mode, sql_type, use_skeleton):
75
# Load the input dataset
86
dataset = json.load(open(input_dataset_path, "r"))
97
output_dataset = []
@@ -21,45 +19,47 @@ def process_dataset(input_dataset_path, output_dataset_path, mode):
2119
for fk in data["fk"]:
2220
input_sequence += fk["source_table_name_original"]+"."+fk["source_column_name_original"]+" = "+fk["target_table_name_original"]+"."+fk["target_column_name_original"] + " | "
2321

24-
output_sequence = data["natsql"]
22+
if sql_type == "natsql":
23+
output_sequence = data["natsql_skeleton"] + " | " + data["natsql"] if use_skeleton else data["natsql"]
24+
else: # regular sql
25+
output_sequence = data["sql_skeleton"] + " | " + data["norm_sql"] if use_skeleton else data["norm_sql"]
2526

2627
# Generate text for training mode, prompt and ground_truth for validation mode
2728
if mode == "train":
28-
text = f"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n\nConvert text to NatSQL: " + input_sequence + "\n\n" + "### Response:\n\n" + output_sequence
29+
text = f"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n\nConvert text to {sql_type}: " + input_sequence + "\n\n" + "### Response:\n\n" + output_sequence
2930
output_dataset.append({
3031
"db_id": db_id,
3132
"text": text,
32-
#"tc_original": tc_original
3333
})
3434
else: # validation mode
35-
prompt = f"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n\nConvert text to NatSQL: " + input_sequence + "\n\n### Response:\n\n"
35+
prompt = f"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n\nConvert text to {sql_type}: " + input_sequence + "\n\n### Response:\n\n"
3636
ground_truth = output_sequence
3737
output_dataset.append({
3838
"db_id": db_id,
3939
"prompt": prompt,
4040
"ground_truth": ground_truth,
41-
#"tc_original": tc_original
4241
})
4342

4443
# Save the output dataset
4544
with open(output_dataset_path, "w") as f:
4645
json.dump(output_dataset, f, indent=2, ensure_ascii=False)
4746

48-
def main(mode):
47+
def main(mode, sql_type, use_skeleton):
4948
if mode == "train":
50-
process_dataset("./data/preprocessed/preprocessed_train_spider_natsql.json", "./data/train.json", "train")
49+
process_dataset("./data/preprocessed/preprocessed_train_spider_natsql.json", f"./data/train_{sql_type}{'_skeleton' if use_skeleton else ''}.json", mode, sql_type, use_skeleton)
5150
elif mode == "validation":
52-
process_dataset("./data/preprocessed/preprocessed_dev_natsql.json", "./data/validation.json", "validation")
51+
process_dataset("./data/preprocessed/preprocessed_dev_natsql.json", f"./data/validation_{sql_type}{'_skeleton' if use_skeleton else ''}.json", mode, sql_type, use_skeleton)
5352
elif mode == "both":
54-
process_dataset("./data/preprocessed/preprocessed_train_spider_natsql.json", "./data/train.json", "train")
55-
process_dataset("./data/preprocessed/preprocessed_dev_natsql.json", "./data/validation.json", "validation")
53+
process_dataset("./data/preprocessed/preprocessed_train_spider_natsql.json", f"./data/train_{sql_type}{'_skeleton' if use_skeleton else ''}.json", "train", sql_type, use_skeleton)
54+
process_dataset("./data/preprocessed/preprocessed_dev_natsql.json", f"./data/validation_{sql_type}{'_skeleton' if use_skeleton else ''}.json", "validation", sql_type, use_skeleton)
5655
else:
5756
print("Specify mode flag with `--mode [train / validation / both].")
5857

5958
if __name__ == "__main__":
6059
parser = argparse.ArgumentParser()
61-
parser.add_argument('--mode', type=str, required=True, help="Specify mode flag with `--mode [train / validation / both].")
60+
parser.add_argument('--mode', type=str, default="both", help="Specify mode flag with `--mode [train / validation / both].")
61+
parser.add_argument('--sql_type', type=str, required=True, help="Specify SQL type with `--sql_type [natsql / sql].")
62+
parser.add_argument('--skeleton', action='store_true', default=False, help="Use SQL skeleton in the output sequence.")
6263
args = parser.parse_args()
6364

64-
main(args.mode)
65-
65+
main(args.mode, args.sql_type, args.skeleton)

0 commit comments

Comments
 (0)