
Commit a671c9a

Init

91 files changed: +3,070,945 -0 lines changed


.gitignore (+4)

```
/database
.sql
.sqlite
eval/data/database
```

README.md (+21)

#### Generate training data command:
```
python generate_finetuning_data.py --mode both
```

#### Convert HF to GGML command:
```
python convert-hf-to-ggml.py [HF_MODEL_NAME]
```

#### Quantize WizardCoder model command:
```
./starcoder-quantize ./models/[HF_MODEL_NAME]-ggml.bin [HF_MODEL_NAME]-q4_0.bin 2
```

#### Evaluate:
- Use the `generate_predict_eval.ipynb` notebook to generate predictions.
```
cd eval
python evaluation.py --input [NatSQL skeleton + predicted NatSQL file]
```

convert-hf-to-ggml.py (+212)

```python
# Convert HF models to ggml format

import sys
import struct
import re
import os

import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig

# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
def bytes_to_unicode():
    """
    Returns a dict mapping each utf-8 byte to a printable unicode string.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    This also avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))
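
# Illustrative property check (sketch only, not used by the converter): the
# mapping covers all 256 byte values and is exactly reversible, which is what
# makes byte_decoder below well-defined.
#   enc = bytes_to_unicode()
#   dec = {v: k for k, v in enc.items()}
#   assert len(enc) == 256 and all(dec[s] == b for b, s in enc.items())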

if len(sys.argv) < 2:
    print("Usage: python convert-hf-to-ggml.py hf-model-name [use-f32]")
    print("Example: python convert-hf-to-ggml.py bigcode/gpt_bigcode-santacoder")
    print("Example: python convert-hf-to-ggml.py bigcode/starcoder")
    sys.exit(1)

model_name = sys.argv[1].strip()
fname_out = "models/" + model_name + "-ggml.bin"
os.makedirs(os.path.dirname(fname_out), exist_ok=True)

# use 16-bit floats by default; any second argument selects 32-bit
use_f16 = True
if len(sys.argv) > 2:
    use_f16 = False

print("Loading model:", model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
hparams = config.to_dict()
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    config=config,
    torch_dtype=torch.float16 if use_f16 else torch.float32,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    offload_state_dict=True,
)
print("Model loaded:", model_name)

list_vars = model.state_dict()

encoder = tokenizer.vocab
# Add added_tokens (special tokens) to the encoder
encoder.update(tokenizer.get_added_vocab())
print(hparams)

print("Saving ggml model to:", fname_out)
fout = open(fname_out, "wb")

fout.write(struct.pack("i", 0x67676D6C))  # magic: "ggml" in hex
vocab_size = hparams["vocab_size"]
fout.write(struct.pack("i", vocab_size))
fout.write(struct.pack("i", hparams["n_positions"]))
fout.write(struct.pack("i", hparams["n_embd"]))
fout.write(struct.pack("i", hparams["n_head"]))
fout.write(struct.pack("i", hparams["n_layer"]))
fout.write(struct.pack("i", use_f16))

byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}

fout.write(struct.pack("i", vocab_size))

# write the vocabulary, sorted by token id
counter = 0
for key in sorted(encoder, key=encoder.get):
    text = bytearray([byte_decoder[c] for c in key])
    fout.write(struct.pack("i", len(text)))
    fout.write(text)
    counter += 1

# pad the token list to vocab_size by repeating the last token
while counter < vocab_size:
    fout.write(struct.pack("i", len(text)))
    fout.write(text)
    counter += 1

for name in list_vars.keys():
    data = list_vars[name].squeeze().numpy()
    print("Processing variable: " + name + " with shape:", data.shape)

    # rename tensors to keep compatibility with the ggml loader
    if name == "transformer.ln_f.weight":
        name = "model/ln_f/g"
    elif name == "transformer.ln_f.bias":
        name = "model/ln_f/b"
    elif name == "transformer.wte.weight":
        name = "model/wte"
    elif name == "transformer.wpe.weight":
        name = "model/wpe"
    elif name == "lm_head.weight":
        name = "model/lm_head"
    elif re.match(r"transformer\.h\.\d+\.ln_1\.weight", name):
        i = re.findall(r"\d+", name)[0]
        name = f"model/h{i}/ln_1/g"
    elif re.match(r"transformer\.h\.\d+\.ln_1\.bias", name):
        i = re.findall(r"\d+", name)[0]
        name = f"model/h{i}/ln_1/b"
    elif re.match(r"transformer\.h\.\d+\.attn\.c_attn\.weight", name):
        i = re.findall(r"\d+", name)[0]
        name = f"model/h{i}/attn/c_attn/w"
    elif re.match(r"transformer\.h\.\d+\.attn\.c_attn\.bias", name):
        i = re.findall(r"\d+", name)[0]
        name = f"model/h{i}/attn/c_attn/b"
    elif re.match(r"transformer\.h\.\d+\.attn\.c_proj\.weight", name):
        i = re.findall(r"\d+", name)[0]
        name = f"model/h{i}/attn/c_proj/w"
    elif re.match(r"transformer\.h\.\d+\.attn\.c_proj\.bias", name):
        i = re.findall(r"\d+", name)[0]
        name = f"model/h{i}/attn/c_proj/b"
    elif re.match(r"transformer\.h\.\d+\.ln_2\.weight", name):
        i = re.findall(r"\d+", name)[0]
        name = f"model/h{i}/ln_2/g"
    elif re.match(r"transformer\.h\.\d+\.ln_2\.bias", name):
        i = re.findall(r"\d+", name)[0]
        name = f"model/h{i}/ln_2/b"
    elif re.match(r"transformer\.h\.\d+\.mlp\.c_fc\.weight", name):
        i = re.findall(r"\d+", name)[0]
        name = f"model/h{i}/mlp/c_fc/w"
    elif re.match(r"transformer\.h\.\d+\.mlp\.c_fc\.bias", name):
        i = re.findall(r"\d+", name)[0]
        name = f"model/h{i}/mlp/c_fc/b"
    elif re.match(r"transformer\.h\.\d+\.mlp\.c_proj\.weight", name):
        i = re.findall(r"\d+", name)[0]
        name = f"model/h{i}/mlp/c_proj/w"
    elif re.match(r"transformer\.h\.\d+\.mlp\.c_proj\.bias", name):
        i = re.findall(r"\d+", name)[0]
        name = f"model/h{i}/mlp/c_proj/b"
    else:
        print("Unrecognized variable name:", name)

    # we don't need these
    if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"):
        print("  Skipping variable: " + name)
        continue

    n_dims = len(data.shape)

    # ftype == 0 -> float32, ftype == 1 -> float16
    ftype = 0
    if use_f16:
        # 2-D "/w" weights kept in f16: model/wte, model/lm_head, and the
        # per-layer model/h*/attn/c_attn/w, model/h*/attn/c_proj/w,
        # model/h*/mlp/c_fc/w, model/h*/mlp/c_proj/w
        if (name == "model/wte" or name == "model/lm_head" or name[-2:] == "/g" or name[-2:] == "/w") and n_dims == 2:
            print("  Converting to float16")
            data = data.astype(np.float16)
            ftype = 1
        else:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype = 0

    if name[-14:] == "/attn/c_attn/w" or name[-14:] == "/attn/c_attn/b":
        print("  Duplicate K,V heads to use MHA instead of MQA")

        embed_dim = hparams["n_embd"]
        head_dim = embed_dim // hparams["n_head"]

        # ((n_head + 2) * head_dim, hidden_dim) -> (3 * n_head * head_dim, hidden_dim)
        q, k, v = np.split(data, (hparams["n_head"] * head_dim, (hparams["n_head"] + 1) * head_dim), axis=0)
        # duplicate k, v along the first axis: (head_dim, hidden_dim) -> (n_head * head_dim, hidden_dim)
        if len(k.shape) == 2:
            k = np.tile(k, (hparams["n_head"], 1))
            v = np.tile(v, (hparams["n_head"], 1))
        elif len(k.shape) == 1:
            k = np.tile(k, hparams["n_head"])
            v = np.tile(v, hparams["n_head"])
        # concat q, k, v along the first axis -> (3 * n_head * head_dim, hidden_dim)
        data = np.concatenate((q, k, v), axis=0)

    # tensor header: n_dims, name length, ftype, then dims (innermost first) and the name
    name_bytes = name.encode("utf-8")
    fout.write(struct.pack("iii", n_dims, len(name_bytes), ftype))
    for i in range(n_dims):
        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
    fout.write(name_bytes)

    # tensor data
    data.tofile(fout)

fout.close()

print("Done. Output file: " + fname_out)
print("")
```

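Because the output layout exists only implicitly in the `struct.pack` calls, a minimal reader is a handy sanity check on a converted file. This is a sketch assuming exactly the write order of this script (the magic plus six int32 header fields, then the vocab size repeated and the length-prefixed token list); it is not a general ggml parser, and the command-line argument is just a stand-in for a path like `models/<HF_MODEL_NAME>-ggml.bin`:

```python
import struct
import sys

# Read back the header written by convert-hf-to-ggml.py above.
with open(sys.argv[1], "rb") as f:
    magic, vocab_size, n_positions, n_embd, n_head, n_layer, ftype = struct.unpack("7i", f.read(28))
    assert magic == 0x67676D6C, "not a ggml file"

    # The converter writes vocab_size a second time right before the tokens.
    (n_vocab,) = struct.unpack("i", f.read(4))
    tokens = []
    for _ in range(n_vocab):
        (length,) = struct.unpack("i", f.read(4))
        tokens.append(f.read(length))

print(f"n_embd={n_embd} n_head={n_head} n_layer={n_layer} tokens={len(tokens)}")
```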