This repository was archived by the owner on Jun 1, 2022. It is now read-only.

Commit 1a751f8

Tobias authored and committed
init commit
0 parents, commit 1a751f8

19 files changed: +984 -0 lines changed

.gitignore

+5
@@ -0,0 +1,5 @@
.DS_Store
**/models
**/venv
**/.vscode

LICENSE

+21
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2019 OpenAI

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md

+109
@@ -0,0 +1,109 @@
# GPT-2 Test

## Intention

To investigate the generated content produced by the GPT-2 117M trained model - [GPT-2 repo](https://github.com/openai/gpt-2.git).


## Setup

+ Clone this repo
+ Open a terminal and ```cd``` to the cloned repo location

```
brew install python
pip3 install virtualenv
cd src
virtualenv venv
source venv/bin/activate
pip3 install Flask
pip3 install flask-cors
pip3 install tensorflow==1.13.1
pip3 install -r requirements.txt
python3 download_model.py 117M
```

## Running

In terminal:

```
python3 application.py
```

+ Go to [http://0.0.0.0:80](http://0.0.0.0:80)
+ Insert the text you want to use as a seed in the text field and hit submit
+ A ***waiting for response...*** message will appear until the content has been generated, then the generated text will replace the message (to call the endpoint directly instead of using the form, see the sketch below)
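
If you would rather hit the endpoint directly than go through the form, here is a minimal sketch using Python's requests library. The ```/gtp2``` route and the ```seed```/```temperature```/```top_k``` fields are what *application.py* reads from the POST body; the prompt and parameter values below are only examples.

```
import requests

# hypothetical direct call to the running Flask app
resp = requests.post(
    "http://0.0.0.0:80/gtp2",
    json={"seed": "Once upon a time", "temperature": 0.7, "top_k": 40},
)
print(resp.json()["data"])  # the generated text returned by interact_model
```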

## GPT-2 repo README

This repo is based on the [original GPT-2 repo](https://github.com/openai/gpt-2.git); you can read its setup instructions in this repo's *GPT2-README.md*


## Creating the live server

Tried to get this up on Heroku or a Lambda, but both have storage size limits that are too small for the 500MB model plus the repo code and requirements:

+ Heroku - 500MB
+ Lambda - 250MB

So had to settle for a custom EC2 build.


### AWS EC2 instance

+ **Step 1**
  - Amazon Linux 2 AMI
+ **Step 2**
  - Select t2.medium
+ **Step 3**
  - Set Auto-assign Public IP to *Enable*
  - Otherwise default settings
+ **Step 4**
  - Default settings
+ **Step 5**
  - TODO: what tags does Labs use
+ **Step 6**
  - TODO: what groups does Labs use to give SSH/HTTP/HTTPS access
+ **Step 7**
  - Create using a new key or a key you already have access to


### Installing the repo and requirements

```
sudo yum install python3
sudo yum install git
sudo yum install gcc
sudo yum install python3-devel
git clone https://github.com/ftlabs/gpt2-gui.git
cd ~/gpt2-gui/src
sudo pip3 install Flask
sudo pip3 install flask-cors
sudo pip3 install tensorflow==1.13.1
sudo pip3 install -r requirements.txt
sudo python3 download_model.py 117M
```

### Running

***Test run***

```
cd ~/gpt2-gui/src
sudo python3 application.py
```

***Keep running***

```

```


## Appendix

+ [Deploying flask to AWS tutorial](https://www.codementor.io/dushyantbgs/deploying-a-flask-application-to-aws-gnva38cf0)

src/__pycache__/hello.cpython-37.pyc

668 Bytes
Binary file not shown.

src/__pycache__/model.cpython-37.pyc

6.56 KB
Binary file not shown.

src/__pycache__/sample.cpython-37.pyc

2.52 KB
Binary file not shown.

src/application.py

+104
@@ -0,0 +1,104 @@
from flask import Flask, jsonify, request, render_template
from flask_cors import CORS
import fire
import json
import os
import numpy as np
import tensorflow as tf

import model, sample, encoder

app = Flask(__name__)
CORS(app)

@app.route("/")
def include_example():
    return render_template("index.html")

@app.route("/gtp2", methods = ['POST'])
def convert():
    # the POST body is expected to contain 'seed', 'temperature' and 'top_k'
    # (the local name shadows the imported json module inside this function only)
    json = request.get_json()
    if 'seed' in json.keys():
        print(json)
        model_result = interact_model(json['seed'], float(json['temperature']), json['top_k'])
        return jsonify(data=model_result)
    return 'error'

# Rebuilds the TF graph, restores the 117M checkpoint and returns one sample per call
def interact_model(raw_text, temperature, top_k):
    model_name='117M'
    seed=None
    nsamples=1
    batch_size=1
    length=None
    #temperature=0.7
    #top_k=40

    """
    Interactively run the model
    :model_name=117M : String, which model to use
    :seed=None : Integer seed for random number generators, fix seed to reproduce
     results
    :nsamples=1 : Number of samples to return total
    :batch_size=1 : Number of batches (only affects speed/memory). Must divide nsamples.
    :length=None : Number of tokens in generated text, if None (default), is
     determined by model hyperparameters
    :temperature=1 : Float value controlling randomness in boltzmann
     distribution. Lower temperature results in less random completions. As the
     temperature approaches zero, the model will become deterministic and
     repetitive. Higher temperature results in more random completions.
    :top_k=0 : Integer value controlling diversity. 1 means only 1 word is
     considered for each step (token), resulting in deterministic completions,
     while 40 means 40 words are considered at each step. 0 (default) is a
     special setting meaning no restrictions. 40 generally is a good value.
    """
    if batch_size is None:
        batch_size = 1
    assert nsamples % batch_size == 0

    enc = encoder.get_encoder(model_name)
    hparams = model.default_hparams()
    with open(os.path.join('models', model_name, 'hparams.json')) as f:
        hparams.override_from_dict(json.load(f))

    if length is None:
        length = hparams.n_ctx // 2
    elif length > hparams.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx)

    with tf.Session(graph=tf.Graph()) as sess:
        context = tf.placeholder(tf.int32, [batch_size, None])
        np.random.seed(seed)
        tf.set_random_seed(seed)
        output = sample.sample_sequence(
            hparams=hparams, length=length,
            context=context,
            batch_size=batch_size,
            temperature=temperature, top_k=top_k
        )

        saver = tf.train.Saver()
        ckpt = tf.train.latest_checkpoint(os.path.join('models', model_name))
        saver.restore(sess, ckpt)

        while True:
            #raw_text = input("Model prompt >>> ")
            #while not raw_text:
            #    print('Prompt should not be empty!')
            #    raw_text = input("Model prompt >>> ")
            context_tokens = enc.encode(raw_text)
            generated = 0
            for _ in range(nsamples // batch_size):
                out = sess.run(output, feed_dict={
                    context: [context_tokens for _ in range(batch_size)]
                })[:, len(context_tokens):]
                for i in range(batch_size):
                    generated += 1
                    text = enc.decode(out[i])
                    print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                    print(text)
                    # returns after the first generated sample
                    return text
            print("=" * 80)

if __name__ == '__main__':
    #fire.Fire(interact_model)
    app.run(host="0.0.0.0", port=80)
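
For reference, a minimal sketch of calling the generation function without going through Flask, loosely mirroring the commented-out ```fire.Fire(interact_model)``` entry point. It assumes it is run from *src/* with the dependencies installed and the 117M model already downloaded.

```
# hypothetical direct use, bypassing the /gtp2 route
from application import interact_model

text = interact_model("Once upon a time", temperature=0.7, top_k=40)
print(text)
```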

src/download_model.py

+27
@@ -0,0 +1,27 @@
import os
import sys
import requests
from tqdm import tqdm

if len(sys.argv) != 2:
    print('You must enter the model name as a parameter, e.g.: download_model.py 117M')
    sys.exit(1)

model = sys.argv[1]

subdir = os.path.join('models', model)
if not os.path.exists(subdir):
    os.makedirs(subdir)

for filename in ['checkpoint','encoder.json','hparams.json','model.ckpt.data-00000-of-00001', 'model.ckpt.index', 'model.ckpt.meta', 'vocab.bpe']:

    r = requests.get("https://storage.googleapis.com/gpt-2/" + subdir + "/" + filename, stream=True)

    with open(os.path.join(subdir, filename), 'wb') as f:
        file_size = int(r.headers["content-length"])
        chunk_size = 1000
        with tqdm(ncols=100, desc="Fetching " + filename, total=file_size, unit_scale=True) as pbar:
            # 1k for chunk_size, since Ethernet packet size is around 1500 bytes
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
                pbar.update(chunk_size)
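
As an optional sanity check after the download finishes, a small hypothetical snippet (not part of the repo) that confirms the seven files the script fetches are present and non-empty under *models/117M*:

```
# hypothetical post-download check, not part of the repo
import os

subdir = os.path.join("models", "117M")
for name in ["checkpoint", "encoder.json", "hparams.json",
             "model.ckpt.data-00000-of-00001", "model.ckpt.index",
             "model.ckpt.meta", "vocab.bpe"]:
    path = os.path.join(subdir, name)
    size = os.path.getsize(path) if os.path.exists(path) else 0
    print(path, size, "bytes")
    assert size > 0, path + " is missing or empty"
```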

src/encoder.py

+117
@@ -0,0 +1,117 @@
"""Byte pair encoding utilities"""

import os
import json
import regex as re
from functools import lru_cache

@lru_cache()
def bytes_to_unicode():
    """
    Returns a list of utf-8 bytes and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8+n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))

def get_pairs(word):
    """Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs

class Encoder:
    def __init__(self, encoder, bpe_merges, errors='replace'):
        self.encoder = encoder
        self.decoder = {v:k for k,v in self.encoder.items()}
        self.errors = errors # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v:k for k, v in self.byte_encoder.items()}
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}

        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word)-1 and word[i+1] == second:
                    new_word.append(first+second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def decode(self, tokens):
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
        return text

def get_encoder(model_name):
    with open(os.path.join('models', model_name, 'encoder.json'), 'r') as f:
        encoder = json.load(f)
    with open(os.path.join('models', model_name, 'vocab.bpe'), 'r', encoding="utf-8") as f:
        bpe_data = f.read()
    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]]
    return Encoder(
        encoder=encoder,
        bpe_merges=bpe_merges,
    )
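
To make the byte-pair encoding round trip concrete, a minimal usage sketch. It assumes the 117M model files have already been downloaded and that it is run from *src/*, since ```get_encoder``` reads *models/117M/encoder.json* and *vocab.bpe*.

```
# hypothetical round-trip example, run after: python3 download_model.py 117M
import encoder

enc = encoder.get_encoder("117M")
ids = enc.encode("Hello world")   # list of integer BPE token ids
print(ids)
print(enc.decode(ids))            # decodes back to "Hello world"
```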
