This repository was archived by the owner on Jun 1, 2022. It is now read-only.

Commit 1a751f8

Tobias authored and committed
init commit
0 parents, commit 1a751f8

19 files changed: +984 -0 lines changed

.gitignore

+5
@@ -0,0 +1,5 @@
.DS_Store
**/models
**/venv
**/.vscode

LICENSE

+21
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2019 OpenAI

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md

+109
@@ -0,0 +1,109 @@
# GPT-2 Test

## Intention

To investigate the generated content produced by the GPT-2 117M trained model - [GPT-2 repo](https://github.com/openai/gpt-2.git).


## Setup

+ Clone this repo
+ Open a terminal and ```cd``` to the cloned repo location

```
brew install python
pip3 install virtualenv
cd src
virtualenv venv
source venv/bin/activate
pip3 install Flask
pip3 install flask-cors
pip3 install tensorflow==1.13.1
pip3 install -r requirements.txt
python3 download_model.py 117M
```

## Running

In terminal:

```
python3 application.py
```

+ Go to [http://0.0.0.0:80](http://0.0.0.0:80)
+ Insert the text you want to use as a seed in the text field and hit submit
+ A ***waiting for response...*** message will appear until the content has been generated, then the generated text will replace the message (to call the endpoint directly instead of using the form, see the sketch below)
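
If you would rather hit the endpoint directly than go through the form, here is a minimal sketch using Python's requests library. The ```/gtp2``` route and the ```seed```/```temperature```/```top_k``` fields are what *application.py* reads from the POST body; the prompt and parameter values below are only examples.

```
import requests

# hypothetical direct call to the running Flask app
resp = requests.post(
    "http://0.0.0.0:80/gtp2",
    json={"seed": "Once upon a time", "temperature": 0.7, "top_k": 40},
)
print(resp.json()["data"])  # the generated text returned by interact_model
```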

## GPT-2 repo README

This repo is based on the [original GPT-2 repo](https://github.com/openai/gpt-2.git); you can read its setup instructions in this repo's *GPT2-README.md*


## Creating the live server

Tried to get this up on Heroku or a Lambda, but both have storage size limits that are too small for the 500MB model plus the repo code and requirements:

+ Heroku - 500MB
+ Lambda - 250MB

So had to settle for a custom EC2 build.


### AWS EC2 instance

+ **Step 1**
  - Amazon Linux 2 AMI
+ **Step 2**
  - Select t2.medium
+ **Step 3**
  - Set Auto-assign Public IP to *Enable*
  - Otherwise default settings
+ **Step 4**
  - Default settings
+ **Step 5**
  - TODO: what tags does Labs use
+ **Step 6**
  - TODO: what groups does Labs use to give SSH/HTTP/HTTPS access
+ **Step 7**
  - Create using a new key or a key you already have access to


### Installing the repo and requirements

```
sudo yum install python3
sudo yum install git
sudo yum install gcc
sudo yum install python3-devel
git clone https://github.com/ftlabs/gpt2-gui.git
cd ~/gpt2-gui/src
sudo pip3 install Flask
sudo pip3 install flask-cors
sudo pip3 install tensorflow==1.13.1
sudo pip3 install -r requirements.txt
sudo python3 download_model.py 117M
```

### Running

***Test run***

```
cd ~/gpt2-gui/src
sudo python3 application.py
```

***Keep running***

```

```


## Appendix

+ [Deploying flask to AWS tutorial](https://www.codementor.io/dushyantbgs/deploying-a-flask-application-to-aws-gnva38cf0)

src/__pycache__/hello.cpython-37.pyc

668 Bytes
Binary file not shown.

src/__pycache__/model.cpython-37.pyc

6.56 KB
Binary file not shown.

src/__pycache__/sample.cpython-37.pyc

2.52 KB
Binary file not shown.

src/application.py

+104
@@ -0,0 +1,104 @@
from flask import Flask, jsonify, request, render_template
from flask_cors import CORS
import fire
import json
import os
import numpy as np
import tensorflow as tf

import model, sample, encoder

app = Flask(__name__)
CORS(app)

@app.route("/")
def include_example():
    return render_template("index.html")

@app.route("/gtp2", methods = ['POST'])
def convert():
    # the POST body is expected to contain 'seed', 'temperature' and 'top_k'
    # (the local name shadows the imported json module inside this function only)
    json = request.get_json()
    if 'seed' in json.keys():
        print(json)
        model_result = interact_model(json['seed'], float(json['temperature']), json['top_k'])
        return jsonify(data=model_result)
    return 'error'

# Rebuilds the TF graph, restores the 117M checkpoint and returns one sample per call
def interact_model(raw_text, temperature, top_k):
    model_name='117M'
    seed=None
    nsamples=1
    batch_size=1
    length=None
    #temperature=0.7
    #top_k=40

    """
    Interactively run the model
    :model_name=117M : String, which model to use
    :seed=None : Integer seed for random number generators, fix seed to reproduce
     results
    :nsamples=1 : Number of samples to return total
    :batch_size=1 : Number of batches (only affects speed/memory). Must divide nsamples.
    :length=None : Number of tokens in generated text, if None (default), is
     determined by model hyperparameters
    :temperature=1 : Float value controlling randomness in boltzmann
     distribution. Lower temperature results in less random completions. As the
     temperature approaches zero, the model will become deterministic and
     repetitive. Higher temperature results in more random completions.
    :top_k=0 : Integer value controlling diversity. 1 means only 1 word is
     considered for each step (token), resulting in deterministic completions,
     while 40 means 40 words are considered at each step. 0 (default) is a
     special setting meaning no restrictions. 40 generally is a good value.
    """
    if batch_size is None:
        batch_size = 1
    assert nsamples % batch_size == 0

    enc = encoder.get_encoder(model_name)
    hparams = model.default_hparams()
    with open(os.path.join('models', model_name, 'hparams.json')) as f:
        hparams.override_from_dict(json.load(f))

    if length is None:
        length = hparams.n_ctx // 2
    elif length > hparams.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx)

    with tf.Session(graph=tf.Graph()) as sess:
        context = tf.placeholder(tf.int32, [batch_size, None])
        np.random.seed(seed)
        tf.set_random_seed(seed)
        output = sample.sample_sequence(
            hparams=hparams, length=length,
            context=context,
            batch_size=batch_size,
            temperature=temperature, top_k=top_k
        )

        saver = tf.train.Saver()
        ckpt = tf.train.latest_checkpoint(os.path.join('models', model_name))
        saver.restore(sess, ckpt)

        while True:
            #raw_text = input("Model prompt >>> ")
            #while not raw_text:
            #    print('Prompt should not be empty!')
            #    raw_text = input("Model prompt >>> ")
            context_tokens = enc.encode(raw_text)
            generated = 0
            for _ in range(nsamples // batch_size):
                out = sess.run(output, feed_dict={
                    context: [context_tokens for _ in range(batch_size)]
                })[:, len(context_tokens):]
                for i in range(batch_size):
                    generated += 1
                    text = enc.decode(out[i])
                    print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                    print(text)
                    # returns after the first generated sample
                    return text
            print("=" * 80)

if __name__ == '__main__':
    #fire.Fire(interact_model)
    app.run(host="0.0.0.0", port=80)
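
For reference, a minimal sketch of calling the generation function without going through Flask, loosely mirroring the commented-out ```fire.Fire(interact_model)``` entry point. It assumes it is run from *src/* with the dependencies installed and the 117M model already downloaded.

```
# hypothetical direct use, bypassing the /gtp2 route
from application import interact_model

text = interact_model("Once upon a time", temperature=0.7, top_k=40)
print(text)
```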

src/download_model.py

+27
@@ -0,0 +1,27 @@
import os
import sys
import requests
from tqdm import tqdm

if len(sys.argv) != 2:
    print('You must enter the model name as a parameter, e.g.: download_model.py 117M')
    sys.exit(1)

model = sys.argv[1]

subdir = os.path.join('models', model)
if not os.path.exists(subdir):
    os.makedirs(subdir)

for filename in ['checkpoint','encoder.json','hparams.json','model.ckpt.data-00000-of-00001', 'model.ckpt.index', 'model.ckpt.meta', 'vocab.bpe']:

    r = requests.get("https://storage.googleapis.com/gpt-2/" + subdir + "/" + filename, stream=True)

    with open(os.path.join(subdir, filename), 'wb') as f:
        file_size = int(r.headers["content-length"])
        chunk_size = 1000
        with tqdm(ncols=100, desc="Fetching " + filename, total=file_size, unit_scale=True) as pbar:
            # 1k for chunk_size, since Ethernet packet size is around 1500 bytes
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
                pbar.update(chunk_size)
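
As an optional sanity check after the download finishes, a small hypothetical snippet (not part of the repo) that confirms the seven files the script fetches are present and non-empty under *models/117M*:

```
# hypothetical post-download check, not part of the repo
import os

subdir = os.path.join("models", "117M")
for name in ["checkpoint", "encoder.json", "hparams.json",
             "model.ckpt.data-00000-of-00001", "model.ckpt.index",
             "model.ckpt.meta", "vocab.bpe"]:
    path = os.path.join(subdir, name)
    size = os.path.getsize(path) if os.path.exists(path) else 0
    print(path, size, "bytes")
    assert size > 0, path + " is missing or empty"
```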

src/encoder.py

+117
@@ -0,0 +1,117 @@
"""Byte pair encoding utilities"""

import os
import json
import regex as re
from functools import lru_cache

@lru_cache()
def bytes_to_unicode():
    """
    Returns a list of utf-8 bytes and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8+n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))

def get_pairs(word):
    """Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs

class Encoder:
    def __init__(self, encoder, bpe_merges, errors='replace'):
        self.encoder = encoder
        self.decoder = {v:k for k,v in self.encoder.items()}
        self.errors = errors # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v:k for k, v in self.byte_encoder.items()}
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}

        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word)-1 and word[i+1] == second:
                    new_word.append(first+second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def decode(self, tokens):
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
        return text

def get_encoder(model_name):
    with open(os.path.join('models', model_name, 'encoder.json'), 'r') as f:
        encoder = json.load(f)
    with open(os.path.join('models', model_name, 'vocab.bpe'), 'r', encoding="utf-8") as f:
        bpe_data = f.read()
    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]]
    return Encoder(
        encoder=encoder,
        bpe_merges=bpe_merges,
    )
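
To make the byte-pair encoding round trip concrete, a minimal usage sketch. It assumes the 117M model files have already been downloaded and that it is run from *src/*, since ```get_encoder``` reads *models/117M/encoder.json* and *vocab.bpe*.

```
# hypothetical round-trip example, run after: python3 download_model.py 117M
import encoder

enc = encoder.get_encoder("117M")
ids = enc.encode("Hello world")   # list of integer BPE token ids
print(ids)
print(enc.decode(ids))            # decodes back to "Hello world"
```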
