diff --git a/examples/job/train-vae.ipynb b/examples/job/train-vae.ipynb
index b2f30828..65981251 100644
--- a/examples/job/train-vae.ipynb
+++ b/examples/job/train-vae.ipynb
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:65e5d0b36cbad00fc5f4950935f1e6ec687e8f2abf5598ceb7f4c9d13f7bf449
-size 30363
+oid sha256:889592f13e37cd9382dde102731a6ca9c36c414ae02a99611f25d45b3390e30c
+size 43800
diff --git a/examples/job/train-vae.py b/examples/job/train-vae.py
index f6407dc8..5f6919de 100644
--- a/examples/job/train-vae.py
+++ b/examples/job/train-vae.py
@@ -4,7 +4,7 @@
 # In[]:
 
 
-get_ipython().system('pip install tensorflow rdkit_pypi pymonad tqdm deepchem')
+get_ipython().system('pip install tensorflow rdkit_pypi pymonad tqdm deepchem pillow')
 
 
 # In[]:
@@ -23,6 +23,7 @@
 import rdkit.Chem
 import tqdm
 
+rdkit.RDLogger.DisableLog('rdApp.*')
 RANDOM_SEED = 42
 tqdm.tqdm.pandas()
 
@@ -33,6 +34,7 @@
 
 
 data = pd.read_csv("assets/curated-solubility-dataset.csv")
+data.to_pickle("vae_data.pkl")
 
 # Drop duplicates based on the InChI representation
 nondupes = data.drop_duplicates(subset="InChI")
@@ -43,7 +45,7 @@
 
 # # Data Augmentation
 
-# In[ ]:
+# In[]:
 
 
 augmented_data = list(smiles.copy())
@@ -74,7 +76,7 @@ def augment_smiles(smiles_string: str):
 
 # # Tokenization with DeepChem
 
-# In[ ]:
+# In[]:
 
 
 # DeepChem's SmileTokenizer uses the WordPiece transformer by HuggingFace (https://huggingface.co/transformers/tokenizer_summary.html), with the regular expression SMILES tokenization strategy developed by Schwaller, P. et al in https://doi.org/10.1039/c8sc02339e
@@ -93,7 +95,7 @@ def augment_smiles(smiles_string: str):
 tokenized_smiles
 
 
-# In[ ]:
+# In[]:
 
 
 # Next up, we'll remove SMILES with any unknown characters, since we don't want our generator putting those tokens in the output
@@ -108,13 +110,13 @@ def augment_smiles(smiles_string: str):
 
 # # Create the VAE
 
-# In[ ]:
+# In[]:
 
 
 from tensorflow.keras.layers import Input, Dense, Conv1D, Layer, Flatten, Reshape, Conv1DTranspose
 
 
-# In[ ]:
+# In[]:
 
 
 class Sampling(Layer):
@@ -129,7 +131,7 @@ def call(self, inputs):
         return z_mean + tf.exp(0.5 * z_log_var) * epsilon
 
 
-# In[ ]:
+# In[]:
 
 
 latent_dim = 1
@@ -147,7 +149,7 @@ def call(self, inputs):
 encoder.summary()
 
 
-# In[ ]:
+# In[]:
 
 
 latent_inputs = Input(shape=(latent_dim,))
@@ -161,7 +163,7 @@ def call(self, inputs):
 decoder.summary()
 
 
-# In[ ]:
+# In[]:
 
 
 class VAE(tf.keras.Model):
@@ -209,21 +211,22 @@ def train_step(self, data):
 
 # # Train the VAE
 
-# In[ ]:
+# In[]:
 
 
 # Reshape the data as needed, scale between 0 and 1
 train_data = np.array([i for i in tokenized_smiles])
 train_data = np.expand_dims(train_data, -1).astype("float32") / maxlen
-train_data.reshape(1,312,-1).shape
+train_data = train_data.reshape(-1,maxlen,1)
+train_data.shape
 
 
-# In[ ]:
+# In[]:
 
 
 vae = VAE(encoder, decoder)
 vae.compile(optimizer=tf.keras.optimizers.Adam())
-vae.fit(train_data, epochs=1000, batch_size=64)
+vae.fit(train_data, epochs=100, batch_size=64)
 
 
 # In[ ]:
@@ -254,8 +257,33 @@ def invert_tokenization(tokens):
 # In[ ]:
 
 
-predictions = (vae.decoder.predict([np.random.uniform(low=-10, high=10, size=100)])[:,:,0] * maxlen).astype(int)
-list(map(lambda tokenized: invert_tokenization(tokenized), predictions))
+predictions = (vae.decoder.predict([np.linspace(-100, 1000, 50)])[:,:,0] * maxlen).astype(int)
+pred_smiles = list(map(lambda tokenized: invert_tokenization(tokenized), predictions))
+
+
+# In[ ]:
+
+
+from rdkit.Chem.Draw import IPythonConsole
+from rdkit.Chem.Draw.MolDrawing import MolDrawing
+pred_mols = []
+for i in pred_smiles:
+    mol = rdkit.Chem.MolFromSmiles(i)
+    if mol:
+        pred_mols.append(mol)
+
+
+# In[ ]:
+
+
+img = rdkit.Chem.Draw.MolsToGridImage(pred_mols, subImgSize=[3000,3000])
+img
+
+
+# In[ ]:
+
+
+
 
 
 # In[ ]:
diff --git a/examples/vae_data.pkl b/examples/vae_data.pkl
new file mode 100644
index 00000000..d6dd6e5f
Binary files /dev/null and b/examples/vae_data.pkl differ