
Commit 57760db

first commit
0 parents  commit 57760db

5 files changed: +381 -0 lines changed

.gitmodules

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
[submodule "treetime_validation"]
	path = treetime_validation
	url = https://github.com/neherlab/treetime_validation
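After cloning, the pinned submodule can be fetched with standard git commands (nothing beyond stock git is assumed here):

    git submodule update --init treetime_validation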

README.md

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
# autodiff-experiments

SConstruct

Lines changed: 228 additions & 0 deletions
@@ -0,0 +1,228 @@
import os
import re
from os.path import join
from subprocess import check_call
from timeit import default_timer as timer

from dendropy import DnaCharacterMatrix

env = Environment(ENV=os.environ)

# Subtree sizes to benchmark and number of optimizer iterations (scons replicates=N).
N_leaves_array = [20, 50, 100, 200, 500, 750, 1000, 1250, 1500, 2000]
iterations = ARGUMENTS.get('replicates', '5000')

treetime_base = 'treetime_validation'
flu_H3N2_dataset = join(treetime_base, 'flu_H3N2', 'subtree_samples', 'dataset')
dates_dir = join(flu_H3N2_dataset, 'LSD_out')
subtrees_dir = join(flu_H3N2_dataset, 'subtrees')

alignment_file = join(treetime_base, 'resources', 'flu_H3N2', 'H3N2_HA_2011_2013.fasta')
tree_file = join(treetime_base, 'resources', 'flu_H3N2', 'H3N2_HA_2011_2013.nwk')

subdata_dir = join('flu_H3N2', 'results')
json_file = join('flu_H3N2', 'H3N2_HA_2011_2013.json')

dna = DnaCharacterMatrix.get(path=alignment_file, schema='fasta')

if not os.path.lexists(subdata_dir):
    os.makedirs(subdata_dir, exist_ok=True)


def prepare_physher(target, source, env):
    # Fill the physher JSON template (FILE_TEMPLATE, TREE_TEMPLATE, DATES_TEMPLATE, ...) for one subtree.
    subfasta_file, subtree_file, dates_file = map(str, source)
    output_tree = subfasta_file.replace('.fasta', '.physher.tree')
    dates = read_dates(dates_file)
    with open(json_file, 'r') as fp:
        content = fp.read()
    dates_json = ',\n'.join(['"{}_{}":{}'.format(taxon, date, date) for taxon, date in dates.items()])
    dates_json = dates_json.rstrip(',\n')
    content = content.replace('FILE_TEMPLATE', subfasta_file).replace('DATES_TEMPLATE', dates_json).replace(
        'TREE_TEMPLATE', subtree_file).replace('DIM_TEMPLATE', str(len(dates) - 1)).replace(
        'OUTPUT_TEMPLATE', output_tree).replace('TEMPLATE_ITER', iterations)
    with open(str(target[0]), 'w') as fp:
        fp.write(content)


def run_physher(target, source, env):
    # Run physher on the generated JSON config and append the wall-clock time to the log.
    start = timer()
    f = open(str(target[0]), 'w')
    check_call(['physher', str(source[0])], stdout=f)
    end = timer()
    total_time = end - start
    f.write('TIME: {}'.format(total_time))
    f.close()


def run_phylostan(target, source, env):
    seq_file, tree_file, script = map(str, source)
    start = timer()
    f = open(str(target[0]), 'w')
    check_call(['phylostan', 'run', '-i', seq_file, '-t', tree_file, '-s', script, '-o', seq_file + '.stan',
                '--iter', env['iter'], '--eta', '0.00000001', '-m', 'JC69', '--heterochronous', '--estimate_rate',
                '--clock', 'strict', '-c', 'constant', '--elbo_samples', '1', '--samples', '1'], stdout=f)
    end = timer()
    total_time = end - start
    f.write('TIME: {}'.format(total_time))
    f.close()


def run_phylotorch(target, source, env):
    start = timer()
    f = open(str(target[0]), 'w')
    cmd = ['phylotorch', '-i', str(source[0]), '-t', str(source[1]), '--iter', env['iter'], '--eta', '0.00000001',
           '-m', 'JC69', '--heterochronous', '-c', 'constant', '--elbo_samples', '1']
    if 'libsbn' in env:
        cmd.append('--libsbn')
    check_call(cmd, stdout=f)
    end = timer()
    total_time = end - start
    f.write('TIME: {}'.format(total_time))
    f.close()


def run_phylojax(target, source, env):
    start = timer()
    f = open(str(target[0]), 'w')
    check_call(['phylojax', '-i', str(source[0]), '-t', str(source[1]), '--iter', env['iter'], '--eta', '0.00000001',
                '--elbo_samples', '1'])
    end = timer()
    total_time = end - start
    f.write('TIME: {}'.format(total_time))
    f.close()


def run_lsd(target, source, env):
    start = timer()
    f = open(str(target[0]), 'w')
    cmd = ['lsd', '-i', str(source[0]), '-d', str(source[1]), '-o',
           str(source[1]).replace('_dates.txt', '.out'), '-s', '1701', '-c']
    check_call(cmd)
    end = timer()
    total_time = end - start
    f.write('TIME: {}'.format(total_time))
    f.close()


def read_dates(dates_file):
    # Parse an LSD dates file of tab-separated "taxon<TAB>date" lines.
    dates_dic = {}
    with open(dates_file, 'r') as fp:
        for line in fp:
            l = line.rstrip().split('\t')
            if len(l) == 2:
                dates_dic[l[0]] = l[1]
    return dates_dic


def parse_results(target, source, env):
    # Collect the TIME line from every run log into a single CSV.
    pattern_time = re.compile(r'TIME: (\d+\.\d+)')
    csvp = open(str(target[0]), 'w')
    csvp.write('method,replicate,taxa,time\n')

    for infile in source:
        total_time = -1
        stem, method = str(infile).split('.')
        temp = stem.split('_')
        with open(str(infile), 'r') as fp:
            for line in fp:
                if line.startswith('#'):
                    continue
                line = line.rstrip('\n').rstrip('\r')
                mtime = pattern_time.match(line)
                if mtime:
                    total_time = mtime.group(1)
                    break

        csvp.write('{},{},{},{}\n'.format(method, temp[-1], temp[-2], total_time))
    csvp.close()


for n_taxa in N_leaves_array:
    dates_file = join(dates_dir, 'H3N2_HA_2011_2013_{}_0.lsd_dates.txt'.format(n_taxa))
    subtree_file = join(subtrees_dir, 'H3N2_HA_2011_2013_{}_0.nwk'.format(n_taxa))

    dates_dic = read_dates(dates_file)

    # clean up comments and add dates to end of taxon names
    with open(subtree_file, 'r') as fp:
        content = fp.read().replace('None', '')
    content = re.sub(r'NODE_\d+', '', content)
    for taxon, date in dates_dic.items():
        content = content.replace(taxon, taxon + '_' + date)

    subtree_dates_file = join(subdata_dir, 'H3N2_HA_2011_2013_{}_0.nwk'.format(n_taxa))
    subfasta_file = join(subdata_dir, 'H3N2_HA_2011_2013_{}_0.fasta'.format(n_taxa))

    with open(subtree_dates_file, 'w') as fp:
        fp.write(content)

    # add dates to end of sequence names
    sub_aln_dic = {}
    for taxon, date in dates_dic.items():
        t = dna.taxon_namespace.get_taxon(label=taxon)
        new_taxon_name = taxon + '_' + date
        sub_aln_dic[new_taxon_name] = str(dna[t])
    sub_dna = DnaCharacterMatrix.from_dict(sub_aln_dic)
    sub_dna.write(path=subfasta_file, schema='fasta')

    new_dates_file = os.path.join(subdata_dir, 'H3N2_HA_2011_2013_{}_0.lsd_dates.txt'.format(n_taxa))
    with open(new_dates_file, 'w') as fp:
        fp.write(str(len(dates_dic)))
        for taxon, date in dates_dic.items():
            fp.write('\n' + taxon + '_' + date + '\t' + date)

stan_script_file = join(subdata_dir, 'H3N2_HA_2011_2013.stan')
stan_pkl = stan_script_file.replace('.stan', '.pkl')

env.Command([stan_script_file, stan_pkl], None,
            'phylostan build -s $TARGET -m JC69 --heterochronous --estimate_rate --clock strict -c constant --compile')

outputs = []

for n_taxa in N_leaves_array:
    stem = join(subdata_dir, 'H3N2_HA_2011_2013_{}_0'.format(n_taxa))
    dates_file = join(dates_dir, 'H3N2_HA_2011_2013_{}_0.lsd_dates.txt'.format(n_taxa))
    subtree_dates_file = stem + '.nwk'  # branch=subst

    lsd_stem = stem + '.lsd.out'
    lsd_tree_dated = lsd_stem + '.date.nexus'  # branch=time
    lsd_dates = stem + '.lsd_dates.txt'

    physher_tree_dated = stem + '.physher.tree'

    seq_file = stem + '.fasta'
    subjson_file = stem + '.json'

    env.Command(target=[stem + '.lsd', lsd_tree_dated],
                source=[subtree_dates_file, lsd_dates],
                action=run_lsd)

    env.Command(target=subjson_file,
                source=[seq_file, lsd_tree_dated, dates_file],
                action=prepare_physher)

    env.Command(target=[stem + '.physher', physher_tree_dated],
                source=subjson_file,
                action=run_physher, iter=iterations)

    env.Command(target=stem + '.phylotorch',
                source=[seq_file, lsd_tree_dated],
                action=run_phylotorch, iter=iterations)

    env.Command(target=stem + '.libsbn',
                source=[seq_file, physher_tree_dated],
                action=run_phylotorch, iter=iterations, libsbn=True)

    env.Command(target=stem + '.phylojax',
                source=[seq_file, physher_tree_dated],
                action=run_phylojax, iter=iterations)

    env.Command(target=stem + '.phylostan',
                source=[seq_file, physher_tree_dated, stan_script_file],
                action=run_phylostan, iter=iterations)

    outputs.extend([stem + '.' + m for m in ('physher', 'phylotorch', 'libsbn', 'phylojax', 'phylostan')])

env.Command(join('flu_H3N2', 'H3N2_HA_2011_2013.csv'), outputs, parse_results)
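The benchmark is driven by SCons: iterations is read from the replicates command-line argument (default 5000), so a typical invocation, assuming the physher, phylostan, phylotorch, phylojax and lsd executables are on the PATH, would look like:

    scons replicates=5000

Each env.Command target is rebuilt only when its sources change, so a single method/taxon combination can be re-run by asking SCons for its output file, e.g. flu_H3N2/results/H3N2_HA_2011_2013_20_0.physher.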

flu_H3N2/H3N2_HA_2011_2013.json

Lines changed: 148 additions & 0 deletions
@@ -0,0 +1,148 @@
{
  "model":{
    "id":"posterior",
    "type": "compound",
    "distributions":[
      {
        "id":"treelikelihood",
        "type": "treelikelihood",
        "reparameterized": true,
        "sse": true,
        "tipstates": false,
        "sitepattern":{
          "id":"patterns",
          "type":"sitepattern",
          "datatype":"nucleotide",
          "alignment":{
            "id": "seqs",
            "type": "alignment",
            "file":"FILE_TEMPLATE"
          }
        },
        "sitemodel":{
          "id": "sitemodel",
          "type": "sitemodel",
          "substitutionmodel":{
            "id":"sm",
            "type":"substitutionmodel",
            "model":"jc69",
            "datatype":"nucleotide",
            "frequencies":{
              "id":"freqs",
              "type":"Simplex",
              "values":[0.25, 0.25, 0.25, 0.25]
            }
          }
        },
        "tree":{
          "id":"tree",
          "type":"tree",
          "time": true,
          "parameters":"tree.distances",
          "heights":"tree.heights",
          "reparam": "tree.scalers",
          "dates":{
            DATES_TEMPLATE
          },
          "file":"TREE_TEMPLATE"
        },
        "branchmodel":{
          "id": "bm",
          "type": "branchmodel",
          "model": "strict",
          "tree": "&tree",
          "rate": {"id":"rate", "type":"parameter", "value":0.001, "lower":0}
        }
      },
      {
        "id":"prior",
        "type": "compound",
        "distributions":[
          {
            "id":"coalescent",
            "type": "coalescent",
            "model": "constant",
            "parameters":{
              "n0":{"id":"n0", "type":"parameter", "value":10, "lower":0, "upper":"infinity"}
            },
            "tree":"&tree"
          },
          {
            "id":"priortheta",
            "type": "distribution",
            "distribution": "oneonx",
            "x": "&n0"
          }
        ]
      }
    ]
  },
  "varmodel":{
    "id":"varnormal",
    "type": "variational",
    "posterior": "&posterior",
    "elbosamples": 1,
    "gradsamples": 1,
    "distributions":[
      {
        "id": "block1",
        "type": "block",
        "distribution": "normal",
        "x": "%tree.scalers",
        "parameters":{
          "mu": {"id":"mu", "type":"parameter", "values": [0.5], "dimension": DIM_TEMPLATE},
          "sigma": {"id":"sigma", "type":"parameter", "values":[0.15], "lower": 0, "dimension": DIM_TEMPLATE}
        }
      },
      {
        "id": "block2",
        "type": "block",
        "distribution": "normal",
        "x": "&n0",
        "parameters":{
          "mu": {"id":"mu.theta", "type":"parameter", "value": 1.45},
          "sigma": {"id":"sigma.theta", "type":"parameter", "value":0.13, "lower": 0}
        }
      },
      {
        "id": "block3",
        "type": "block",
        "distribution": "normal",
        "x": "&rate",
        "parameters":{
          "mu": {"id":"mu.rate", "type":"parameter", "value": -5.6},
          "sigma": {"id":"sigma.rate", "type":"parameter", "value":0.07, "lower": 0}
        }
      }
    ]
  },
  "physher":[
    {
      "id": "loo",
      "type": "logger",
      "models": "&tree",
      "tree": true,
      "internal": false,
      "file": "OUTPUT_TEMPLATE"
    },
    {
      "id":"sg",
      "algorithm":"sg",
      "type":"optimizer",
      "tol": 0.0000001,
      "eta": 0.001,
      "_model": "&var",
      "model": "&varnormal",
      "parameters": ["%mu", "%sigma", "&mu.rate", "&sigma.rate", "&mu.theta", "&sigma.theta"],
      "max": TEMPLATE_ITER
    },
    {
      "ignore": true,
      "id": "loo",
      "type": "logger",
      "parameters": ["%mu", "%sigma", "&mu.rate", "&sigma.rate", "&mu.theta", "&sigma.theta"]
    }
  ]
}
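This file is a template rather than valid JSON: prepare_physher in the SConstruct above replaces FILE_TEMPLATE, TREE_TEMPLATE and OUTPUT_TEMPLATE with per-subtree paths, DIM_TEMPLATE with the number of taxa minus one, TEMPLATE_ITER with the iteration count, and DATES_TEMPLATE with one entry per renamed taxon of the form "NAME_DATE":DATE. With made-up strain names for illustration, the substituted dates block would look like:

    "dates":{
        "A/HongKong/01/2011_2011.5":2011.5,
        "A/Perth/02/2012_2012.25":2012.25
    }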

treetime_validation

Submodule treetime_validation added at c976019
