|
| 1 | +import os |
| 2 | +import re |
| 3 | +from os.path import join |
| 4 | +from subprocess import check_call |
| 5 | +from timeit import default_timer as timer |
| 6 | + |
| 7 | +from dendropy import DnaCharacterMatrix |
| 8 | + |
| 9 | + |
| 10 | +env = Environment(ENV=os.environ) |
| 11 | + |
| 12 | +N_leaves_array = [20, 50, 100, 200, 500, 750, 1000, 1250, 1500, 2000] |
| 13 | +iterations = ARGUMENTS.get('replicates', '5000') |
| 14 | + |
| 15 | +treetime_base = 'treetime_validation' |
| 16 | +flu_H3N2_dataset = join(treetime_base, 'flu_H3N2', 'subtree_samples', 'dataset') |
| 17 | +dates_dir = join(flu_H3N2_dataset, 'LSD_out') |
| 18 | +subtrees_dir = join(flu_H3N2_dataset, 'subtrees') |
| 19 | + |
| 20 | + |
| 21 | +alignment_file = join(treetime_base, 'resources', 'flu_H3N2', 'H3N2_HA_2011_2013.fasta') |
| 22 | +tree_file = join(treetime_base, 'resources', 'flu_H3N2', 'H3N2_HA_2011_2013.nwk') |
| 23 | + |
| 24 | +subdata_dir = join('flu_H3N2', 'results') |
| 25 | +json_file = join('flu_H3N2', 'H3N2_HA_2011_2013.json') |
| 26 | + |
| 27 | +dna = DnaCharacterMatrix.get(path=alignment_file, schema='fasta') |
| 28 | + |
| 29 | +if not os.path.lexists(subdata_dir): |
| 30 | + os.makedirs(subdata_dir, exist_ok=True) |
| 31 | + |
| 32 | + |
| 33 | +def prepare_physher(target, source, env): |
| 34 | + subfasta_file, subtree_file, dates_file = map(str, source) |
| 35 | + output_tree = subfasta_file.replace('.fasta', '.physher.tree') |
| 36 | + dates = read_dates(dates_file) |
| 37 | + with open(json_file, 'r') as fp: |
| 38 | + content = fp.read() |
| 39 | + dates_json = ',\n'.join(['"{}_{}":{}'.format(taxon, date, date) for taxon, date in dates.items()]) |
| 40 | + dates_json = dates_json.rstrip(',\n') |
| 41 | + content = content.replace('FILE_TEMPLATE', subfasta_file).replace('DATES_TEMPLATE', dates_json).replace( |
| 42 | + 'TREE_TEMPLATE', subtree_file).replace('DIM_TEMPLATE', str(len(dates) - 1)).replace('OUTPUT_TEMPLATE', |
| 43 | + output_tree).replace( |
| 44 | + 'TEMPLATE_ITER', iterations) |
| 45 | + with open(str(target[0]), 'w') as fp: |
| 46 | + fp.write(content) |
| 47 | + |
| 48 | + |
| 49 | +def run_physher(target, source, env): |
| 50 | + start = timer() |
| 51 | + f = open(str(target[0]), 'w') |
| 52 | + check_call(['physher', str(source[0])], stdout=f) |
| 53 | + end = timer() |
| 54 | + total_time = end - start |
| 55 | + f.write('TIME: {}'.format(total_time)) |
| 56 | + f.close() |
| 57 | + |
| 58 | + |
| 59 | +def run_phylostan(target, source, env): |
| 60 | + seq_file, tree_file, script = map(str, source) |
| 61 | + start = timer() |
| 62 | + f = open(str(target[0]), 'w') |
| 63 | + check_call(['phylostan', 'run', '-i', seq_file, '-t', tree_file, '-s', script, '-o', seq_file + '.stan', '--iter', |
| 64 | + env['iter'], '--eta', '0.00000001', '-m', 'JC69', '--heterochronous', '--estimate_rate', '--clock', |
| 65 | + 'strict', '-c', 'constant', '--elbo_samples', '1', '--samples', '1'], stdout=f) |
| 66 | + end = timer() |
| 67 | + total_time = end - start |
| 68 | + f.write('TIME: {}'.format(total_time)) |
| 69 | + f.close() |
| 70 | + |
| 71 | + |
| 72 | +def run_phylotorch(target, source, env): |
| 73 | + start = timer() |
| 74 | + f = open(str(target[0]), 'w') |
| 75 | + cmd = ['phylotorch', '-i', str(source[0]), '-t', str(source[1]), '--iter', env['iter'], '--eta', '0.00000001', '-m', |
| 76 | + 'JC69', '--heterochronous', '-c', 'constant', '--elbo_samples', '1'] |
| 77 | + if 'libsbn' in env: |
| 78 | + cmd.append('--libsbn') |
| 79 | + check_call(cmd, stdout=f) |
| 80 | + end = timer() |
| 81 | + total_time = end - start |
| 82 | + f.write('TIME: {}'.format(total_time)) |
| 83 | + f.close() |
| 84 | + |
| 85 | + |
| 86 | +def run_phylojax(target, source, env): |
| 87 | + start = timer() |
| 88 | + f = open(str(target[0]), 'w') |
| 89 | + check_call(['phylojax', '-i', str(source[0]), '-t', str(source[1]), '--iter', env['iter'], '--eta', '0.00000001', |
| 90 | + '--elbo_samples', '1']) |
| 91 | + end = timer() |
| 92 | + total_time = end - start |
| 93 | + f.write('TIME: {}'.format(total_time)) |
| 94 | + f.close() |
| 95 | + |
| 96 | + |
| 97 | +def run_lsd(target, source, env): |
| 98 | + start = timer() |
| 99 | + f = open(str(target[0]), 'w') |
| 100 | + cmd = ['lsd', '-i', str(source[0]), '-d', str(source[1]), '-o', |
| 101 | + str(source[1]).replace('_dates.txt', '.out'), '-s', '1701', '-c'] |
| 102 | + check_call(cmd) |
| 103 | + end = timer() |
| 104 | + total_time = end - start |
| 105 | + f.write('TIME: {}'.format(total_time)) |
| 106 | + f.close() |
| 107 | + |
| 108 | + |
| 109 | +def read_dates(dates_file): |
| 110 | + dates_dic = {} |
| 111 | + with open(dates_file, 'r') as fp: |
| 112 | + for line in fp: |
| 113 | + l = line.rstrip().split('\t') |
| 114 | + if len(l) == 2: |
| 115 | + dates_dic[l[0]] = l[1] |
| 116 | + return dates_dic |
| 117 | + |
| 118 | + |
| 119 | +def parse_results(target, source, env): |
| 120 | + pattern_time = re.compile(r'TIME: (\d+\.\d+)') |
| 121 | + csvp = open(str(target[0]), 'w') |
| 122 | + csvp.write('method,replicate,taxa,time\n') |
| 123 | + |
| 124 | + for infile in source: |
| 125 | + total_time = -1 |
| 126 | + stem, method = str(infile).split('.') |
| 127 | + temp = stem.split('_') |
| 128 | + with open(str(infile), "r") as fp: |
| 129 | + for line in fp: |
| 130 | + if line.startswith("#"): |
| 131 | + continue |
| 132 | + line = line.rstrip('\n').rstrip('\r') |
| 133 | + mtime = pattern_time.match(line) |
| 134 | + if mtime: |
| 135 | + total_time = mtime.group(1) |
| 136 | + break |
| 137 | + |
| 138 | + csvp.write('{},{},{},{}\n'.format(method, temp[-1], temp[-2], total_time)) |
| 139 | + csvp.close() |
| 140 | + |
| 141 | + |
| 142 | +for n_taxa in N_leaves_array: |
| 143 | + dates_file = join(dates_dir, 'H3N2_HA_2011_2013_{}_0.lsd_dates.txt'.format(n_taxa)) |
| 144 | + subtree_file = join(subtrees_dir, 'H3N2_HA_2011_2013_{}_0.nwk'.format(n_taxa)) |
| 145 | + |
| 146 | + dates_dic = read_dates(dates_file) |
| 147 | + |
| 148 | + # clean up comments and add dates to end of taxon names |
| 149 | + with open(subtree_file, 'r') as fp: |
| 150 | + content = fp.read().replace('None', '') |
| 151 | + content = re.sub('NODE_\d+', '', content) |
| 152 | + for taxon, date in dates_dic.items(): |
| 153 | + content = content.replace(taxon, taxon + '_' + date) |
| 154 | + |
| 155 | + subtree_dates_file = join(subdata_dir, 'H3N2_HA_2011_2013_{}_0.nwk'.format(n_taxa)) |
| 156 | + subfasta_file = join(subdata_dir, 'H3N2_HA_2011_2013_{}_0.fasta'.format(n_taxa)) |
| 157 | + |
| 158 | + with open(subtree_dates_file, 'w') as fp: |
| 159 | + fp.write(content) |
| 160 | + |
| 161 | + # add dates to end of sequence names |
| 162 | + sub_aln_dic = {} |
| 163 | + for taxon, date in dates_dic.items(): |
| 164 | + t = dna.taxon_namespace.get_taxon(label=taxon) |
| 165 | + new_taxon_name = taxon + '_' + date |
| 166 | + sub_aln_dic[new_taxon_name] = str(dna[t]) |
| 167 | + sub_dna = DnaCharacterMatrix.from_dict(sub_aln_dic) |
| 168 | + sub_dna.write(path=subfasta_file, schema="fasta") |
| 169 | + |
| 170 | + new_dates_file = os.path.join(subdata_dir, 'H3N2_HA_2011_2013_{}_0.lsd_dates.txt'.format(n_taxa)) |
| 171 | + with open(new_dates_file, 'w') as fp: |
| 172 | + fp.write(str(len(dates_dic))) |
| 173 | + for taxon, date in dates_dic.items(): |
| 174 | + fp.write('\n' + taxon + '_' + date + '\t' + date) |
| 175 | + |
| 176 | +stan_script_file = join(subdata_dir, 'H3N2_HA_2011_2013.stan'.format(n_taxa)) |
| 177 | +stan_pkl = stan_script_file.replace('.stan', '.pkl') |
| 178 | + |
| 179 | +env.Command([stan_script_file, stan_pkl], None, |
| 180 | + 'phylostan build -s $TARGET -m JC69 --heterochronous --estimate_rate --clock strict -c constant --compile') |
| 181 | + |
| 182 | +outputs = [] |
| 183 | + |
| 184 | +for n_taxa in N_leaves_array: |
| 185 | + stem = join(subdata_dir, 'H3N2_HA_2011_2013_{}_0'.format(n_taxa)) |
| 186 | + dates_file = join(dates_dir, 'H3N2_HA_2011_2013_{}_0.lsd_dates.txt'.format(n_taxa)) |
| 187 | + subtree_dates_file = stem + '.nwk' # branch=subst |
| 188 | + |
| 189 | + lsd_stem = stem + '.lsd.out' |
| 190 | + lsd_tree_dated = lsd_stem + '.date.nexus' # branch=time |
| 191 | + lsd_dates = stem + '.lsd_dates.txt' |
| 192 | + |
| 193 | + physher_tree_dated = stem + '.physher.tree' |
| 194 | + |
| 195 | + seq_file = stem + '.fasta' |
| 196 | + subjson_file = stem + '.json' |
| 197 | + |
| 198 | + env.Command(target=[stem + '.lsd', lsd_tree_dated], |
| 199 | + source=[subtree_dates_file, lsd_dates], |
| 200 | + action=run_lsd) |
| 201 | + |
| 202 | + env.Command(target=subjson_file, |
| 203 | + source=[seq_file, lsd_tree_dated, dates_file], |
| 204 | + action=prepare_physher) |
| 205 | + |
| 206 | + env.Command(target=[stem + '.physher', physher_tree_dated], |
| 207 | + source=subjson_file, |
| 208 | + action=run_physher, iter=iterations) |
| 209 | + |
| 210 | + env.Command(target=stem + '.phylotorch', |
| 211 | + source=[seq_file, lsd_tree_dated], |
| 212 | + action=run_phylotorch, iter=iterations) |
| 213 | + |
| 214 | + env.Command(target=stem + '.libsbn', |
| 215 | + source=[seq_file, physher_tree_dated], |
| 216 | + action=run_phylotorch, iter=iterations, libsbn=True) |
| 217 | + |
| 218 | + env.Command(target=stem + '.phylojax', |
| 219 | + source = [seq_file, physher_tree_dated], |
| 220 | + action = run_phylojax, iter=iterations) |
| 221 | + |
| 222 | + env.Command(target=stem + '.phylostan', |
| 223 | + source=[seq_file, physher_tree_dated, stan_script_file], |
| 224 | + action=run_phylostan, iter=iterations) |
| 225 | + |
| 226 | + outputs.extend([stem + '.' + m for m in ('physher', 'phylotorch', 'libsbn', 'phylojax', 'phylostan')]) |
| 227 | + |
| 228 | +env.Command(join('flu_H3N2', 'H3N2_HA_2011_2013.csv'), outputs, parse_results) |
0 commit comments