Commit 025a7b7

Add CONET data sims

1 parent 20d0b75

6 files changed: +3157 −0 lines changed

Diff for: reproducibility/simulations/benchmark.smk (+232 lines)
@@ -64,6 +64,8 @@ CONET_OUTPUT = os.path.join(config["output"], "conet")
 CONET_GINKGO_OUTPUT = os.path.join(config["output"], "conet_ginkgo")
 CONET_TREE_OUTPUT = os.path.join(config["output"], "conet_tree_distances")
 CONET_GINKGO_TREE_OUTPUT = os.path.join(config["output"], "conet_ginkgo_tree_distances")
+CONET_CBS_OUTPUT = os.path.join(config["output"], "conet_cbs")
+CONET_CBS_TREE_OUTPUT = os.path.join(config["output"], "conet_cbs_tree_distances")
 
 try:
     other_cnv_methods = config["other_methods"]
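For orientation, here is a minimal sketch of how the new CONET-CBS output paths expand, assuming config["output"] = "results", n_nodes = 10 and rep_id = 3 (all three values are hypothetical; only the path pattern comes from the rule added below):

    import os

    output_root = "results"                        # stand-in for config["output"]
    CONET_CBS_OUTPUT = os.path.join(output_root, "conet_cbs")
    n_nodes, rep_id = 10, 3                        # hypothetical run parameters

    # Mirrors the output pattern of rule conet_cbs_inference
    path = CONET_CBS_OUTPUT + '/' + str(n_nodes) + 'nodes' + '/' + str(rep_id) + '_conet_cbs_inferred.txt'
    print(path)  # results/conet_cbs/10nodes/3_conet_cbs_inferred.txt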
@@ -557,6 +559,79 @@ rule conet_ginkgo_inference:
     shell:
         "python {params.script} --bin_path {params.bin_path} --counts {input.d_mat} --cnvs {input.init_cnvs} --intermediate_dir {params.intermediate_dir} --seed {wildcards.rep_id} --out_cnvs {output.conet_inferred_cnvs} --out_tree {output.conet_inferred_tree} --out_attachments {output.conet_inferred_attachments}"
 
+rule conet_cbs_inference:
+    params:
+        script = config["conet"]["script"],
+        n_nodes = n_nodes,
+        scratch = config["conet"]["scratch"],
+        mem = config["conet"]["mem"],
+        time = config["conet"]["time"],
+        script_inp = str(n_nodes) + "nodes_{rep_id}",
+        intermediate_dir = CONET_CBS_OUTPUT + '/' + str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '/',
+        bin_path = config["conet"]["bin_path"],
+        em_iters = config["conet"]["em_iters"],
+        pt_iters = config["conet"]["pt_iters"],
+    input:
+        d_mat = SIM_OUTPUT + '_' + sim_prefix + '/' + str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_d_mat.txt',
+    output:
+        conet_inferred_cnvs = CONET_CBS_OUTPUT + '/' + str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_conet_cbs_inferred.txt',
+        conet_inferred_tree = CONET_CBS_OUTPUT + '/' + str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_conet_cbs_tree.txt',
+        conet_inferred_attachments = CONET_CBS_OUTPUT + '/' + str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_conet_cbs_attachments.txt'
+    threads: 10
+    # shell:
+    #     "python {params.script} --bin_path {params.bin_path} --counts {input.d_mat} --cnvs {input.init_cnvs} --intermediate_dir {params.intermediate_dir} --seed {wildcards.rep_id} --em_iters {params.em_iters} --pt_iters {params.pt_iters} --out_cnvs {output.conet_inferred_cnvs} --out_tree {output.conet_inferred_tree} --out_attachments {output.conet_inferred_attachments}"
+    run:
+        # Collect the true breakpoint indices (segment starts and ends) from the simulated tree
+        real_ind = list(map(lambda x: x.start, list(tree.nodes)))
+        real_ind.extend(list(map(lambda x: x.end, list(tree.nodes))))
+        real_ind = list(set(real_ind))
+        real_ind.sort()
+        if wildcards.known_breakpoints:
+            # Use the real breakpoint indices as candidates
+            candidates = real_ind
+            save_counts_in_CONET_format(dirpath + "counts_synthetic", corr_reads, candidates)
+        else:
+            # Run CBS to detect breakpoint candidates from the counts
+            save_counts_in_CONET_format(dirpath + "counts_synthetic", corr_reads, [])
+            subprocess.run(["Rscript", "CBS_MergeLevels.R", "--mincells=5",
+                            f"--output={dirpath}cbs_out", f"--dataset={dirpath}counts_synthetic"])
+            X = np.loadtxt(dirpath + "cbs_out", delimiter=",")
+            candidates = []
+            for i in range(X.shape[0]):
+                if X[i, 4] == 1.0:
+                    candidates.append(i)
+            save_counts_in_CONET_format(dirpath + "counts_synthetic", corr_reads, candidates)
+            print(f"Found {len(candidates)} breakpoint candidates...")
+
+        # Convert model data to CONET format
+        data_converter = dc.DataConverter(dirpath + "counts_synthetic",
+                                          delimiter=';',
+                                          default_bin_length=1,
+                                          event_length_normalizer=corr_reads.shape[1],  # number of loci
+                                          add_chromosome_ends=False,
+                                          neutral_cn=2.0)
+        data_converter.create_CoNET_input_files(dirpath, add_chr_ends_to_indices=False)
+
+        # Perform CONET inference
+        conet = c.CONET(str(Path(conf.conet_binary_dir) / Path("CONET")))
+        params = cp.CONETParameters(tree_structure_prior_k1=0.01,
+                                    data_dir=dirpath, counts_penalty_s1=100000, counts_penalty_s2=100000,
+                                    param_inf_iters=500000, seed=random.randint(0, 1000), mixture_size=2, pt_inf_iters=1000000,
+                                    use_event_lengths_in_attachment=False,
+                                    event_length_penalty_k0=1)
+        conet.infer_tree(params)
+        print("CONET inference finished")
+        result = ir.InferenceResult(conf.conet_binary_dir, corr_reads.T)
+
+        inferred_cn = result.get_inferred_copy_numbers(2, conf.bins, conf.cells)
+        inferred_brkp = result.bp_matrix.astype(int)
+        inferred_nodes = set(result.tree.nodes)
+        inferred_edges = set(result.tree.edges)
+
+        real_cn = data[0]
+        real_brkp = data[4][:, real_ind].astype(int)
+        real_nodes = set((n.start, n.end) for n in tree.nodes)
+        real_edges = set(((e[0].start, e[0].end), (e[1].start, e[1].end)) for e in tree.edges)
+
 
 rule run_medalt:
     params:
         script = config["medalt"]["script"],
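The run block of the new conet_cbs_inference rule selects breakpoint candidates either from the simulated tree (known breakpoints) or from the CBS output, where column 4 of the CBS_MergeLevels.R result matrix flags candidate bins with 1.0. A minimal standalone sketch of the CBS branch, assuming the same comma-separated output layout (the function name and paths are illustrative, not the repository's):

    import subprocess
    import numpy as np

    def cbs_breakpoint_candidates(counts_path, out_path):
        # Run CBS segmentation; assumes CBS_MergeLevels.R writes a comma-separated
        # matrix whose column 4 equals 1.0 at candidate breakpoints, as in the rule above.
        subprocess.run(["Rscript", "CBS_MergeLevels.R", "--mincells=5",
                        f"--output={out_path}", f"--dataset={counts_path}"], check=True)
        X = np.loadtxt(out_path, delimiter=",")
        return [i for i in range(X.shape[0]) if X[i, 4] == 1.0]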
@@ -610,3 +685,160 @@ rule compute_medalt_ginkgo_tree_distances:
         medalt_tree_distance = f'{MEDALT_TREE_OUTPUT}/{str(n_nodes)}nodes_' + '{regions}regions_{reads}reads/{rep_id}_medalt_ginkgo_tree_distance.txt'
     shell:
         "python {params.script} {input.medalt_tree} {input.cnvs} {output.medalt_tree_distance}"
+
+rule compute_deltas:
+    input:
+        simulations = expand(f'{SIM_OUTPUT}_{sim_prefix}/{str(n_nodes)}nodes_' + '{regions}regions_{reads}reads/{rep_id}_d_mat.txt',
+                             regions=[x for x in n_regions], reads=[x for x in n_reads], rep_id=[x for x in range(0, all_n_tps)]),
+        best_full_tree = expand(f'{TREES_OUTPUT}_best_full_tree/{str(n_nodes)}nodes_' + '{regions}regions_{reads}reads/{rep_id}_full_tree.txt',
+                                regions=[x for x in n_regions], reads=[x for x in n_reads], rep_id=[x for x in range(0, all_n_tps)], tree_rep_id=[x for x in range(0, tree_rep)]),
+        best_cluster_tree = expand(f'{TREES_OUTPUT}_best_cluster_tree/{str(n_nodes)}nodes_' + '{regions}regions_{reads}reads/{rep_id}_cluster_tree.txt',
+                                   regions=[x for x in n_regions], reads=[x for x in n_reads], rep_id=[x for x in range(0, all_n_tps)]),
+        best_full_tree_sum = expand(f'{TREES_OUTPUT}_best_full_tree_sum/{str(n_nodes)}nodes_' + '{regions}regions_{reads}reads/{rep_id}_full_tree.txt',
+                                    regions=[x for x in n_regions], reads=[x for x in n_reads], rep_id=[x for x in range(0, all_n_tps)], tree_rep_id=[x for x in range(0, tree_rep)]),
+        best_cluster_tree_sum = expand(f'{TREES_OUTPUT}_best_cluster_tree_sum/{str(n_nodes)}nodes_' + '{regions}regions_{reads}reads/{rep_id}_cluster_tree.txt',
+                                       regions=[x for x in n_regions], reads=[x for x in n_reads], rep_id=[x for x in range(0, all_n_tps)]),
+        other_cnvs = expand(os.path.join(config["output"], '{method}') + '/' + str(n_nodes) + 'nodes_' + '{regions}' + 'regions_' + '{reads}' + 'reads' + '/' + '{rep_id}' + '_{method}_inferred.txt',
+                            regions=[x for x in n_regions], reads=[x for x in n_reads], rep_id=[x for x in range(0, all_n_tps)], method=other_cnv_methods),
+    output:
+        out_fname = os.path.join(output_path, 'deltas_sims.csv')
+    run:
+        rows = []
+        rows.append('index,rep_id,method,delta')
+        i = 0
+        for true_cnvs in input.simulations:
+            # Ground truth
+            rep_id = true_cnvs.split('/')[-1].split('_')[0]
+            gt = np.loadtxt(true_cnvs, delimiter=',')
+            if gt.shape[0] == n_bins:  # should be cells by bins
+                gt = np.transpose(gt)
+
+            # Diploid baseline
+            i += 1
+            method = 'diploid'
+            inf = np.ones(gt.shape) * 2
+            error = np.sqrt(np.mean((gt - inf) ** 2))
+            rows.append(f'{i},{rep_id},{method},{error}')
+
+            # i += 1
+            # method = 'ginkgo'
+            # inf_cnvs = f'{GINKGO_OUTPUT}/{n_nodes}nodes/{rep_id}_ginkgo_inferred.txt'
+            # inf = np.loadtxt(inf_cnvs, delimiter=' ')
+            # error = np.sqrt(np.mean((gt-inf)**2))
+            # rows.append(f'{i},{rep_id},{method},{error}')
+            #
+            # i += 1
+            # method = 'conet_ginkgo'
+            # inf_cnvs = f'{CONET_GINKGO_OUTPUT}/{n_nodes}nodes/{rep_id}_conet_ginkgo_inferred.txt'
+            # inf = np.loadtxt(inf_cnvs, delimiter=',')
+            # error = np.sqrt(np.mean((gt-inf)**2))
+            # rows.append(f'{i},{rep_id},{method},{error}')
+
+            # i += 1
+            # method = 'conet_knownbp'
+            # inf_cnvs = f'{CONET_KNOWNBP_OUTPUT}/{n_nodes}nodes/{rep_id}_conet_knownbp_inferred.txt'
+            # inf = np.loadtxt(inf_cnvs, delimiter=';').T
+            # error = np.sqrt(np.mean((gt-inf)**2))
+            # rows.append(f'{i},{rep_id},{method},{error}')
+
+            for other_cnvs_file in input.other_cnvs:
+                i += 1
+                # Recover {method} from the '{rep_id}_{method}_inferred.txt' file name
+                method = other_cnvs_file.split('/')[-1].replace('_inferred.txt', '').split('_', 1)[1]
+                inf = np.loadtxt(other_cnvs_file, delimiter=',')
+                if inf.shape != gt.shape:
+                    inf = inf.T
+                error = np.sqrt(np.mean((gt - inf) ** 2))
+                rows.append(f'{i},{rep_id},{method},{error}')
+
+            for fairness in ['unfair', 'fair']:
+                i += 1
+                method = f'cluster_tree_{fairness}'
+                inf_cnvs = f'{TREES_OUTPUT}_best_cluster_tree/{fairness}/{n_nodes}nodes/{rep_id}_cluster_tree_cnvs.csv'
+                inf = np.loadtxt(inf_cnvs, delimiter=',')
+                error = np.sqrt(np.mean((gt - inf) ** 2))
+                rows.append(f'{i},{rep_id},{method},{error}')
+
+                i += 1
+                method = f'cluster_tree_sum_{fairness}'
+                inf_cnvs = f'{TREES_OUTPUT}_best_cluster_tree_sum/{fairness}/{n_nodes}nodes/{rep_id}_cluster_tree_cnvs.csv'
+                inf = np.loadtxt(inf_cnvs, delimiter=',')
+                error = np.sqrt(np.mean((gt - inf) ** 2))
+                rows.append(f'{i},{rep_id},{method},{error}')
+
+                i += 1
+                method = f'full_tree_{fairness}'
+                inf_cnvs = f'{TREES_OUTPUT}_best_full_tree/{fairness}/{n_nodes}nodes/{rep_id}_full_tree_cnvs.csv'
+                inf = np.loadtxt(inf_cnvs, delimiter=',')
+                error = np.sqrt(np.mean((gt - inf) ** 2))
+                rows.append(f'{i},{rep_id},{method},{error}')
+
+                i += 1
+                method = f'full_tree_sum_{fairness}'
+                inf_cnvs = f'{TREES_OUTPUT}_best_full_tree_sum/{fairness}/{n_nodes}nodes/{rep_id}_full_tree_cnvs.csv'
+                inf = np.loadtxt(inf_cnvs, delimiter=',')
+                error = np.sqrt(np.mean((gt - inf) ** 2))
+                rows.append(f'{i},{rep_id},{method},{error}')
+
+                # i += 1
+                # method = f'cluster_tree_{fairness}_knownbps'
+                # inf_cnvs = f'{TREES_OUTPUT}_best_cluster_tree/{fairness}/{n_nodes}nodes/{rep_id}_cluster_tree_cnvs_knownbps.csv'
+                # inf = np.loadtxt(inf_cnvs, delimiter=',')
+                # error = np.sqrt(np.mean((gt-inf)**2))
+                # rows.append(f'{i},{rep_id},{method},{error}')
+                #
+                # i += 1
+                # method = f'cluster_tree_sum_{fairness}_knownbps'
+                # inf_cnvs = f'{TREES_OUTPUT}_best_cluster_tree_sum/{fairness}/{n_nodes}nodes/{rep_id}_cluster_tree_cnvs_knownbps.csv'
+                # inf = np.loadtxt(inf_cnvs, delimiter=',')
+                # error = np.sqrt(np.mean((gt-inf)**2))
+                # rows.append(f'{i},{rep_id},{method},{error}')
+                #
+                # i += 1
+                # method = f'full_tree_{fairness}_knownbps'
+                # inf_cnvs = f'{TREES_OUTPUT}_best_full_tree/{fairness}/{n_nodes}nodes/{rep_id}_full_tree_cnvs_knownbps.csv'
+                # inf = np.loadtxt(inf_cnvs, delimiter=',')
+                # error = np.sqrt(np.mean((gt-inf)**2))
+                # rows.append(f'{i},{rep_id},{method},{error}')
+                #
+                # i += 1
+                # method = f'full_tree_sum_{fairness}_knownbps'
+                # inf_cnvs = f'{TREES_OUTPUT}_best_full_tree_sum/{fairness}/{n_nodes}nodes/{rep_id}_full_tree_cnvs_knownbps.csv'
+                # inf = np.loadtxt(inf_cnvs, delimiter=',')
+                # error = np.sqrt(np.mean((gt-inf)**2))
+                # rows.append(f'{i},{rep_id},{method},{error}')
+
+                i += 1
+                method = f'cluster_tree_{fairness}_adapted'
+                inf_cnvs = f'{TREES_OUTPUT}_best_cluster_tree/{fairness}/{n_nodes}nodes/{rep_id}_cluster_tree_cnvs_adapted.csv'
+                inf = np.loadtxt(inf_cnvs, delimiter=',')
+                error = np.sqrt(np.mean((gt - inf) ** 2))
+                rows.append(f'{i},{rep_id},{method},{error}')
+
+                i += 1
+                method = f'cluster_tree_sum_{fairness}_adapted'
+                inf_cnvs = f'{TREES_OUTPUT}_best_cluster_tree_sum/{fairness}/{n_nodes}nodes/{rep_id}_cluster_tree_cnvs_adapted.csv'
+                inf = np.loadtxt(inf_cnvs, delimiter=',')
+                error = np.sqrt(np.mean((gt - inf) ** 2))
+                rows.append(f'{i},{rep_id},{method},{error}')
+
+                i += 1
+                method = f'full_tree_{fairness}_adapted'
+                inf_cnvs = f'{TREES_OUTPUT}_best_full_tree/{fairness}/{n_nodes}nodes/{rep_id}_full_tree_cnvs_adapted.csv'
+                inf = np.loadtxt(inf_cnvs, delimiter=',')
+                error = np.sqrt(np.mean((gt - inf) ** 2))
+                rows.append(f'{i},{rep_id},{method},{error}')
+
+                i += 1
+                method = f'full_tree_sum_{fairness}_adapted'
+                inf_cnvs = f'{TREES_OUTPUT}_best_full_tree_sum/{fairness}/{n_nodes}nodes/{rep_id}_full_tree_cnvs_adapted.csv'
+                inf = np.loadtxt(inf_cnvs, delimiter=',')
+                error = np.sqrt(np.mean((gt - inf) ** 2))
+                rows.append(f'{i},{rep_id},{method},{error}')
+
+        with open(output.out_fname, 'w') as f:
+            f.write('\n'.join(rows))
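Each delta written by compute_deltas is the root-mean-square error (RMSE) between the ground-truth and inferred copy-number matrices, with a transpose guard for matrices stored bins-by-cells. A minimal sketch of the metric (the function name is ours, not the repository's):

    import numpy as np

    def cnv_delta(gt, inf):
        # RMSE between ground-truth and inferred copy-number matrices (cells x bins)
        if inf.shape != gt.shape:  # tolerate bins-by-cells orientation
            inf = inf.T
        return float(np.sqrt(np.mean((gt - inf) ** 2)))

    # The 'diploid' baseline, for example, scores a constant copy number of 2:
    # cnv_delta(gt, np.ones(gt.shape) * 2)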
