Skip to content

Commit e189463

Browse files
committed
Add label to processes denoting expected speed
1 parent bbe3c52 commit e189463

File tree

7 files changed

+74
-45
lines changed

7 files changed

+74
-45
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@
33
work
44
.DS_Store
55
autodiff-experiments.iml
6+
.Rproj.user

README.md

+8
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,14 @@ the gradient using the [BEAGLE] library. BITO is only available in treeflow and
2020
You will need to install [nextflow](https://www.nextflow.io) and [docker](https://www.docker.com) to run this benchmark.
2121
Docker is not required but it is highly recommended to use it due to the numerous dependencies.
2222

23+
## Installation
24+
25+
git clone 4ment/autodiff-experiments.git
26+
27+
### Initialize treetime_validation
28+
29+
git submodule update --init --recursive
30+
2331
## Running the pipeline with docker
2432

2533
nextflow run 4ment/autodiff-experiments -profile docker

bin/physher-parser.py

-39
This file was deleted.

main.nf

+26-5
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,21 @@
22

33
nextflow.enable.dsl = 2
44

5+
params.reuse = false
56
params.results = "results"
67
params.enable_beast = false
7-
params.subtrees_alignment = "$baseDir/treetime_validation/resources/flu_H3N2/H3N2_HA_2011_2013.fasta"
88

99
include { treetime_validation } from "./modules/treetime_validation.nf" addParams(base: "$baseDir/treetime_validation")
1010
include { micro } from "./modules/micro.nf"
1111
include { macro_flu } from "./modules/macro_flu.nf"
1212

13+
dataset = "${baseDir}/treetime_validation/flu_H3N2/subtree_samples/dataset"
14+
subtrees_alignment = "$baseDir/treetime_validation/resources/flu_H3N2/H3N2_HA_2011_2013.fasta"
15+
16+
1317
process RUN_LSD {
18+
label 'ultrafast'
19+
1420
input:
1521
tuple val(size),
1622
val(rep),
@@ -32,6 +38,8 @@ process RUN_LSD {
3238
}
3339

3440
process CONVERT_LSD_NEXUS_TO_NEWICK {
41+
label 'ultrafast'
42+
3543
input:
3644
tuple val(size), val(rep), path(lsd_nexus)
3745
output:
@@ -55,6 +63,7 @@ def group_per_size_rep(newick_ch, create_sub_ch) {
5563
}
5664

5765
process CREATE_SUB_FILES {
66+
label 'ultrafast'
5867

5968
input:
6069
tuple val(size), val(rep), path(lsd_dates), path(newick_file)
@@ -66,7 +75,7 @@ process CREATE_SUB_FILES {
6675
path("H3N2_HA_2011_2013_${size}_${rep}.lsd_dates.new.txt")
6776
"""
6877
helper.py 0 \
69-
$params.subtrees_alignment \
78+
$subtrees_alignment \
7079
$lsd_dates \
7180
$newick_file \
7281
H3N2_HA_2011_2013_${size}_${rep}.new.nwk \
@@ -77,9 +86,21 @@ process CREATE_SUB_FILES {
7786

7887

7988
workflow {
80-
treetime_validation()
81-
82-
CREATE_SUB_FILES(treetime_validation.out)
89+
if (params.reuse) {
90+
subsets_ch = Channel.of(20, 50, 100, 200, 500, 750, 1000, 1250, 1500, 2000)
91+
replicates_ch = Channel.of(0..5)
92+
ch = subsets_ch.combine(replicates_ch)
93+
tt_ch = ch.map {
94+
tuple(it[0], it[1],
95+
file("${dataset}/LSD_out/H3N2_HA_2011_2013_${it[0]}_${it[1]}.lsd_dates.txt"),
96+
file("${dataset}/subtrees/H3N2_HA_2011_2013_${it[0]}_${it[1]}.nwk"))
97+
}
98+
} else {
99+
treetime_validation()
100+
tt_ch = treetime_validation.out
101+
}
102+
103+
CREATE_SUB_FILES(tt_ch)
83104

84105
RUN_LSD(CREATE_SUB_FILES.out)
85106

modules/macro_flu.nf

+31
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,37 @@ process RUN_TREEFLOW {
160160
-n ${params.iterations} > out.txt ; } 2> treeflow.${size}.${rep}.log
161161
"""
162162
}
163+
164+
process COMBIME_TIME_LOG {
165+
publishDir "$params.results/macro/", mode: 'copy'
166+
167+
input:
168+
path files
169+
output:
170+
path("macro.csv")
171+
172+
"""
173+
#!/usr/bin/env python
174+
import re
175+
176+
pattern_time = re.compile(r'Time: (\\d+\\.\\d+)')
177+
with open('macro.csv', 'w') as fpo:
178+
for file_path in ${files}:
179+
with open(file_path, 'r') as fp:
180+
for line in fp:
181+
line = line.rstrip('\\n').rstrip('\\r')
182+
mt = pattern_time.match(line)
183+
if mt:
184+
total_time = mt.group(1)
185+
a = file_path.rstrip('.log').split('.')
186+
if a[0] == 'torchtree':
187+
if a[1] == 'true':
188+
a[0] = 'bitorch'
189+
del a[1]
190+
fpo.write(a.join(',') + '\n')
191+
"""
192+
}
193+
163194
workflow macro_flu {
164195
take:
165196
data

modules/micro.nf

+7-1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ params.results = "results"
88
phylox = Channel.of("torchtree", "bitorch", "phylojax")
99

1010
process RUN_PHYSHER_BENCHMARK {
11+
label 'fast'
12+
1113
publishDir "$params.results/micro/physher", mode: 'copy'
1214

1315
input:
@@ -27,6 +29,7 @@ process RUN_PHYSHER_BENCHMARK {
2729
}
2830

2931
process RUN_PHYLOX_BENCHMARK {
32+
label 'normal'
3033
label 'bito'
3134

3235
publishDir "$params.results/micro/${phylox}", mode: 'copy'
@@ -54,6 +57,7 @@ process RUN_PHYLOX_BENCHMARK {
5457
}
5558

5659
process RUN_TREEFLOW_BENCHMARK {
60+
label 'normal'
5761
label 'bito'
5862

5963
publishDir "$params.results/micro/treeflow", mode: 'copy'
@@ -76,6 +80,8 @@ process RUN_TREEFLOW_BENCHMARK {
7680
}
7781

7882
process COMBIME_CSV {
83+
label 'ultrafast'
84+
7985
publishDir "$params.results/micro/", mode: 'copy'
8086

8187
input:
@@ -85,7 +91,7 @@ process COMBIME_CSV {
8591

8692
"""
8793
head -n1 ${files[0]} > micro.csv
88-
tail -q -n+2 *.csv >> micro.csv
94+
tail -q -n+2 *[0-9].csv >> micro.csv
8995
"""
9096
}
9197

nextflow.config

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ manifest {
66
mainScript = 'main.nf'
77
}
88

9+
executor.cpus = 1
910
profiles {
1011
docker {
1112
process.container = '4ment/autodiff-experiments'

0 commit comments

Comments (0)