Skip to content

Commit 295c442

Browse files
committed
update master to v1.6
1 parent 5d48eee commit 295c442

6 files changed

+543
-36
lines changed

README.md

+7-3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11

2-
# Somaticwrapper version 1.5 #
2+
# Somaticwrapper version 1.6 #
33

44
Detect somatic variants from tumor and normal WXS based on HG38 reference. SomaticWrapper pipeline is a fully automated and modular software package designed for detection of somatic variants from tumor and normal exome data. It works on LSF job scheduler and can run multiple jobs in parallel. Multiple standard variant calling tools are included in the pipeline such as varscan2, strelka2, mutect1 and pindel.
55

@@ -15,6 +15,8 @@ Improvements compared to version 1.4:
1515

1616
3) Add exonic option to let users select whether to only output the exonic mutations or all mutations
1717

18+
4) Added low vaf rescue for genes listed in SMGs
19+
1820
If you want to run somaticwrapper for hg19 reference, you can git clone the withmutect branch.
1921

2022
## Install the third-party software ##
@@ -33,7 +35,7 @@ bam-readcount 0.7.4: https://github.com/genome/bam-readcount
3335

3436
Step1: Enter the directory where you downloaded somaticwrapper pipeline
3537

36-
Step2: Type the coommand line: perl somaticwrapper.pl --srg --step --sre --rdir --ref --log --q --mincovt --mincovn --minvaf --maxindsize --exonic
38+
Step2: Type the coommand line: perl somaticwrapper.pl --srg --step --sre --rdir --ref --log --q --mincovt --mincovn --minvaf --maxindsize --exonic --smg
3739

3840
rdir = full path of the folder holding files for this sequence run (user must provide)
3941

@@ -55,10 +57,12 @@ mincovn: minimum coverage for normal: default >=8
5557

5658
minvaf: minimum somatic vaf: default >=0.05
5759

58-
maxindsize: default <=100
60+
maxindsize: default <= 100
5961

6062
exonic: output exonic region: 1 Yes, 0 No, Default Yes
6163

64+
smg: smg gene list that escapes the 0.05 vaf cut-off
65+
6266
hg38: /gscmnt/gc2521/dinglab/mwyczalk/somatic-wrapper-data/image.data/A_Reference/GRCh38.d1.vd1.fa
6367

6468
[0] Run all steps

filter_mutect1.8.pl

+181
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
#!/usr/bin/perl
2+
3+
## normal <=2%
4+
### mutect1.8 filtering ###
5+
6+
## turn off minvaf cut-off, Apr 20, 2020 ##
7+
8+
use strict;
9+
use warnings;
10+
die unless @ARGV == 6;
11+
### samtools ##
12+
13+
my ($samtools,$f_m,$f_filter_out,$mincov_t,$mincov_n,$minvaf)=@ARGV;
14+
15+
#my $f_m=$run_dir."/mutect/mutect.raw.vcf";
16+
#my $f_filter_out=$run_dir."/mutect/mutect.filtered.vcf";
17+
18+
### minimum vaf for tumor 0.05, column 10 ###
19+
## maximum vaf for normal 0.02, column 11 ###
20+
21+
## turn off minvaf cut-off ##
22+
23+
### get the bam path ##
24+
25+
my @temp=split(/\//,$f_m);
26+
27+
my $path_d="/";
28+
29+
for(my $i=1;$i<scalar @temp-2; $i++)
30+
{
31+
$path_d.=$temp[$i]."/";
32+
}
33+
34+
my $f_bam_n=$path_d.$temp[-3].".N.bam";
35+
my $f_bam_t=$path_d.$temp[-3].".T.bam";
36+
37+
## read absolute path ##
38+
#my $f_bam_n_abs=`readlink -f $f_bam_n`;
39+
#my $f_bam_t_abs=`readlink -f $f_bam_t`;
40+
#chomp($f_bam_n_abs);
41+
#chomp($f_bam_t_abs);
42+
#print $f_bam_n_abs,"\n";
43+
#print $f_bam_t_abs,"\n";
44+
#my $last_bam_n_abs=(split(/\//,$f_bam_n_abs))[-1];
45+
#my $last_bam_t_abs=(split(/\//,$f_bam_t_abs))[-1];
46+
my $sn_n;
47+
my $sn_t;
48+
#foreach my $l (`$samtools view -H $f_bam_n`)
49+
# {
50+
# my $ltr=$l;
51+
# chomp($ltr);
52+
53+
# if($ltr=~/^\@RG/) {
54+
# my @temp2=split("\t",$ltr);
55+
# $sn_n=(split(/\:/,$temp2[-1]))[1];
56+
# last;
57+
# }
58+
# }
59+
60+
61+
#foreach my $l (`$samtools view -H $f_bam_t`)
62+
# {
63+
# my $ltr=$l;
64+
# chomp($ltr);
65+
# if($ltr=~/^\@RG/) {
66+
# my @temp2=split("\t",$ltr);
67+
# $sn_t=(split(/\:/,$temp2[-1]))[1];
68+
# last;
69+
# }
70+
# }
71+
72+
foreach my $l (`$samtools view -H $f_bam_n`)
73+
{
74+
my $ltr=$l;
75+
chomp($ltr);
76+
if($ltr=~/^\@RG/) {
77+
my @temp2=split("\t",$ltr);
78+
for(my $i=0;$i<scalar @temp2;$i++)
79+
{
80+
if($temp2[$i]=~/^SM/)
81+
{
82+
$sn_n=(split(/\:/,$temp2[$i]))[1];
83+
last;
84+
}
85+
}
86+
}
87+
}
88+
89+
foreach my $l (`$samtools view -H $f_bam_t`)
90+
{
91+
my $ltr=$l;
92+
chomp($ltr);
93+
if($ltr=~/^\@RG/) {
94+
my @temp2=split("\t",$ltr);
95+
for(my $i=0;$i<scalar @temp2;$i++)
96+
{
97+
if($temp2[$i]=~/^SM/)
98+
{
99+
$sn_t=(split(/\:/,$temp2[$i]))[1];
100+
last;
101+
}
102+
}
103+
}
104+
}
105+
106+
#print $sn_n,"\t",$sn_t,"\n";
107+
108+
#<STDIN>;
109+
my $min_vaf_somatic=$minvaf;
110+
my $max_vaf_germline=0.02;
111+
#my $min_coverage=$mincov;
112+
my $tumor_normal_order=-1;
113+
114+
open(IN,"<$f_m");
115+
open(OUT,">$f_filter_out");
116+
117+
while(<IN>)
118+
{
119+
my $l=$_;
120+
chomp($l);
121+
if($l=~/^#/) { #print OUT $l,"\n";
122+
if($l=~/^#CHROM/) {
123+
my @temphead=split("\t",$l);
124+
print OUT $temphead[0];
125+
for(my $i=1;$i<=8;$i++)
126+
{
127+
print OUT "\t",$temphead[$i];
128+
}
129+
130+
if($temphead[9] eq $sn_t && $temphead[10] eq $sn_n) { print OUT "\t","TUMOR","\t","NORMAL","\n"; $tumor_normal_order=1; }
131+
if($temphead[9] eq $sn_n && $temphead[10] eq $sn_t) { print OUT "\t","NORMAL","\t","TUMOR","\n"; $tumor_normal_order=0; }
132+
}
133+
else { print OUT $l,"\n"; }
134+
}
135+
136+
else {
137+
#print $tumor_normal_order,"\n";
138+
#<STDIN>;
139+
my @temp=split("\t",$l);
140+
if($tumor_normal_order==-1) { last; }
141+
my $tumor=$temp[9];
142+
my $normal=$temp[10];
143+
144+
if($tumor_normal_order==0) {
145+
$tumor=$temp[10];
146+
$normal=$temp[9];
147+
}
148+
149+
my @tempt=split(":",$tumor);
150+
my @tempn=split(":",$normal);
151+
my @readt=split(",",$tempt[1]);
152+
my @readn=split(",",$tempn[1]);
153+
154+
#print $l,"\n";
155+
#print $readn[0], "\t", $readn[1],"\n";
156+
#<STDIN>;
157+
#print $readt[0], "\t", $readt[1],"\n";
158+
159+
#### readn[0]: read count for ref allele in normal ##
160+
### readn[1]: read count for alt allele in normal ##
161+
#### readt[0]: read count for ref allele in tumor ##
162+
### readt[1]: read count for alt allele in tumor ##
163+
164+
my $tot_n=$readn[0]+$readn[1];
165+
my $tot_t=$readt[0]+$readt[1];
166+
167+
if($tot_n==0 || $tot_t==0) { next; }
168+
169+
my $vaf_t=$readt[1]/$tot_t;
170+
my $vaf_n=$readn[1]/$tot_n;
171+
#print $vaf_t,"\t",$vaf_n,"\n";
172+
## apply filtering ##
173+
if($temp[6] eq "PASS" && $vaf_n<=$max_vaf_germline && $tot_n>=$mincov_n && $tot_t>=$mincov_t)
174+
{
175+
print OUT $l,"\n";
176+
}
177+
}
178+
179+
}
180+
181+

generate_final_report.pl

+1-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
chomp($dtr);
2424

2525
# my $f_maf=$run_dir."/".$dtr."/".$dtr.".checked.maf";
26-
my $f_maf=$run_dir."/".$dtr."/".$dtr.".withmutect.maf";
26+
my $f_maf=$run_dir."/".$dtr."/".$dtr.".withmutect.filtered.maf";
2727
if(-e $f_maf)
2828
{
2929
my $count=0;

somaticwrapper.pl

+48-32
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,16 @@
2020
## 08/19/19 ##
2121
## add a checking step for vcf before merging ##
2222

23+
## add smg recovering module, Apr 23, 2020 ##
24+
2325
#!/usr/bin/perl
2426
##!/gscmnt/gc2525/dinglab/rmashl/Software/perl/perl-5.22.0/bin/perl
2527
use strict;
2628
use warnings;
2729
#use POSIX;
2830
use Getopt::Long;
2931

30-
my $version = 1.5;
32+
my $version = 1.6;
3133

3234
#color code
3335
my $red = "\e[31m";
@@ -43,7 +45,7 @@
4345
Somatic variant calling pipeline
4446
Pipeline version: $version
4547
46-
$yellow Usage: perl $0 --srg --step --sre --rdir --ref --log --q --mincovt --mincovn --minvaf --maxindsize --exonic
48+
$yellow Usage: perl $0 --srg --step --sre --rdir --ref --log --q --mincovt --mincovn --minvaf --maxindsize --exonic --smg
4749
4850
$normal
4951
@@ -59,10 +61,11 @@
5961
<minvaf> minimum somatic vaf: default >=0.05
6062
<maxindsize> default <=100
6163
<exonic> output exonic region: 1 Yes, 0 No
64+
<smg> use smg list for calling
65+
hg38:/gscmnt/gc2521/dinglab/mwyczalk/somatic-wrapper-data/image.data/A_Reference/GRCh38.d1.vd1/GRCh38.d1.vd1.fa
6266
63-
hg38: /gscmnt/gc2521/dinglab/mwyczalk/somatic-wrapper-data/image.data/A_Reference/GRCh38.d1.vd1.fa
67+
lscc smg: /gscmnt/gc3027/dinglab/medseq/smg_database/smg.lscc.tsv
6468
65-
$red [0] Run all steps
6669
$green [1] Run streka
6770
$green [2] Run Varscan
6871
$green [3] Run Pindel
@@ -94,6 +97,7 @@
9497
my $run_dir="";
9598
my $log_dir="";
9699
my $h38_REF="";
100+
my $db_smg="";
97101
#my $ref_name="";
98102
my $chr_status=0;
99103
my $maxindsize=100;
@@ -109,6 +113,7 @@
109113
"exonic=i" => \$status_exonic,
110114
"rdir=s" => \$run_dir,
111115
"ref=s" => \$h38_REF,
116+
"smg=s" => \$db_smg,
112117
"log=s" => \$log_dir,
113118
"q=s" => \$q_name,
114119
"mincovt=i" => \$mincov_t,
@@ -243,7 +248,7 @@
243248
#&check_input_dir($run_dir);
244249
# start data processsing
245250

246-
if ($step_number < 12) {
251+
if ($step_number < 12 && $step_number>0) {
247252
#begin to process each sample
248253
for (my $i=0;$i<@sample_dir_list;$i++) {#use the for loop instead. the foreach loop has some problem to pass the global variable $sample_name to the sub functions
249254
$sample_name = $sample_dir_list[$i];
@@ -1594,7 +1599,7 @@ sub bsub_parse_mutect{
15941599
print PM "filtervcf=".$sample_full_path."/mutect1/mutect.raw.filtered.$chr.vcf\n";
15951600
print PM "filtervcfsnv=".$sample_full_path."/mutect1/mutect.filter.snv.$chr.vcf\n";
15961601
print PM "filtervcfindel=".$sample_full_path."/mutect1/mutect.filter.indel.$chr.vcf\n";
1597-
print PM " ".$run_script_path."filter_mutect1.7.pl $samtools/samtools \${rawvcf} \${filtervcf} $mincov_t $mincov_n $minvaf\n";
1602+
print PM " ".$run_script_path."filter_mutect1.8.pl $samtools/samtools \${rawvcf} \${filtervcf} $mincov_t $mincov_n $minvaf\n";
15981603
# print MUTECT "java \${JAVA_OPTS} -jar "."$gatkexe3 -T SelectVariants -R $h38_REF -V \${filtervcf} -o \${filtervcfsnv} -selectType SNP -selectType MNP"."\n";
15991604
# print MUTECT "java \${JAVA_OPTS} -jar "."$gatkexe3 -T SelectVariants -R $h38_REF -V \${filtervcf} -o \${filtervcfindel} -selectType INDEL"."\n";
16001605
print PM "java \${JAVA_OPTS} -jar "."$mutect1 -T SelectVariants -R $h38_REF -V \${filtervcf} -o \${filtervcfsnv} -selectType SNP -selectType MNP"."\n";
@@ -1781,49 +1786,60 @@ sub bsub_vcf_2_maf{
17811786
#print VARSCANP "#BSUB -q long\n";
17821787
#print MAF "#BSUB -q research-hpc\n";
17831788
#print MAF "#BSUB -w \"$hold_job_file\"","\n";
1784-
print MAF "F_VCF_1=".$sample_full_path."/merged.filtered.withmutect.vcf\n";
1785-
# print MAF "F_VCF_1=".$sample_full_path."/merged.withmutect.vcf\n";
1786-
# print MAF "F_VCF_2=".$sample_full_path."/".$sample_name.".withmutect.vcf\n";
1789+
1790+
print MAF "F_VCF_1=".$sample_full_path."/merged.withmutect.vcf\n";
1791+
print MAF "F_VCF_1_filtered=".$sample_full_path."/merged.filtered.withmutect.vcf\n";
17871792
print MAF "F_VCF_2=".$sample_full_path."/".$sample_name.".withmutect.vcf\n";
1793+
print MAF "F_VCF_2_filtered=".$sample_full_path."/".$sample_name.".withmutect.filtered.vcf\n";
17881794
print MAF "F_VEP_1=".$sample_full_path."/merged.VEP.withmutect.vcf\n";
1789-
print MAF "F_VEP_2=".$sample_full_path."/".$sample_name.".withmutect.vep.vcf\n";
1795+
print MAF "F_VEP_1_filtered=".$sample_full_path."/merged.VEP.withmutect.filtered.vcf\n";
1796+
print MAF "F_VEP_2=".$sample_full_path."/".$sample_name.".withmutect.vep.vcf\n";
1797+
print MAF "F_VEP_2_filtered=".$sample_full_path."/".$sample_name.".withmutect.filtered.vep.vcf\n";
17901798
print MAF "F_maf=".$sample_full_path."/".$sample_name.".withmutect.maf\n";
1799+
print MAF "F_maf_filtered=".$sample_full_path."/".$sample_name.".withmutect.filtered.maf\n";
17911800
print MAF "RUNDIR=".$sample_full_path."\n";
1792-
print MAF "F_log=".$sample_full_path."/vep.merged.withmutect.log"."\n";
1793-
1801+
1802+
print MAF "F_log=".$sample_full_path."/vep.merged.withmutect.log"."\n";
17941803
print MAF "cat > \${RUNDIR}/vep.merged.withmutect.input <<EOF\n";
1795-
print MAF "merged.vep.vcf = ./merged.filtered.withmutect.vcf\n";
1796-
# print MAF "merged.vep.vcf = ./merged.withmutect.vcf\n";
1804+
print MAF "merged.vep.vcf = ./merged.withmutect.vcf\n";
17971805
print MAF "merged.vep.output = ./merged.VEP.withmutect.vcf\n";
17981806
print MAF "merged.vep.vep_cmd = $vepannot\n";
17991807
print MAF "merged.vep.cachedir = $vepcache\n";
1800-
#print MERGE "merged.vep.reffasta = /gscmnt/gc2525/dinglab/rmashl/Software/bin/VEP/v85/cache/homo_sapiens/85_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa\n";
18011808
print MAF "merged.vep.reffasta = $f_ref_annot\n";
18021809
print MAF "merged.vep.assembly = GRCh38\n";
18031810
print MAF "EOF\n";
1804-
print MAF "rm \${F_log}\n";
1805-
# print MAF "merged.vep.vcf = ./merged.filtered.vcf\n";
1806-
# print MAF "merged.vep.output = ./merged.VEP.vcf\n";
1807-
# print MAF "merged.vep.vep_cmd = $vepcmd\n";
1808-
# print MAF "merged.vep.cachedir = $vepcache\n";
1809-
#print MERGE "merged.vep.reffasta = /gscmnt/gc2525/dinglab/rmashl/Software/bin/VEP/v85/cache/homo_sapiens/85_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa\n";
1810-
# print MAF "merged.vep.reffasta = $f_ref_annot\n";
1811-
# print MAF "merged.vep.assembly = GRCh37\n";
1812-
# print MAF "EOF\n";
1813-
# print MERGE "else\n";
1814-
print MAF " ".$run_script_path."vaf_filter_v1.3.pl \${RUNDIR} $minvaf $mincov_t $mincov_n $maxindsize\n";
1815-
#print MAF " ".$run_script_path."vaf_filter_michigan_washu.pl \${RUNDIR}\n";
1816-
#print MAF " ".$run_script_path."vaf_all_callers.pl \${RUNDIR}\n";
1811+
print MAF "rm \${F_log}\n";
1812+
1813+
print MAF "F_log_filtered=".$sample_full_path."/vep.merged.withmutect.filtered.log"."\n";
1814+
print MAF "cat > \${RUNDIR}/vep.merged.withmutect.filtered.input <<EOF\n";
1815+
print MAF "merged.vep.vcf = ./merged.filtered.withmutect.vcf\n";
1816+
print MAF "merged.vep.output = ./merged.VEP.withmutect.filtered.vcf\n";
1817+
print MAF "merged.vep.vep_cmd = $vepannot\n";
1818+
print MAF "merged.vep.cachedir = $vepcache\n";
1819+
print MAF "merged.vep.reffasta = $f_ref_annot\n";
1820+
print MAF "merged.vep.assembly = GRCh38\n";
1821+
print MAF "EOF\n";
1822+
print MAF "rm \${F_log_filtered}\n";
1823+
1824+
### vep and vcf2maf annotation for all variants to get the annotated gene name for each variant ##
18171825
print MAF "cd \${RUNDIR}\n";
18181826
print MAF ". $script_dir/set_envvars\n";
1819-
print MAF " ".$run_script_path."vep_annotator.pl ./vep.merged.withmutect.input >&./vep.merged.withmutect.log\n";
1827+
print MAF " ".$run_script_path."vep_annotator.pl ./vep.merged.withmutect.input >&./vep.merged.withmutect.log\n";
18201828
print MAF "rm \${F_VCF_2}\n";
18211829
print MAF "rm \${F_VEP_2}\n";
18221830
print MAF "ln -s \${F_VCF_1} \${F_VCF_2}\n";
18231831
print MAF "ln -s \${F_VEP_1} \${F_VEP_2}\n";
1824-
# print MAF "cd
1825-
# print MAF " ".$run_script_path."vcf2maf.pl --input-vcf \${F_VCF_2} --output-maf \${F_maf} --tumor-id $sample_name\_T --normal-id $sample_name\_N --ref-fasta $f_ref_annot --filter-vcf $f_exac --file-tsl $TSL_DB\n";
1826-
print MAF " ".$run_script_path."vcf2maf.pl --input-vcf \${F_VCF_2} --output-maf \${F_maf} --tumor-id $sample_name\_T --normal-id $sample_name\_N --ref-fasta $f_ref_annot --file-tsl $TSL_DB\n";
1832+
print MAF " ".$run_script_path."vcf2maf.pl --input-vcf \${F_VCF_2} --output-maf \${F_maf} --tumor-id $sample_name\_T --normal-id $sample_name\_N --ref-fasta $f_ref_annot --file-tsl $TSL_DB\n";
1833+
1834+
## do the filtering for variants and ignore tumor vaf > 0.05 for gene in smg ##
1835+
print MAF " ".$run_script_path."vaf_filter_v1.4.pl \${RUNDIR} $sample_name $minvaf $mincov_t $mincov_n $maxindsize $db_smg\n";
1836+
1837+
print MAF " ".$run_script_path."vep_annotator.pl ./vep.merged.withmutect.filtered.input >&./vep.merged.withmutect.filtered.log\n";
1838+
print MAF "rm \${F_VCF_2_filtered}\n";
1839+
print MAF "rm \${F_VEP_2_filtered}\n";
1840+
print MAF "ln -s \${F_VCF_1_filtered} \${F_VCF_2_filtered}\n";
1841+
print MAF "ln -s \${F_VEP_1_filtered} \${F_VEP_2_filtered}\n";
1842+
print MAF " ".$run_script_path."vcf2maf.pl --input-vcf \${F_VCF_2_filtered} --output-maf \${F_maf_filtered} --tumor-id $sample_name\_T --normal-id $sample_name\_N --ref-fasta $f_ref_annot --file-tsl $TSL_DB\n";
18271843
#print MAF " ".$run_script_path."splice_site_check.pl $sample_full_path\n";
18281844
close MAF;
18291845

0 commit comments

Comments
 (0)