PopPoly_wrapped.pl 8.06 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174
#!/usr/bin/env perl
# A simple perl wrapper for PopPoly to be able to specify the contig name and maximum genotype missing rate
# for haplotyping.
# Useful when multiple contigs are present in the bam and VCF files, which causes error for haplotyping.
# This wrapper extracts the desired contig from the bam and VCF files, extracts SNPs and breaks down complex varinats
# in the VCF file, filters out SNPs based on their genotype missing rate and runs PopPoly with the passed input options.
# This script should be stored in the same folder as PopPoly main.py and break_vcf.sh.
# Written by Ehsan Motazedi, Wageningen University & Research, 20-07-2018.
# Last modified: 15-08-2018 

use strict;
use warnings;
use File::Temp qw/ tempfile tempdir /;
use Cwd 'abs_path';
use Scalar::Util qw(looks_like_number);

sub signal_handler{
    die "\nCaught a signal: $!\n";
}

my @PATH = split /\//, abs_path($0);
my $PopPoly_Path = join '/', @PATH[0..$#PATH-1];
my $hlp_msg = "\nA simple perl wrapper for PopPoly, so that the contig name and the maximum genotype missing rate\nof each SNP can be specified for haplotyping. Useful when multiple contigs are present in the BAM\nand VCF files, which are not handled directly by PopPoly. Also, useful to exclude SNPs from phasings\nbased upon their genotype missing rate in the VCF file.\n\nThis wrapper extracts the desired contig from the bam and VCF files, performs SNP filtering and runs PopPoly\nwith the passed input options. To run this script it should be stored in the same folder as PopPoly main.py.\n\nInput parameters:\n\n\tparameters passed to PopPoly (see ./main.py -h)\n\n\t-h, --help\tprint this help message and exit.\n\t-c, --contig\tname of the contig to be phased.\n\t--maxMISS\tmaximum tolerable genotype missing rate for each SNP to be included in phasing.\n\nWritten by Ehsan Motazedi, Wageningen University & Research, 20-07-2018.";

$SIG{INT}  = \&signal_handler;
$SIG{TERM} = \&signal_handler;

my %options_set;
my @to_remove;
my @iftrue_opts = ("t","top","skip","filter","impute","redose");
my @toget_opts = ("c","contig","a", "aggressive", "e", "error", "r", "rho", "k", "kappa", "w", "warmup", "v", "verror", "P1", "P2", "mmq", "mbq", "maxIS", "maxMISS");
my $n=-1;

my ($fh1, $contig_bam) = tempfile(TEMPLATE => "tmp_bamXXXX", DIR=>"/tmp", SUFFIX => ".bam"); # tmp file to store contig reads if contig is specified
close($fh1);
my ($fh2, $contig_vcf) = tempfile(TEMPLATE => "tmp_vcfXXXX", DIR=>"/tmp", SUFFIX => ".vcf"); # tmp file to store contig variants if contig is specified
close($fh2);
my ($fh3, $contig_vcf_broken) = tempfile(TEMPLATE => "tmp_vcf_brokenXXXX", DIR=>"/tmp", SUFFIX => ".vcf"); # tmp file to store contig variants if contig is specified
close($fh3);

while ($n < $#ARGV){ #separate positional & optional arguments
	$n+=1;
	if (grep {/^-/} $ARGV[$n]){
		(my $option= $ARGV[$n]) =~ s/^-+//;
		if ($option eq "h" or $option eq "help"){
			print $hlp_msg."\n";
			exit 0;
		}
		if (grep {$_ eq $option} @iftrue_opts){
			$options_set{$option} = "";
			push @to_remove, $n;
		} elsif (grep {$_ eq $option} @toget_opts){
			if ($n == $#ARGV) {die "ERROR: No value passed to $ARGV[$n]!\n"}
			if (grep {/^-/} $ARGV[$n+1]) {die "ERROR: Invalid value $ARGV[$n+1] passed to $ARGV[$n]!\n"}
			$options_set{$option} = $ARGV[$n+1];
			push @to_remove, ($n,$n+1);
			$n+=1
		} else {die "ERROR: Invalid option $ARGV[$n]!\n"}
	}
}

for (reverse @to_remove){ # remove optional arguments (now saved in %options_set) from the input to get positional arguments
	splice @ARGV, $_, 1
}

if ($#ARGV == -1){ # check correct number of positional arguments
	die "ERROR: No bam file, VCF file and output name is given!\n"
} elsif ($#ARGV < 1){
	die "ERROR: No VCF file and output name is given!\n"
} elsif ($#ARGV < 2){
	die "ERROR: No output name is given!\n"
} elsif ($#ARGV > 2){
	die "ERROR: Too many positional arguments given!\n"
}

for (my $i=0; $i<14; $i+=2){ # check doubly given value getting options
	if (exists $options_set{$toget_opts[$i]} and exists $options_set{$toget_opts[$i+1]}) {die "ERROR: Only one of -$toget_opts[$i] and --$toget_opts[$i+1] allowed!\n"}
}
for (my $i=0; $i<2; $i+=2){ # check doubly given set true options 
	if (exists $options_set{$iftrue_opts[$i]} and exists $options_set{$iftrue_opts[$i+1]}) {die "ERROR: Only one of -$iftrue_opts[$i] and --$iftrue_opts[$i+1] allowed!\n"}
}

if (exists $options_set{maxMISS}){ # check the given SNP missing rate
	if (not looks_like_number($options_set{maxMISS})){
		die "ERROR: the SNP missing rate must be a number!\n"
	} elsif ($options_set{maxMISS}>1 or $options_set{maxMISS}<0) {
		die "ERROR: the SNP missing rate must be between 0 and 1 (0<=maxMISS<=1)!\n"
	} else {
		$options_set{maxMISS}+=1e-06
	}
} else {
	print STDERR "WARNING: No maximum missing rate is specified and hence no filtering of SNPs will be performed!\n";
	$options_set{maxMISS}=2
}

if (exists $options_set{contig} or exists $options_set{c}){ # check if contig name is given or not
	if (not exists $options_set{contig}){
		$options_set{contig}=$options_set{c};
		delete $options_set{c}
	}
}

(-f $ARGV[0]) or die "ERROR: could not find the bam file ${ARGV[0]}!\n";
(-f $ARGV[1]) or die "ERROR: could not find the VCF file ${ARGV[1]}!\n";

if (exists $options_set{contig}){  # extract contig reads from the original BAM fmile 
	if (! -f "$ARGV[0].bai"){
		print STDERR "WARNING: Cannot find the index of the input bam file: $ARGV[0].bai. samtools index is being run...\n";
		my $indx=system("samtools index $ARGV[0]");
		$indx ? die "ERROR: Indexing failed!\n$!\n" : print STDERR "Succeffully indexed the bam file...\n"
	}
	my $get_reads = system("samtools idxstats $ARGV[0]|grep -m 1 $options_set{contig} |awk '{print \"\\\"\"\$1\":0-\"\$2\"\\\"\"}'|xargs samtools view -bh $ARGV[0] > $contig_bam");
	!$get_reads or die "Cannot extract $options_set{contig} reads from $ARGV[0]: $!\n";
	$ARGV[0] = $contig_bam;
} else {
	print STDERR "WARNING: No contig name has been specified! The BAM and VCF files must contain only one contig!\n"
}

if ($options_set{maxMISS}<2 or exists $options_set{contig}) { # filter VCF file if contig and/or maxMISS are given
	my $ctg_found = 0; 
	open($fh2,">",$contig_vcf);  # extract contig variants from the original VCF file 
	open(my $fh_vcf, "<", $ARGV[1]) or die "Cannot open $ARGV[1]: $!\n";
	SEARCH: {
		while (my $line = <$fh_vcf>){ 
			chomp $line;
			if (grep {/^#/} $line){
				printf $fh2 "%s\n", $line;
			} else {
				my $vcf_ctg = (split /\t/, $line)[0];
				if (!(exists $options_set{contig}) or $vcf_ctg eq $options_set{contig}) {
					$ctg_found+=1;
					my $missing = 0;
					if ($options_set{maxMISS}<2) {  # filter SNPs using maxMISS rate
						my @genos = split /\t/, $line;
						my @non_missing_genos = grep {/([0-9](\/|\|))+/} map {(split /:/, $_)[0]} @genos[9..$#genos];
						$missing = ($#{genos}-9-$#{non_missing_genos})/($#{genos}+1-9);
					}
					if ($missing < $options_set{maxMISS}){
						printf $fh2 "%s\n", $line
					}
				} elsif ($ctg_found>0) { # avoid reading the rest of the VCF file after finding the desired contig
					last SEARCH;
				}
			}
		}
	}
	close($fh_vcf);
	close($fh2);
}

my $break_vcf = !system("${PopPoly_Path}/break_vcf.sh ${contig_vcf} ${contig_vcf_broken}"); # break complex variants and throw out indels using break_vcf.py
$break_vcf or die "Couldn't break the complex variants and throw out indels from the VCF file: $!\n";
$ARGV[1] = $contig_vcf_broken;

if (exists $options_set{contig}) {delete $options_set{contig}} # not passed to PopPoly main
if (exists $options_set{maxMISS}) {delete $options_set{maxMISS}} # not passed to PopPoly main
	
my %new_options_set;
for (keys %options_set){
	if (length($_)==1){
		$new_options_set{"-$_"}=$options_set{$_}
	} else {
		$new_options_set{"--$_"}=$options_set{$_}
	}
}
undef %options_set;
my $phasing = system((join " ","${PopPoly_Path}/main.py", @ARGV)." ".(join " ", map { $new_options_set{$_} eq "" ? "$_" : "$_ $new_options_set{$_}" } (keys %new_options_set))); 
!$phasing or die "ERROR: failed to run PopPoly: $!\n";

END {
    unlink "${contig_bam}" if (-f "${contig_bam}");
    unlink "${contig_vcf}" if (-f "${contig_vcf}");
    unlink "${contig_vcf_broken}" if (-f "${contig_vcf_broken}");
}