PopPoly_wrapped.pl 7.26 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84
#!/usr/bin/env perl
# A simple perl wrapper for PopPoly to be able to specify the contig name and maximum genotype missing rate
# for haplotyping.
# Useful when multiple contigs are present in the bam and VCF files, which causes error for haplotyping.
# This wrapper extracts the desired contig from the bam and VCF files and runs PopPoly with the passed input options.
# This script should be stored in the same folder as PopPoly main.py.
# Written by Ehsan Motazedi, Wageningen University & Research, 20-07-2018.
# Last modified: 20-07-2018 

use strict;
use warnings;
use File::Temp qw/ tempfile tempdir /;
use Cwd 'abs_path';
use Scalar::Util qw(looks_like_number);

sub signal_handler{
    die "\nCaught a signal: $!\n";
}

my @PATH = split /\//, abs_path($0);
my $PopPoly_Path = join '/', @PATH[0..$#PATH-1];
my $hlp_msg = "\nA simple perl wrapper for PopPoly, so that the contig name and the maximum genotype missing rate\nof each SNP can be specified for haplotyping. Useful when multiple contigs are present in the BAM\nand VCF files, which are not handled directly by PopPoly. Also, useful to exclude SNPs from phasings\nbased upon their genotype missing rate in the VCF file.\n\nThis wrapper extracts the desired contig from the bam and VCF files, performs SNP filtering and runs PopPoly\nwith the passed input options. To run this script it should be stored in the same folder as PopPoly main.py.\n\nInput parameters:\n\n\tparameters passed to PopPoly (see ./main.py -h)\n\n\t-h, --help\tprint this help message and exit.\n\t-c, --contig\tname of the contig to be phased.\n\t--maxMISS\tmaximum tolerable genotype missing rate for each SNP to be included in phasing.\n\nWritten by Ehsan Motazedi, Wageningen University & Research, 20-07-2018.";

$SIG{INT}  = \&signal_handler;
$SIG{TERM} = \&signal_handler;

my %options_set;
my @to_remove;
my @iftrue_opts = ("t","top","skip","filter","impute","redose");
my @toget_opts = ("c","contig","a", "aggressive", "e", "error", "r", "rho", "k", "kappa", "w", "warmup", "v", "verror","mmq","mbq","maxIS","maxMISS");
my $n=-1;

my ($fh1, $contig_bam) = tempfile(TEMPLATE => "tmp_bamXXXX", DIR=>"/tmp", SUFFIX => ".bam"); # tmp file to store contig reads if contig is specified
close($fh1);
my ($fh2, $contig_vcf) = tempfile(TEMPLATE => "tmp_vcfXXXX", DIR=>"/tmp", SUFFIX => ".vcf"); # tmp file to store contig variants if contig is specified
close($fh2);

while ($n < $#ARGV){ #separate positional & optional arguments
	$n+=1;
	if (grep {/^-/} $ARGV[$n]){
		(my $option= $ARGV[$n]) =~ s/^-+//;
		if ($option eq "h" or $option eq "help"){
			print $hlp_msg."\n";
			exit 0;
		}
		if (grep {$_ eq $option} @iftrue_opts){
			$options_set{$option} = "";
			push @to_remove, $n;
		} elsif (grep {$_ eq $option} @toget_opts){
			if ($n == $#ARGV) {die "ERROR: No value passed to $ARGV[$n]!\n"}
			if (grep {/^-/} $ARGV[$n+1]) {die "ERROR: Invalid value $ARGV[$n+1] passed to $ARGV[$n]!\n"}
			$options_set{$option} = $ARGV[$n+1];
			push @to_remove, ($n,$n+1);
			$n+=1
		} else {die "ERROR: Invalid option $ARGV[$n]!\n"}
	}
}

for (reverse @to_remove){ # remove optional arguments (now saved in %options_set) from the input to get positional arguments
	splice @ARGV, $_, 1
}

if ($#ARGV == -1){ # check correct number of positional arguments
	die "ERROR: No bam file, VCF file and output name is given!\n"
} elsif ($#ARGV < 1){
	die "ERROR: No VCF file and output name is given!\n"
} elsif ($#ARGV < 2){
	die "ERROR: No output name is given!\n"
} elsif ($#ARGV > 2){
	die "ERROR: Too many positional arguments given!\n"
}

for (my $i=0; $i<14; $i+=2){ # check doubly given value getting options
	if (exists $options_set{$toget_opts[$i]} and exists $options_set{$toget_opts[$i+1]}) {die "ERROR: Only one of -$toget_opts[$i] and --$toget_opts[$i+1] allowed!\n"}
}
for (my $i=0; $i<2; $i+=2){ # check doubly given set true options 
	if (exists $options_set{$toget_opts[$i]} and exists $options_set{$toget_opts[$i+1]}) {die "ERROR: Only one of -$toget_opts[$i] and --$toget_opts[$i+1] allowed!\n"}
}

if (exists $options_set{maxMISS}){ # check the given SNP missing rate
	if (not looks_like_number($options_set{maxMISS})){
		die "ERROR: the SNP missing rate must be a number!\n"
	} elsif ($options_set{maxMISS}>1 or $options_set{maxMISS}<0) {
		die "ERROR: the SNP missing rate must be between 0 and 1 (0<=maxMISS<=1)!\n"
Motazedi's avatar
Motazedi committed
85 86
	} else {
		$options_set{maxMISS}+=1e-06
87 88 89 90 91 92 93 94 95 96 97 98 99 100
	}
} else {
	print STDERR "WARNING: No maximum missing rate is specified and hence no filtering of SNPs will be performed!\n";
	$options_set{maxMISS}=2
}

if (exists $options_set{contig} or exists $options_set{c}){ # check if contig name is given or not
	if (not exists $options_set{contig}){
		$options_set{contig}=$options_set{c};
		delete $options_set{c}
	}
}

if (exists $options_set{contig}){  # extract contig reads from the original BAM fmile 
Motazedi's avatar
Motazedi committed
101 102 103 104 105 106
	if (! -f "$ARGV[0].bai"){
		print STDERR "WARNING: Cannot find the index of the input bam file: $ARGV[0].bai. samtools index is being run...\n";
		my $indx=system("samtools index $ARGV[0]");
		$indx ? die "ERROR: Indexing failed!\n$!\n" : print STDERR "Succeffully indexed the bam file...\n"
	}
	my $get_reads = system("samtools idxstats $ARGV[0]|grep -m 1 $options_set{contig} |awk '{print \"\\\"\"\$1\":0-\"\$2\"\\\"\"}'|xargs samtools view -bh $ARGV[0] > $contig_bam");
107 108 109 110 111 112 113
	!$get_reads or die "Cannot extract $options_set{contig} reads from $ARGV[0]: $!\n";
	$ARGV[0] = $contig_bam;
} else {
	print STDERR "WARNING: No contig name has been specified! The BAM and VCF files must contain only one contig!\n"
}

if ($options_set{maxMISS}<2 or exists $options_set{contig}) { # filter VCF file if contig and/or maxMISS are given
Motazedi's avatar
Motazedi committed
114
	my $ctg_found = 0; 
115 116
	open($fh2,">",$contig_vcf);  # extract contig variants from the original VCF file 
	open(my $fh_vcf, "<", $ARGV[1]) or die "Cannot open $ARGV[1]: $!\n";
Motazedi's avatar
Motazedi committed
117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
	SEARCH: {
		while (my $line = <$fh_vcf>){ 
			chomp $line;
			if (grep {/^#/} $line){
				printf $fh2 "%s\n", $line;
			} else {
				my $vcf_ctg = (split /\t/, $line)[0];
				if (!(exists $options_set{contig}) or $vcf_ctg eq $options_set{contig}) {
					$ctg_found+=1;
					my $missing = 0;
					if ($options_set{maxMISS}<2) {  # filter SNPs using maxMISS rate
						my @genos = split /\t/, $line;
						my @non_missing_genos = grep {/([0-9](\/|\|))+/} map {(split /:/, $_)[0]} @genos[9..$#genos];
						$missing = ($#{genos}-9-$#{non_missing_genos})/($#{genos}+1-9);
					}
					if ($missing < $options_set{maxMISS}){
						printf $fh2 "%s\n", $line
					}
				} elsif ($ctg_found>0) { # avoid reading the rest of the VCF file after finding the desired contig
					last SEARCH;
137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164
				}
			}
		}
	}
	close($fh_vcf);
	close($fh2);
	$ARGV[1] = $contig_vcf;
}

if (exists $options_set{contig}) {delete $options_set{contig}} # not passed to PopPoly main
if (exists $options_set{maxMISS}) {delete $options_set{maxMISS}} # not passed to PopPoly main
	
my %new_options_set;
for (keys %options_set){
	if (length($_)==1){
		$new_options_set{"-$_"}=$options_set{$_}
	} else {
		$new_options_set{"--$_"}=$options_set{$_}
	}
}
undef %options_set;
my $phasing = system((join " ","${PopPoly_Path}/main.py", @ARGV)." ".(join " ", map { $new_options_set{$_} eq "" ? "$_" : "$_ $new_options_set{$_}" } (keys %new_options_set))); 
!$phasing or die "ERROR: failed to run PopPoly: $!\n";

END {
    unlink "${contig_bam}" if (-f "${contig_bam}");
    unlink "${contig_vcf}" if (-f "${contig_vcf}");
}