Commit 72acccde authored by Motazedi's avatar Motazedi

Improvement for large files

parent 7627a481
...@@ -34,8 +34,6 @@ my ($fh1, $contig_bam) = tempfile(TEMPLATE => "tmp_bamXXXX", DIR=>"/tmp", SUFFIX ...@@ -34,8 +34,6 @@ my ($fh1, $contig_bam) = tempfile(TEMPLATE => "tmp_bamXXXX", DIR=>"/tmp", SUFFIX
close($fh1); close($fh1);
my ($fh2, $contig_vcf) = tempfile(TEMPLATE => "tmp_vcfXXXX", DIR=>"/tmp", SUFFIX => ".vcf"); # tmp file to store contig variants if contig is specified my ($fh2, $contig_vcf) = tempfile(TEMPLATE => "tmp_vcfXXXX", DIR=>"/tmp", SUFFIX => ".vcf"); # tmp file to store contig variants if contig is specified
close($fh2); close($fh2);
my ($fh3, $contig_sam) = tempfile(TEMPLATE => "tmp_vcfXXXX", DIR=>"/tmp", SUFFIX => ".sam"); # tmp file to store contig reads in sam format if contig is specified
close($fh3);
while ($n < $#ARGV){ #separate positional & optional arguments while ($n < $#ARGV){ #separate positional & optional arguments
$n+=1; $n+=1;
...@@ -84,6 +82,8 @@ if (exists $options_set{maxMISS}){ # check the given SNP missing rate ...@@ -84,6 +82,8 @@ if (exists $options_set{maxMISS}){ # check the given SNP missing rate
die "ERROR: the SNP missing rate must be a number!\n" die "ERROR: the SNP missing rate must be a number!\n"
} elsif ($options_set{maxMISS}>1 or $options_set{maxMISS}<0) { } elsif ($options_set{maxMISS}>1 or $options_set{maxMISS}<0) {
die "ERROR: the SNP missing rate must be between 0 and 1 (0<=maxMISS<=1)!\n" die "ERROR: the SNP missing rate must be between 0 and 1 (0<=maxMISS<=1)!\n"
} else {
$options_set{maxMISS}+=1e-06
} }
} else { } else {
print STDERR "WARNING: No maximum missing rate is specified and hence no filtering of SNPs will be performed!\n"; print STDERR "WARNING: No maximum missing rate is specified and hence no filtering of SNPs will be performed!\n";
...@@ -98,32 +98,42 @@ if (exists $options_set{contig} or exists $options_set{c}){ # check if contig na ...@@ -98,32 +98,42 @@ if (exists $options_set{contig} or exists $options_set{c}){ # check if contig na
} }
if (exists $options_set{contig}){ # extract contig reads from the original BAM fmile if (exists $options_set{contig}){ # extract contig reads from the original BAM fmile
my $get_reads=system("samtools view -H $ARGV[0] >$contig_sam; samtools view $ARGV[0] |awk '{if (\$3==\"$options_set{contig}\"){print \$0}}' >>$contig_sam;samtools view -bh $contig_sam -o $contig_bam"); if (! -f "$ARGV[0].bai"){
print STDERR "WARNING: Cannot find the index of the input bam file: $ARGV[0].bai. samtools index is being run...\n";
my $indx=system("samtools index $ARGV[0]");
$indx ? die "ERROR: Indexing failed!\n$!\n" : print STDERR "Succeffully indexed the bam file...\n"
}
my $get_reads = system("samtools idxstats $ARGV[0]|grep -m 1 $options_set{contig} |awk '{print \"\\\"\"\$1\":0-\"\$2\"\\\"\"}'|xargs samtools view -bh $ARGV[0] > $contig_bam");
!$get_reads or die "Cannot extract $options_set{contig} reads from $ARGV[0]: $!\n"; !$get_reads or die "Cannot extract $options_set{contig} reads from $ARGV[0]: $!\n";
print("\n");
$ARGV[0] = $contig_bam; $ARGV[0] = $contig_bam;
} else { } else {
print STDERR "WARNING: No contig name has been specified! The BAM and VCF files must contain only one contig!\n" print STDERR "WARNING: No contig name has been specified! The BAM and VCF files must contain only one contig!\n"
} }
if ($options_set{maxMISS}<2 or exists $options_set{contig}) { # filter VCF file if contig and/or maxMISS are given if ($options_set{maxMISS}<2 or exists $options_set{contig}) { # filter VCF file if contig and/or maxMISS are given
my $ctg_found = 0;
open($fh2,">",$contig_vcf); # extract contig variants from the original VCF file open($fh2,">",$contig_vcf); # extract contig variants from the original VCF file
open(my $fh_vcf, "<", $ARGV[1]) or die "Cannot open $ARGV[1]: $!\n"; open(my $fh_vcf, "<", $ARGV[1]) or die "Cannot open $ARGV[1]: $!\n";
while (my $line = <$fh_vcf>){ SEARCH: {
chomp $line; while (my $line = <$fh_vcf>){
if (grep {/^#/} $line){ chomp $line;
printf $fh2 "%s\n", $line; if (grep {/^#/} $line){
} else { printf $fh2 "%s\n", $line;
my $vcf_ctg = (split /\t/, $line)[0]; } else {
if (!(exists $options_set{contig}) or $vcf_ctg eq $options_set{contig}) { my $vcf_ctg = (split /\t/, $line)[0];
my $missing = 0; if (!(exists $options_set{contig}) or $vcf_ctg eq $options_set{contig}) {
if ($options_set{maxMISS}<2) { # filter SNPs using maxMISS rate $ctg_found+=1;
my @genos = split /\t/, $line; my $missing = 0;
my @non_missing_genos = grep {/^([0-9](\/|\|))+$/} map {(split /:/, $_)[0]} @genos[9..$#genos]; if ($options_set{maxMISS}<2) { # filter SNPs using maxMISS rate
my $missing = ($#{genos}-9-$#{non_missing_genos})/($#{genos}+1-9) my @genos = split /\t/, $line;
} my @non_missing_genos = grep {/([0-9](\/|\|))+/} map {(split /:/, $_)[0]} @genos[9..$#genos];
if ($missing < ($options_set{maxMISS}+1E-06)){ $missing = ($#{genos}-9-$#{non_missing_genos})/($#{genos}+1-9);
printf $fh2 "%s\n", $line }
if ($missing < $options_set{maxMISS}){
printf $fh2 "%s\n", $line
}
} elsif ($ctg_found>0) { # avoid reading the rest of the VCF file after finding the desired contig
last SEARCH;
} }
} }
} }
...@@ -149,7 +159,6 @@ my $phasing = system((join " ","${PopPoly_Path}/main.py", @ARGV)." ".(join " ", ...@@ -149,7 +159,6 @@ my $phasing = system((join " ","${PopPoly_Path}/main.py", @ARGV)." ".(join " ",
!$phasing or die "ERROR: failed to run PopPoly: $!\n"; !$phasing or die "ERROR: failed to run PopPoly: $!\n";
END { END {
unlink "${contig_sam}" if (-f "${contig_sam}");
unlink "${contig_bam}" if (-f "${contig_bam}"); unlink "${contig_bam}" if (-f "${contig_bam}");
unlink "${contig_vcf}" if (-f "${contig_vcf}"); unlink "${contig_vcf}" if (-f "${contig_vcf}");
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment