Commit 72acccde authored by Motazedi's avatar Motazedi

Improvement for large files

parent 7627a481
......@@ -34,8 +34,6 @@ my ($fh1, $contig_bam) = tempfile(TEMPLATE => "tmp_bamXXXX", DIR=>"/tmp", SUFFIX
close($fh1);
my ($fh2, $contig_vcf) = tempfile(TEMPLATE => "tmp_vcfXXXX", DIR=>"/tmp", SUFFIX => ".vcf"); # tmp file to store contig variants if contig is specified
close($fh2);
my ($fh3, $contig_sam) = tempfile(TEMPLATE => "tmp_vcfXXXX", DIR=>"/tmp", SUFFIX => ".sam"); # tmp file to store contig reads in sam format if contig is specified
close($fh3);
while ($n < $#ARGV){ #separate positional & optional arguments
$n+=1;
......@@ -84,6 +82,8 @@ if (exists $options_set{maxMISS}){ # check the given SNP missing rate
die "ERROR: the SNP missing rate must be a number!\n"
} elsif ($options_set{maxMISS}>1 or $options_set{maxMISS}<0) {
die "ERROR: the SNP missing rate must be between 0 and 1 (0<=maxMISS<=1)!\n"
} else {
$options_set{maxMISS}+=1e-06
}
} else {
print STDERR "WARNING: No maximum missing rate is specified and hence no filtering of SNPs will be performed!\n";
......@@ -98,32 +98,42 @@ if (exists $options_set{contig} or exists $options_set{c}){ # check if contig na
}
if (exists $options_set{contig}){ # extract contig reads from the original BAM fmile
my $get_reads=system("samtools view -H $ARGV[0] >$contig_sam; samtools view $ARGV[0] |awk '{if (\$3==\"$options_set{contig}\"){print \$0}}' >>$contig_sam;samtools view -bh $contig_sam -o $contig_bam");
if (! -f "$ARGV[0].bai"){
print STDERR "WARNING: Cannot find the index of the input bam file: $ARGV[0].bai. samtools index is being run...\n";
my $indx=system("samtools index $ARGV[0]");
$indx ? die "ERROR: Indexing failed!\n$!\n" : print STDERR "Succeffully indexed the bam file...\n"
}
my $get_reads = system("samtools idxstats $ARGV[0]|grep -m 1 $options_set{contig} |awk '{print \"\\\"\"\$1\":0-\"\$2\"\\\"\"}'|xargs samtools view -bh $ARGV[0] > $contig_bam");
!$get_reads or die "Cannot extract $options_set{contig} reads from $ARGV[0]: $!\n";
print("\n");
$ARGV[0] = $contig_bam;
} else {
print STDERR "WARNING: No contig name has been specified! The BAM and VCF files must contain only one contig!\n"
}
if ($options_set{maxMISS}<2 or exists $options_set{contig}) { # filter VCF file if contig and/or maxMISS are given
my $ctg_found = 0;
open($fh2,">",$contig_vcf); # extract contig variants from the original VCF file
open(my $fh_vcf, "<", $ARGV[1]) or die "Cannot open $ARGV[1]: $!\n";
while (my $line = <$fh_vcf>){
chomp $line;
if (grep {/^#/} $line){
printf $fh2 "%s\n", $line;
} else {
my $vcf_ctg = (split /\t/, $line)[0];
if (!(exists $options_set{contig}) or $vcf_ctg eq $options_set{contig}) {
my $missing = 0;
if ($options_set{maxMISS}<2) { # filter SNPs using maxMISS rate
my @genos = split /\t/, $line;
my @non_missing_genos = grep {/^([0-9](\/|\|))+$/} map {(split /:/, $_)[0]} @genos[9..$#genos];
my $missing = ($#{genos}-9-$#{non_missing_genos})/($#{genos}+1-9)
}
if ($missing < ($options_set{maxMISS}+1E-06)){
printf $fh2 "%s\n", $line
SEARCH: {
while (my $line = <$fh_vcf>){
chomp $line;
if (grep {/^#/} $line){
printf $fh2 "%s\n", $line;
} else {
my $vcf_ctg = (split /\t/, $line)[0];
if (!(exists $options_set{contig}) or $vcf_ctg eq $options_set{contig}) {
$ctg_found+=1;
my $missing = 0;
if ($options_set{maxMISS}<2) { # filter SNPs using maxMISS rate
my @genos = split /\t/, $line;
my @non_missing_genos = grep {/([0-9](\/|\|))+/} map {(split /:/, $_)[0]} @genos[9..$#genos];
$missing = ($#{genos}-9-$#{non_missing_genos})/($#{genos}+1-9);
}
if ($missing < $options_set{maxMISS}){
printf $fh2 "%s\n", $line
}
} elsif ($ctg_found>0) { # avoid reading the rest of the VCF file after finding the desired contig
last SEARCH;
}
}
}
......@@ -149,7 +159,6 @@ my $phasing = system((join " ","${PopPoly_Path}/main.py", @ARGV)." ".(join " ",
!$phasing or die "ERROR: failed to run PopPoly: $!\n";
END {
unlink "${contig_sam}" if (-f "${contig_sam}");
unlink "${contig_bam}" if (-f "${contig_bam}");
unlink "${contig_vcf}" if (-f "${contig_vcf}");
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment