Commit c9ab2fce authored by Motazedi

PopPoly method for diploid & polyploid haplotyping in F1 populations

#!/usr/bin/env perl
# A simple perl wrapper for PopPoly to be able to specify the contig name and maximum genotype missing rate
# for haplotyping.
# Useful when multiple contigs are present in the bam and VCF files, which causes errors in haplotyping.
# This wrapper extracts the desired contig from the bam and VCF files, extracts SNPs and breaks down complex variants
# in the VCF file, filters out SNPs based on their genotype missing rate and runs PopPoly with the passed input options.
# This script should be stored in the same folder as PopPoly main.py and break_vcf.sh.
# Written by Ehsan Motazedi, Wageningen University & Research, 20-07-2018.
# Last modified: 15-08-2018
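# Example invocation (hypothetical file names; see the help message below for the available options):
#   ./PopPoly_wrapped.pl reads.bam variants.vcf output_dir -c contig_1 --maxMISS 0.2 --impute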
use strict;
use warnings;
use File::Temp qw/ tempfile tempdir /;
use Cwd 'abs_path';
use Scalar::Util qw(looks_like_number);
sub signal_handler{
die "\nCaught a signal: $!\n";
}
my @PATH = split /\//, abs_path($0);
my $PopPoly_Path = join '/', @PATH[0..$#PATH-1];
my $hlp_msg = "\nA simple perl wrapper for PopPoly, so that the contig name and the maximum genotype missing rate\nof each SNP can be specified for haplotyping. Useful when multiple contigs are present in the BAM\nand VCF files, which are not handled directly by PopPoly. Also, useful to exclude SNPs from phasings\nbased upon their genotype missing rate in the VCF file.\n\nThis wrapper extracts the desired contig from the bam and VCF files, performs SNP filtering and runs PopPoly\nwith the passed input options. To run this script it should be stored in the same folder as PopPoly main.py.\n\nInput parameters:\n\n\tparameters passed to PopPoly (see ./main.py -h)\n\n\t-h, --help\tprint this help message and exit.\n\t-c, --contig\tname of the contig to be phased.\n\t--maxMISS\tmaximum tolerable genotype missing rate for each SNP to be included in phasing.\n\nWritten by Ehsan Motazedi, Wageningen University & Research, 20-07-2018.";
$SIG{INT} = \&signal_handler;
$SIG{TERM} = \&signal_handler;
my %options_set;
my @to_remove;
my @iftrue_opts = ("t","top","skip","filter","impute","redose");
my @toget_opts = ("c","contig","a", "aggressive", "e", "error", "r", "rho", "k", "kappa", "w", "warmup", "v", "verror", "P1", "P2", "mmq", "mbq", "maxIS", "maxMISS");
my $n=-1;
my ($fh1, $contig_bam) = tempfile(TEMPLATE => "tmp_bamXXXX", DIR=>"/tmp", SUFFIX => ".bam"); # tmp file to store contig reads if contig is specified
close($fh1);
my ($fh2, $contig_vcf) = tempfile(TEMPLATE => "tmp_vcfXXXX", DIR=>"/tmp", SUFFIX => ".vcf"); # tmp file to store contig variants if contig is specified
close($fh2);
my ($fh3, $contig_vcf_broken) = tempfile(TEMPLATE => "tmp_vcf_brokenXXXX", DIR=>"/tmp", SUFFIX => ".vcf"); # tmp file to store contig variants if contig is specified
close($fh3);
while ($n < $#ARGV){ #separate positional & optional arguments
$n+=1;
if (grep {/^-/} $ARGV[$n]){
(my $option= $ARGV[$n]) =~ s/^-+//;
if ($option eq "h" or $option eq "help"){
print $hlp_msg."\n";
exit 0;
}
if (grep {$_ eq $option} @iftrue_opts){
$options_set{$option} = "";
push @to_remove, $n;
} elsif (grep {$_ eq $option} @toget_opts){
if ($n == $#ARGV) {die "ERROR: No value passed to $ARGV[$n]!\n"}
if (grep {/^-/} $ARGV[$n+1]) {die "ERROR: Invalid value $ARGV[$n+1] passed to $ARGV[$n]!\n"}
$options_set{$option} = $ARGV[$n+1];
push @to_remove, ($n,$n+1);
$n+=1
} else {die "ERROR: Invalid option $ARGV[$n]!\n"}
}
}
for (reverse @to_remove){ # remove optional arguments (now saved in %options_set) from the input to get positional arguments
splice @ARGV, $_, 1
}
if ($#ARGV == -1){ # check correct number of positional arguments
die "ERROR: No bam file, VCF file and output name is given!\n"
} elsif ($#ARGV < 1){
die "ERROR: No VCF file and output name is given!\n"
} elsif ($#ARGV < 2){
die "ERROR: No output name is given!\n"
} elsif ($#ARGV > 2){
die "ERROR: Too many positional arguments given!\n"
}
for (my $i=0; $i<14; $i+=2){ # check doubly given value getting options
if (exists $options_set{$toget_opts[$i]} and exists $options_set{$toget_opts[$i+1]}) {die "ERROR: Only one of -$toget_opts[$i] and --$toget_opts[$i+1] allowed!\n"}
}
for (my $i=0; $i<2; $i+=2){ # check doubly given set true options
if (exists $options_set{$iftrue_opts[$i]} and exists $options_set{$iftrue_opts[$i+1]}) {die "ERROR: Only one of -$iftrue_opts[$i] and --$iftrue_opts[$i+1] allowed!\n"}
}
if (exists $options_set{maxMISS}){ # check the given SNP missing rate
if (not looks_like_number($options_set{maxMISS})){
die "ERROR: the SNP missing rate must be a number!\n"
} elsif ($options_set{maxMISS}>1 or $options_set{maxMISS}<0) {
die "ERROR: the SNP missing rate must be between 0 and 1 (0<=maxMISS<=1)!\n"
} else {
		$options_set{maxMISS}+=1e-06 # small epsilon so that rates exactly equal to maxMISS pass the strict '<' comparison below
}
} else {
print STDERR "WARNING: No maximum missing rate is specified and hence no filtering of SNPs will be performed!\n";
	$options_set{maxMISS}=2 # sentinel: missing rates never exceed 1, so no SNP is filtered
}
if (exists $options_set{contig} or exists $options_set{c}){ # check if contig name is given or not
if (not exists $options_set{contig}){
$options_set{contig}=$options_set{c};
delete $options_set{c}
}
}
(-f $ARGV[0]) or die "ERROR: could not find the bam file ${ARGV[0]}!\n";
(-f $ARGV[1]) or die "ERROR: could not find the VCF file ${ARGV[1]}!\n";
if (exists $options_set{contig}){ # extract contig reads from the original BAM file
if (! -f "$ARGV[0].bai"){
print STDERR "WARNING: Cannot find the index of the input bam file: $ARGV[0].bai. samtools index is being run...\n";
my $indx=system("samtools index $ARGV[0]");
		$indx ? die "ERROR: Indexing failed!\n$!\n" : print STDERR "Successfully indexed the bam file...\n"
}
my $get_reads = system("samtools idxstats $ARGV[0]|grep -m 1 $options_set{contig} |awk '{print \"\\\"\"\$1\":0-\"\$2\"\\\"\"}'|xargs samtools view -bh $ARGV[0] > $contig_bam");
!$get_reads or die "Cannot extract $options_set{contig} reads from $ARGV[0]: $!\n";
$ARGV[0] = $contig_bam;
} else {
print STDERR "WARNING: No contig name has been specified! The BAM and VCF files must contain only one contig!\n"
}
if ($options_set{maxMISS}<2 or exists $options_set{contig}) { # filter VCF file if contig and/or maxMISS are given
my $ctg_found = 0;
open($fh2,">",$contig_vcf); # extract contig variants from the original VCF file
open(my $fh_vcf, "<", $ARGV[1]) or die "Cannot open $ARGV[1]: $!\n";
SEARCH: {
while (my $line = <$fh_vcf>){
chomp $line;
if (grep {/^#/} $line){
printf $fh2 "%s\n", $line;
} else {
my $vcf_ctg = (split /\t/, $line)[0];
if (!(exists $options_set{contig}) or $vcf_ctg eq $options_set{contig}) {
$ctg_found+=1;
my $missing = 0;
if ($options_set{maxMISS}<2) { # filter SNPs using maxMISS rate
my @genos = split /\t/, $line;
my @non_missing_genos = grep {/([0-9](\/|\|))+/} map {(split /:/, $_)[0]} @genos[9..$#genos];
$missing = ($#{genos}-9-$#{non_missing_genos})/($#{genos}+1-9);
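						# e.g. with 10 sample columns (fields 10 onward) of which 2 lack a genotype call, $missing = 2/10 = 0.2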
}
if ($missing < $options_set{maxMISS}){
printf $fh2 "%s\n", $line
}
} elsif ($ctg_found>0) { # avoid reading the rest of the VCF file after finding the desired contig
last SEARCH;
}
}
}
}
close($fh_vcf);
close($fh2);
}
my $break_vcf = !system("${PopPoly_Path}/break_vcf.sh ${contig_vcf} ${contig_vcf_broken}"); # break complex variants and throw out indels using break_vcf.sh
$break_vcf or die "Couldn't break the complex variants and throw out indels from the VCF file: $!\n";
$ARGV[1] = $contig_vcf_broken;
if (exists $options_set{contig}) {delete $options_set{contig}} # not passed to PopPoly main
if (exists $options_set{maxMISS}) {delete $options_set{maxMISS}} # not passed to PopPoly main
my %new_options_set;
for (keys %options_set){
if (length($_)==1){
$new_options_set{"-$_"}=$options_set{$_}
} else {
$new_options_set{"--$_"}=$options_set{$_}
}
}
undef %options_set;
my $phasing = system((join " ","${PopPoly_Path}/main.py", @ARGV)." ".(join " ", map { $new_options_set{$_} eq "" ? "$_" : "$_ $new_options_set{$_}" } (keys %new_options_set)));
!$phasing or die "ERROR: failed to run PopPoly: $!\n";
END {
unlink "${contig_bam}" if (-f "${contig_bam}");
unlink "${contig_vcf}" if (-f "${contig_vcf}");
unlink "${contig_vcf_broken}" if (-f "${contig_vcf_broken}");
}
PopPoly, version 2.2, Ehsan Motazedi (ehsan.motazedi@gmail.com), last modified: Aug 15 2018
Introduction
============
**PopPoly** (executable *main.py*) is a tool for haplotype estimation in polyploid F1 populations. PopPoly uses all of the population reads to estimate the maximum likelihood parental haplotypes, from which the offspring haplotypes are selected using the MEC criterion.
The algorithm is based on a Bayesian framework that chooses the most likely parental haplotypes given the sequence reads, applying Mendelian inheritance rules (assuming no recombination between the SNPs) to weight each candidate parental phasing and, subsequently, to choose the offspring phasings from the parental haplotypes. It can also re-estimate the allele dosages, either for all SNPs or only for those with missing genotypes, using the Mendelian rules and the sequence reads. The names of the parents in the F1 population must be provided as input parameters, with the possibility to specify just one parent for self-fertilizing populations; otherwise, the first two samples registered in the BAM header are taken to be the parents. Also, the VCF file must contain only SNPs, that is, no complex variants or indels. This can be achieved by running the *break_vcf.sh* script on the original VCF file.
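Judging from how the wrapper invokes it, *break_vcf.sh* takes the original VCF as its first argument and writes the SNP-only VCF to its second (file names here are hypothetical):
./break_vcf.sh original.vcf snps_only.vcf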
PopPoly is written in *Python* and is compatible with both Python 2 and Python 3. A Perl wrapper, *PopPoly_wrapped.pl*, is also available to work directly with input files that contain multiple contigs, in which case the name of the desired contig must be specified for haplotyping through the wrapper's option. The wrapper also allows filtering SNPs based upon their genotype missing rate in the population, and it can be run on the original VCF file as it calls break_vcf.sh internally (see PopPoly_wrapped.pl -h).
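For example, a typical wrapper call could look like this (file, contig and sample names below are hypothetical):
./PopPoly_wrapped.pl reads.bam variants.vcf output_dirname -c contig_1 --maxMISS 0.25 -P1 mother_sample -P2 father_sample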
It is also possible to exclude some of the individuals from the analysis, for example if several replicates of the same sample are present under different names and one wishes to use only one of them. The space-separated names of the individuals to be excluded must be stored in the shell environment in a variable called *exclude* before running PopPoly, as follows:
export exclude="sample_1_replicate_2 sample_2_replicate_1 sample_3_replicate_2 sample_3_replicate_3"
Note that PopPoly relies on *pysam* and *numpy*, which must have been properly installed before PopPoly is called. Some users have reported problems with *numpy undefined symbol*, a discussion of which can be found [here](https://stackoverflow.com/questions/36190757/numpy-undefined-symbol-pyfpe-jbuf).
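As a quick sanity check, one may verify that both packages are importable before a run (a minimal check, not part of PopPoly itself):
python -c "import pysam, numpy"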
Input parameters:
=====================
For the input parameters, see the help of the software:
./main.py --help, -h
***Example:***
./main.py bamfile vcffile output_dirname --kappa 0.85 -a 0.8 --rho 0.2 --impute
Citation:
=====================
To cite PopPoly, please refer to *Family-based haplotype estimation and allele dosage correction for polyploids using short sequence reads*, Ehsan Motazedi, Chris Maliepaard, Richard Finkers and Dick de Ridder, 2018, bioRxiv 318196; *doi: https://doi.org/10.1101/318196*
Download:
=====================
The software is available at gitlab, Wageningen UR:
git clone git@git.wageningenur.nl:motaz001/PopPoly.git --recursive
Copyright and License:
=====================
Copyright (c) 2017, Ehsan Motazedi, Wageningen UR.
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files \(the "Software"\), to deal in the Software without restriction, including without limitation the rights to use, copy, share and modify the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. This permission is only valid for academic and personal use, i.e. not for commercial purposes.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# Make the SNP-fragment list from a multi-sample bam file and its corresponding VCF file with input options: mmq, mbq, maxIS and qoffset.
# Written by Ehsan Motazedi, Wageningen UR, 04-11-2016.
# Last modified: 22-02-2018.
import copy
import pysam
import re
import subprocess
from genotypes import getAllelesPop, getGenotypesPop
from math import log
from reads import Read
def adjust_seq(read):
""" Adjust the read sequence according to the mapped positions, i.e. get read of the insertions and clipped bases."""
cig = list(_x for _x in re.split('[0-9]{1,}', read.cigarstring) if _x)
cign = list(_x for _x in re.split('[^0-9]', read.cigarstring) if _x)
cig = list(cig[_n]*int(cign[_n]) for _n in range(0, len(cig)))
cig = ''.join(_x for _x in cig if 'D' not in _x) # deleted nucleotides from the reference are not present in the read sequence
adj_seq = []
adj_qual = []
for _n, _x in enumerate(read.seq):
if cig[_n]=='M': # Allow only match/mismatch of the read nucleotides, i.e. no clipping, no insertion.
adj_qual.append(read.qual[_n])
adj_seq.append(_x)
return ''.join(adj_seq), adj_qual
def frag_gen(varpos, allelelst, genolst, coordinates, nucleotides, qscores):
""" Generate SNP-fragments from (paired-end) reads."""
if (not coordinates) or (not nucleotides):
return Read(), {}
var_codes = []
var_num = []
var_q = []
for _n, _x in enumerate(varpos):
if _x < coordinates[0]:
continue
if _x > coordinates[-1]:
break
if _x in coordinates:
if set(genolst[_n].GetGenes()).intersection(set(['.','-'])): # Throw away missing genotypes or genotypes with one or more missing allele(s)
continue
			if len(set(genolst[_n].GetGenes()))<2: # do not include a population member's homozygous alleles in its SNP-fragments
continue
try:
var_codes.append(allelelst[_n][2][nucleotides[coordinates.index(_x)]])
var_num.append(allelelst[_n][0])
var_q.append(qscores[coordinates.index(_x)])
except KeyError: # if the called nucleotide is wrong, i.e. does not exist in VCF alleles
pass
try: # return the reads {SNP number: allele} and the associated quality scores {SNP number: Qscore}
return Read({_x:str(_y) for _x, _y in zip([var_num[0], var_num[1]]+var_num[2:], var_codes)}), {_x:str(_y) for _x, _y in zip(var_num, var_q)}
except IndexError: # throw away reads with less than 2 SNPs
return Read(), {}
class InputError(Exception):
""" Handle invalid input specifications."""
def __init__(self, msg, *args):
super(InputError, self).__init__(args)
self.msg = msg
def __str__(self):
return "InputError: {}\n".format(self.msg)
def __repr__(self):
return (self.msg,)+self.args+('\n',)
def get_frags(bamfile, vcffile, maxIS=3000, mbq=13, mmq=20, qoffset=33):
"""
mmq : minimum read mapping quality to consider a read for phasing, default 20\n
qvoffset <33/64> : quality value offset, 33/64 depending on how quality values were encoded, default is 33\n
mbq : minimum base quality to consider a base for haplotype fragment, default 13\n
maxIS : maximum insert-size for a paired-end read to be considered as a single fragment for phasing, default 3000.\n
"""
try:
all_reads = pysam.Samfile(bamfile, 'rb')
except IOError:
raise InputError('The input BAM file was not found!')
ReadHeader = subprocess.Popen(["samtools","view","-H", bamfile], shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
header, err_header = ReadHeader.communicate()
if ReadHeader.returncode!=0:
raise InputError('Failed to read the header from the bam file! Original error message:\n'+err_header)
if isinstance(header, bytes):
header=bytes.decode(header)
else:
pass
RGIDs, SMnames = [], []
	for _headerline in header.splitlines(): # parse the header of the bam file to extract the ID and SM fields of each Read Group
if _headerline[0:3]=='@RG':
RGID_added, SM_added = False, False
for _n, _field in enumerate(_headerline.split()):
if 'ID' in _field:
if not RGID_added:
RGIDs.append(''.join(_headerline.split()[_n].split(':')[1:])) # add read group ID
RGID_added = True
else:
raise InputError('Double ID fields detected in @RG header line!')
elif 'SM' in _field:
if not SM_added:
SMnames.append(''.join(_headerline.split()[_n].split(':')[1:])) # add the sample name associated with the read group ID
SM_added = True
else:
raise InputError('Double SM fields detected in @RG header line!')
if SM_added and RGID_added:
pass
elif SM_added:
raise InputError('ID field missing in @RG header line!')
elif RGID_added:
raise InputError('SM field missing in @RG header line!')
else:
raise InputError('ID and SM fields missing in @RG header line!')
if len(RGIDs)!=len(set(RGIDs)):
raise InputError('Duplicate read group IDs detected in the bam header!')
GroupedReadsWithID = [[] for _id in RGIDs] # separate reads belonging to each Read Group
for _read in all_reads:
GroupedReadsWithID[RGIDs.index(dict(_read.get_tags())['RG'])].append(_read)
GroupedReads, GroupedSM = [], [] # combine the reads with different RGID but the same SM as they are assumed to belong to the same sample
for _SMname, _ReadGroup in zip(SMnames, GroupedReadsWithID):
if _SMname not in GroupedSM:
GroupedReads.append(_ReadGroup)
GroupedSM.append(_SMname)
else:
GroupedReads[GroupedSM.index(_SMname)]+=_ReadGroup
del GroupedReadsWithID
try:
genolst = getGenotypesPop(vcffile, GroupedSM)
allelelst = getAllelesPop(vcffile, GroupedSM)
except IOError:
raise InputError('The VCF file was not found!')
except:
raise
Frag_lsts, Q_lsts = [], []
varpos = list(_x[1] for _x in allelelst)
for _group, reads in enumerate(GroupedReads):
frag_lst, q_lst = [], []
reads = sorted(reads, key= lambda _x: (_x.qname, _x.flag & 0x900)) # sort the alignments using their names, with the primary alignments being placed first.
_rnum = 0
rNUM = len(reads)
while _rnum < rNUM: # scan through the alignments to find the pairs/singles
break_mate = False
is_proper_pair = False
read = copy.deepcopy(reads[_rnum])
if read.is_unmapped or read.is_duplicate or (read.flag & 0x900): # throw away unmapped reads, duplicates and secondary/supplementary alignments
_rnum+=1
continue
try:
				if read.qname == reads[_rnum+1].qname: # means the read is paired to a mate or has multiple/supplementary alignments
					if reads[_rnum+1].is_unmapped or reads[_rnum+1].is_duplicate or (reads[_rnum+1].flag & 0x900): # if the next read is unmapped, a duplicate or not primary, skip it
pass
else:
is_proper_pair = True # means the read is paired to a proper mate
mate = copy.deepcopy(reads[_rnum+1])
_rnum+=2
else: # means the read is single
_rnum+=1
except IndexError: # could occur for the last read in the alignments' list
_rnum+=1
if is_proper_pair:
if (max(mate.positions+read.positions)-min(mate.positions+read.positions)+1)>maxIS: # Check the maximum insert-size to consider the mates as a single fragment
break_mate = True
if read.mapping_quality >= mmq:
try:
adj_seq, adj_qual = adjust_seq(read)
coordinates, nucleotides, quals = list(zip(*[(int(_x), _y, _z) for _x, _y, _z in zip(read.positions, adj_seq.upper(), list(ord(_x)-qoffset for _x in adj_qual)) if _z>=mbq]))
except ValueError as e:
if e.args[0][0:len("need more than 0 values to unpack")]=="need more than 0 values to unpack" or e.args[0][0:len("not enough values to unpack")]=="not enough values to unpack":
coordinates, nucleotides, quals= [(), (), ()]
else:
raise
else:
coordinates, nucleotides, quals = [(), (), ()]
if mate.mapping_quality >= mmq:
try:
adj_seq, adj_qual = adjust_seq(mate)
coordinates_mate, nucleotides_mate, quals_mate = list(zip(*[(int(_x), _y, _z) for _x, _y, _z in zip(mate.positions, adj_seq.upper(), list(ord(_x)-qoffset for _x in adj_qual)) if _z>=mbq]))
except ValueError as e:
if e.args[0][0:len("need more than 0 values to unpack")]=="need more than 0 values to unpack" or e.args[0][0:len("not enough values to unpack")]=="not enough values to unpack":
coordinates_mate, nucleotides_mate, quals_mate = [(), (), ()]
else:
raise
else:
coordinates_mate, nucleotides_mate, quals_mate = [(), (), ()]
if break_mate:
pass
else: # merge the sub-reads if the insert-size is less than maxIS
try:
coordinates, nucleotides, quals = list(zip(*sorted(zip(coordinates+coordinates_mate, nucleotides + nucleotides_mate, quals+quals_mate), key = lambda x: x[0])))
except ValueError as e:
if e.args[0][0:len("need more than 0 values to unpack")]=="need more than 0 values to unpack" or e.args[0][0:len("not enough values to unpack")]=="not enough values to unpack":
coordinates, nucleotides, quals = [(), (), ()]
else:
raise
else:
break_mate = True
if read.mapping_quality >= mmq:
try:
adj_seq, adj_qual = adjust_seq(read)
coordinates, nucleotides, quals = list(zip(*[(int(_x), _y, _z) for _x, _y, _z in zip(read.positions, adj_seq.upper(), list(ord(_x)-qoffset for _x in adj_qual)) if _z>=mbq]))
except ValueError as e:
if e.args[0][0:len("need more than 0 values to unpack")]=="need more than 0 values to unpack" or e.args[0][0:len("not enough values to unpack")]=="not enough values to unpack":
coordinates, nucleotides, quals = [(), (), ()]
else:
raise
else:
coordinates, nucleotides, quals = [(), (), ()]
coordinates_mate, nucleotides_mate, quals_mate = [(), (), ()]
if break_mate:
pass
else:
unique_q = []
unique_c = []
unique_n = []
for _n, _c in enumerate(coordinates): # remove the duplicates from overlapping positions
try:
if unique_c[-1]!=_c:
#print("\nN:\n",nucleotides, "\nQ:\n", quals)
unique_c.append(_c)
unique_n.append(nucleotides[_n])
unique_q.append(quals[_n])
elif unique_n[-1]==nucleotides[_n]:
unique_q[-1] = min(126-qoffset, unique_q[-1]+quals[_n])
else: # if the called nucleotides differ at overlapping sites, use the one with the highest Phred score and adjust the Phred score.
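							# The kept call is correct only if the lower-quality call was wrong and the
							# higher-quality call right: p_err = 1 - 10**(-q_low/10)*(1 - 10**(-q_high/10)),
							# converted back via Q = -10*log10(p_err) and dropped if it falls below mbq.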
if quals[_n]>unique_q[-1]:
_new_q_score = round(-10*log(1-10**(-unique_q[-1]/10)*(1-10**(-quals[_n]/10)), 10), 5) # Q=-10log(p,10)
if _new_q_score >= mbq:
unique_n[-1] = nucleotides[_n]
unique_q[-1] = _new_q_score
else:
del(unique_c[-1], unique_n[-1], unique_q[-1])
else:
_new_q_score = round(-10*log(1-(1-10**(-unique_q[-1]/10))*10**(-quals[_n]/10), 10), 5)
if _new_q_score >= mbq:
unique_q[-1] = _new_q_score
else:
del(unique_c[-1], unique_n[-1], unique_q[-1])
except IndexError:
unique_c.append(_c)
unique_n.append(nucleotides[_n])
unique_q.append(quals[_n])
coordinates, nucleotides, quals = [unique_c, unique_n, unique_q]
coordinates = list(_x+1 for _x in coordinates) # Convert the zero-based BAM coordinates to 1-based, as the coordinates are 1-based in the VCF (like the SAM format).
new_frag, new_q = frag_gen(varpos, allelelst, genolst[_group], coordinates, nucleotides, quals)
frag_lst.append(new_frag)
q_lst.append(new_q)
if break_mate:
coordinates_mate = list(_x+1 for _x in coordinates_mate)
new_frag_mate, new_q_mate = frag_gen(varpos, allelelst, genolst[_group], coordinates_mate, nucleotides_mate, quals_mate)
frag_lst.append(new_frag_mate)
q_lst.append(new_q_mate)
try:
frag_lst, q_lst = [_lst for _lst in zip(*[(_x, _y) for _x, _y in zip(frag_lst, q_lst) if not _x.isNULL()])]
except ValueError as e:
if e.args[0][0:len("need more than 0 values to unpack")]=="need more than 0 values to unpack" or e.args[0][0:len("not enough values to unpack")]=="not enough values to unpack":
frag_lst, q_lst = [], []
Frag_lsts.append(frag_lst)
Q_lsts.append(q_lst)
return Frag_lsts, Q_lsts, GroupedSM
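# A minimal usage sketch (hypothetical paths; defaults as in the signature above):
#   frag_lists, q_lists, sample_names = get_frags("reads.bam", "variants.vcf", maxIS=3000, mbq=13, mmq=20, qoffset=33)
# frag_lists[i] and q_lists[i] then hold the SNP-fragments and quality dicts of sample_names[i].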
# Written by Ehsan Motazedi, Wageningen UR, 09-11-2016.
# Last Updated: 14-10-2017
import itertools
import multiprocessing as mpthread
import numpy.random as nprnd
import random as rnd
import sys
import threading
from collections import Counter
from genotypes import Genotype, getAllelesPop
#from hapcompare import getVER
from haplotypes import Haplotypes, getMEC
from logprob import diffVec, Hamming
from logprob2 import veclog as log, loge, GetlogProb
from math import sqrt, exp, factorial
from numpy import array, divide, isnan, vectorize, delete as npdel, argwhere as npwhere
from reads import remove_last_SNP, SubReads
# Global variables for the module to store all parental transmissions to the child
transmit_parent_m = None # possible transmissions of ploidy_m//2 alleles from mother
transmit_parent_f = None # possible transmissions of ploidy_f//2 alleles from father
child_haploid = None # possible haploid genomes of the child
transmit_parent = None # possible transmissions of alleles to the child from the parents
MAXCORES = mpthread.cpu_count()-1 # Max number of available cores
NCORES = 6 # desired number of cores to use
AllGenos_m = None # All possible genotypes for mother at each position
AllGenos_f = None # All possible genotypes for father at each position
Global_Sema = None # Global Semaphore not initialized yet
def GetSemaLock(useglobal=True):
""" Return the semaphore and lock objects needed for multi threading."""
global NCORES
global MAXCORES
global Global_Sema
if min(NCORES, MAXCORES)>1:
if useglobal: # use a global semaphore
if Global_Sema is None:
Global_Sema = mpthread.BoundedSemaphore(min(NCORES, MAXCORES))
else: # use a local semaphore at each call to BranchPop
sema = mpthread.BoundedSemaphore(min(NCORES, MAXCORES))
lock = mpthread.Lock()
Parallel = True
else: # If parallel execution is not possible (as just one core could be used), concurrent execution is performed using threads.
if useglobal:
if Global_Sema is None:
Global_Sema = threading.BoundedSemaphore(NCORES)
else:
sema = threading.BoundedSemaphore(NCORES)
lock = threading.Lock()
Parallel = False
if useglobal:
		return Global_Sema, lock, Parallel
	else:
		return sema, lock, Parallel
def thread_func(sema, lock, q, func, *args):
""" thread to call func with args and put the return value in q. """
_locked = False
try:
b = func(*args)
_locked = lock.acquire()
if func==Branch:
for _x in b:
q.append(_x)
else:
for _i, _b in enumerate(b):
for _x in _b:
q[_i].append(_x)
except:
raise
finally:
if _locked:
lock.release()
sema.release()
def SetGenos(vcffile, SampleNames, ploidy_m, ploidy_f):
""" Set AllGenos_m, AllGenos_f, AllGenos_c from the vcffile. Supposed to be called by main.py only once."""
global AllGenos_m
global AllGenos_f
AllGenos = getAllelesPop(vcffile, SampleNames)
AllGenos_m = {_Alleles[0]: list(itertools.product(*(_Alleles[2].values() for _h in range(0, ploidy_m)))) for _Alleles in AllGenos}
AllGenos_f = {_Alleles[0]: list(itertools.product(*(_Alleles[2].values() for _h in range(0, ploidy_f)))) for _Alleles in AllGenos}
for AllGenos in (AllGenos_m, AllGenos_f): # Throw away duplicate genotypes at each position from each parent's possibility list
for _key in AllGenos.keys():
_value = AllGenos[_key]
_new_value = []
_current = 0
while _current<len(_value):
if Counter(_value[_current]) not in [Counter(_x) for _x in _new_value]:
_new_value.append(_value[_current])
_current+=1
AllGenos[_key]=_new_value
def reduce(function, iterable, initializer=None):
it = iter(iterable)
if initializer is None:
try:
initializer = next(it)
except StopIteration:
raise TypeError('reduce() of empty sequence with no initial value')
accum_value = initializer
for x in it:
accum_value = function(accum_value, x)
return accum_value
def remove_last_SNP_vec(SNPvec):
return map(remove_last_SNP, SNPvec)
def GetLogProbH(H):
""" Determine the number of occurences of each unique homologue in H, and hence calculate P[H|eps] to be
used in calculating RLs, according to Berger et al. 2014 p. 4. Return log(P[H|eps]) in base 2."""
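	# e.g. a tetraploid H with homologues (v1, v1, v2, v3) gives M = [2, 1, 1], k = 4 and a
	# multinomial coefficient of 4!/(2!*1!*1!) = 12, so the value returned is loge(12) - H.GetlogCS().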
M=[]
Vset = []
for _v in H.GetVS():
if _v not in Vset:
Vset.append(_v)
M.append(1)
else:
M[Vset.index(_v)]+=1
log_denom = 0
k = sum(M) # the ploidy level
for _M in M:
log_denom += loge(factorial(_M))
return loge(factorial(k))-log_denom-H.GetlogCS()
class BlockException(Exception):
def __init__(self, value):
super(BlockException, self).__init__(value)
self.args = (value,)
def __str__(self):
return "{}".format(':'.join(str(_arg) for _arg in self.args))
def __repr__(self):
return self.args
def Branch(H, G, SReads, rho, error, qscores = None, usecounts=True, samplename=None, geno=True):
""" Branch current haplotype H at position G.s, using G, the semi-reads for G.s and the thresholds rho (hard)
and kappa (soft). Variant error rate, i.e. error, is passed to calculate the branching probabilities. (Berger et al. 2014 p. 5)."""
	if set(G.GetGenes())==set(['.']) and geno: # If the genotype is missing, extension will be skipped!
		garbage = sys.stderr.write('WARNING: {1:s}\'s genotype is missing at position {2:d}! Phasing extension will be skipped at s={0:d} for {1:s}!\n'.format(G.GetS()+1, samplename if samplename else '', G.GetPos()))
		return [H+Haplotypes(G.GetS(), G.GetS(), 0, 0, None, None, *['-' for _homologue in H.GetVS()])] # skip extension as the genotype is missing
if len(set(G.GetGenes()))<2 and geno: # If the genotype is homozygous, the phasing will be trivially determined
garbage = sys.stderr.write('WARNING: {1:s}\'s genotype is homozygous at position {2:d}! Its phasing extension will be trivial at s={0:d}!\n'.format(G.GetS()+1, samplename if samplename else '', G.GetPos()))
return [H+Haplotypes(G.GetS(), G.GetS(), 0, 0, None, None, *G.GetGenes())]
if all(r.isNULL() for r in SReads):
#raise BlockException('No semi-reads exist at SNP position {0:d}{1:s}!\n'.format(G.GetS()+1, " for "+samplename if samplename else ''))
garbage = sys.stderr.write('WARNING: No semi-reads exist at SNP position {2:d}, s={0:d}{1:s}!\n'.format(G.GetS()+1, " for "+samplename if samplename else '', G.GetPos()))
	myrho = loge(rho) # change rho to log scale
ProbH = H.GetRL()
count_threshold = int((1./len(H.GetVS())*len([_SRead for _SRead in SReads if not _SRead.isNULL()])-2*sqrt(1./len(H.GetVS())*(1-1./len(H.GetVS()))*len([_SRead for _SRead in SReads if not _SRead.isNULL()]))))
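	# count_threshold is n/k - 2*sqrt(n*(1/k)*(1-1/k)) for n non-null semi-reads and k homologues:
	# the expected read support per homologue minus two binomial standard deviations (only used by
	# the commented-out minimum read-support filter below).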
#garbage = sys.stdout.write('Base Haplotype, S= {1:d}:\n\t{0}\n'.format('\n\t'.join(('\t'.join(_x for _x in H.GetGenotype(_pos))) for _pos in range(H.GetStart(), H.GetStop()+1)), G.GetS()))
extend_H_branched = []
extend_logprobs_branched = []
uniques, priors, logrprobs, mins = GetProbTot(H, G, SReads, error, True, qscores, usecounts)
if not uniques:
return [H+Haplotypes(G.GetS(), G.GetS(), 0, 0, None, None, *['-' for _homologue in H.GetVS()])] # skip extension if no extension has been possible
#drop_out = [_n for _n, _x in enumerate(mins) if _x < count_threshold]
drop_out = []
#if len(drop_out)==len(uniques):
# garbage = sys.stderr.write('WARNING: No extension survived the minimum read support criterion at SNP {0:d} for {1:s}! Extension will be skipped at {0:d} for {1:s}!\n'.format(G.GetS()+1, samplename))
# return [H+Haplotypes(G.GetS(), G.GetS(), 0, 0, None, None, *['-' for _homologue in H.GetVS()])] # skip extension if no homologue has the minimum number of supporting reads
#for _n in sorted(drop_out, reverse=True): # discard haplotypes that have homologues not supported by a minimum number of reads
# del uniques[_n]
# del priors[_n]
# del logrprobs[_n]
#garbage = sys.stderr.write("SNP: {}\n".format(G.GetS()))
#garbage = sys.stderr.write("\tTotal with last SNP REMOVED = {:7.19f}\n".format(total))
#garbage = sys.stderr.write("\tTotal WITH last SNP with EQUAL priors = {:7.19f}\n".format(sum(map(exp,logrprobs))/len(logrprobs)))
_norm = max(logrprobs)
logrprobs_adj = [_x + _y - _norm for _x, _y in zip(logrprobs, log(priors))] # adjust P[SR(s)|Hp, H, eps] by its prior P[Hp|H, eps]. Optionally, max(logrprobs) is subtracted to prevent underflow
#garbage = sys.stderr.write("\tTotal WITH last SNP with ACTUAL priors = {:7.19f}\n".format(sum(map(exp, logrprobs_adj))))
_norm = loge(sum(exp(_x) for _x in logrprobs_adj))
logHpprobs = [_x - _norm for _x in logrprobs_adj] # obtain p[Hp|SR(s), H, eps] by P[SR(s)|Hp, H, eps].P[Hp|H, eps]/P[SR(s)|H, eps]
for _n, Hcandid in enumerate(uniques): # remove duplicate extensions that occur due to presence of similar homologues in H
#garbage = sys.stderr.write('\tCandidate Extension:\n\t {0}\n'.format('\t'.join(str(_x) for _x in Hcandid.GetGenotype(Hcandid.GetStop()))))
#garbage = sys.stderr.write("\t prob={:7.19f}, logprob= {:7.19f}\n".format(exp(logHpprobs[_n]), logHpprobs[_n]))
if logHpprobs[_n] >= myrho: # cut the extensions with an adjusted reads-probability lower than the threshold
extend_H_branched.append(Hcandid)
extend_logprobs_branched.append(logHpprobs[_n])
#garbage = sys.stderr.write("\t Candidate Accepted!\n")
else:
#garbage = sys.stderr.write("\t Candidate Rejected by rho or kappa!\n")
pass
if not extend_H_branched:
garbage = sys.stderr.write('WARNING: No extension survived the threshold at SNP {0:d} for {1:s}!\n'.format(G.GetS()+1, samplename))
_maxindex = logHpprobs.index(max(logHpprobs))
extend_H_branched.append(uniques[_maxindex])
extend_H_branched[-1].SetRL(logHpprobs[_maxindex])
#return [H+Haplotypes(G.GetS(), G.GetS(), 0, 0, None, None, *['-' for _homologue in H.GetVS()])] # skip extension if no extension passes the branching threshold
for _H, _logprob in zip(extend_H_branched, extend_logprobs_branched): # Update the stored RL value of H during branching to\
_H.SetRL(_logprob) # Update the RL of Hp, as noted in Berger et al. 2014 p. 5.
return extend_H_branched
def BranchPop(Hpop, Gpop, SReadspop, rho, error, recombination_rate, QscoresPop=[None, None, None], GenoConstraint=True, useglobal=True):
""" Branch current pop haplotypes (Hm, Hf, Hc) at position s = Gm.s = Gf.s = Gc[1].S = ... = Gc[n].S, using the genotypes, the semi-reads of each pop member
active at s, the recombination rate at s and the threshold rho. Base error rate, i.e. error, or quality scores are passed to calculate the
branching probabilities using the reads, and recombination_rate is used to incorporate the recombination events in those probabilities."""
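	# Hpop, Gpop, SReadspop and QscoresPop are each ordered [mother, father, child_1, ..., child_n],
	# as the unpacking below shows.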
global AllGenos_m
global AllGenos_f
Hm, Hf = Hpop[0:2]
Hc = Hpop[2:] # Unpack the population candidate haplotypes to maternal, paternal and child haplotypes
Gm, Gf = Gpop[0:2] # Unpack the population candidate genotypes to maternal, paternal and child haplotypes
Gc = Gpop[2:]
SReadsm, SReadsf = SReadspop[0:2] # Unpack the reads to maternal, paternal and child haplotypes
SReadsc = SReadspop[2:]
qscoresm, qscoresf = QscoresPop[0:2] # Assign quality scores, if present, to parental and progeny reads
qscoresc = QscoresPop[2:]
qscoresc.extend([None for _i in range(0, len(Gc)-len(qscoresc))])
if all(r.isNULL() for r in SReadsm+SReadsf+reduce(lambda x, y: x+y, SReadsc)):
garbage = sys.stderr.write('WARNING: No semi-reads exist at SNP position {1:d}, s={0:d} with this pop!\n'.format(Gm.GetS()+1, Gm.GetPos()))