Commit 516ce286 authored by Motazedi's avatar Motazedi

PopPoly method for diploid & polyploid haplotyping in F1 populations

parents
#!/usr/bin/env perl
# A simple perl wrapper for PopPoly to be able to specify the contig name and maximum genotype missing rate
# for haplotyping.
# Useful when multiple contigs are present in the bam and VCF files, which causes error for haplotyping.
# This wrapper extracts the desired contig from the bam and VCF files and runs PopPoly with the passed input options.
# This script should be stored in the same folder as PopPoly main.py.
# Written by Ehsan Motazedi, Wageningen University & Research, 20-07-2018.
# Last modified: 31-07-2018
use strict;
use warnings;
use File::Temp qw/ tempfile tempdir /;
use Cwd 'abs_path';
use Scalar::Util qw(looks_like_number);
sub signal_handler{
# Die on SIGINT/SIGTERM so the END block can remove the temporary files.
# BUG FIX: Perl passes the signal NAME as the handler's first argument;
# $! holds errno and is unrelated to the signal, so the original message
# reported a meaningless (often empty) errno string instead of the signal.
my $sig = shift;
die "\nCaught a signal: SIG$sig\n";
}
# Locate the directory this wrapper lives in: PopPoly's main.py is assumed to
# reside in the same folder (see the header notes above).
my @PATH = split /\//, abs_path($0);
my $PopPoly_Path = join '/', @PATH[0..$#PATH-1];
my $hlp_msg = "\nA simple perl wrapper for PopPoly, so that the contig name and the maximum genotype missing rate\nof each SNP can be specified for haplotyping. Useful when multiple contigs are present in the BAM\nand VCF files, which are not handled directly by PopPoly. Also, useful to exclude SNPs from phasings\nbased upon their genotype missing rate in the VCF file.\n\nThis wrapper extracts the desired contig from the bam and VCF files, performs SNP filtering and runs PopPoly\nwith the passed input options. To run this script it should be stored in the same folder as PopPoly main.py.\n\nInput parameters:\n\n\tparameters passed to PopPoly (see ./main.py -h)\n\n\t-h, --help\tprint this help message and exit.\n\t-c, --contig\tname of the contig to be phased.\n\t--maxMISS\tmaximum tolerable genotype missing rate for each SNP to be included in phasing.\n\nWritten by Ehsan Motazedi, Wageningen University & Research, 20-07-2018.";
# Die on SIGINT/SIGTERM so the END block at the bottom can clean up temp files.
$SIG{INT} = \&signal_handler;
$SIG{TERM} = \&signal_handler;
my %options_set; # parsed optional arguments: option name (dashes stripped) -> value ("" for flags)
my @to_remove; # @ARGV indices of consumed optional arguments, spliced out later
# Flag options (take no value); only t/top form a short/long pair.
my @iftrue_opts = ("t","top","skip","filter","impute","redose");
# Value-taking options; the first 14 entries are 7 short/long pairs (c/contig, a/aggressive, ...).
my @toget_opts = ("c","contig","a", "aggressive", "e", "error", "r", "rho", "k", "kappa", "w", "warmup", "v", "verror", "P1", "P2", "mmq", "mbq", "maxIS", "maxMISS");
my $n=-1; # scanning index into @ARGV (pre-incremented in the parsing loop below)
my ($fh1, $contig_bam) = tempfile(TEMPLATE => "tmp_bamXXXX", DIR=>"/tmp", SUFFIX => ".bam"); # tmp file to store contig reads if contig is specified
close($fh1);
my ($fh2, $contig_vcf) = tempfile(TEMPLATE => "tmp_vcfXXXX", DIR=>"/tmp", SUFFIX => ".vcf"); # tmp file to store contig variants if contig is specified
close($fh2);
while ($n < $#ARGV){ #separate positional & optional arguments
$n+=1;
# Anything starting with "-" is treated as an option name.
# NOTE(review): a negative numeric VALUE would also match /^-/ and be rejected
# below as an invalid option -- confirm no option legitimately takes negative values.
if (grep {/^-/} $ARGV[$n]){
(my $option= $ARGV[$n]) =~ s/^-+//; # strip the leading dash(es); both -x and --x accepted
if ($option eq "h" or $option eq "help"){
print $hlp_msg."\n";
exit 0;
}
if (grep {$_ eq $option} @iftrue_opts){ # boolean flag: store with empty value
$options_set{$option} = "";
push @to_remove, $n;
} elsif (grep {$_ eq $option} @toget_opts){ # value-taking option: consume the next argument
if ($n == $#ARGV) {die "ERROR: No value passed to $ARGV[$n]!\n"}
if (grep {/^-/} $ARGV[$n+1]) {die "ERROR: Invalid value $ARGV[$n+1] passed to $ARGV[$n]!\n"}
$options_set{$option} = $ARGV[$n+1];
push @to_remove, ($n,$n+1);
$n+=1
} else {die "ERROR: Invalid option $ARGV[$n]!\n"}
}
}
# Strip the consumed optional arguments from @ARGV, highest index first so the
# remaining indices stay valid; only positional arguments are left afterwards.
splice @ARGV, $_, 1 for reverse @to_remove;
# Exactly three positional arguments are required: BAM file, VCF file, output name.
my $n_positional = scalar @ARGV;
if ($n_positional == 0){
die "ERROR: No bam file, VCF file and output name is given!\n"
} elsif ($n_positional < 2){
die "ERROR: No VCF file and output name is given!\n"
} elsif ($n_positional < 3){
die "ERROR: No output name is given!\n"
} elsif ($n_positional > 3){
die "ERROR: Too many positional arguments given!\n"
}
# Reject giving both the short and the long form of the same value-taking
# option: the first 14 entries of @toget_opts are seven short/long pairs.
for (my $i=0; $i<14; $i+=2){ # check doubly given value getting options
if (exists $options_set{$toget_opts[$i]} and exists $options_set{$toget_opts[$i+1]}) {die "ERROR: Only one of -$toget_opts[$i] and --$toget_opts[$i+1] allowed!\n"}
}
# BUG FIX: the original loop indexed @toget_opts here as well, re-checking the
# c/contig pair instead of the flag pair t/top from @iftrue_opts, so giving
# both -t and --top was never detected.
for (my $i=0; $i<2; $i+=2){ # check doubly given set true options
if (exists $options_set{$iftrue_opts[$i]} and exists $options_set{$iftrue_opts[$i+1]}) {die "ERROR: Only one of -$iftrue_opts[$i] and --$iftrue_opts[$i+1] allowed!\n"}
}
if (exists $options_set{maxMISS}){ # check the given SNP missing rate
if (not looks_like_number($options_set{maxMISS})){
die "ERROR: the SNP missing rate must be a number!\n"
} elsif ($options_set{maxMISS}>1 or $options_set{maxMISS}<0) {
die "ERROR: the SNP missing rate must be between 0 and 1 (0<=maxMISS<=1)!\n"
} else {
# Nudge the threshold up so that the later strict "<" comparison effectively
# behaves as "<=" for SNPs whose missing rate equals maxMISS exactly.
$options_set{maxMISS}+=1e-06
}
} else {
print STDERR "WARNING: No maximum missing rate is specified and hence no filtering of SNPs will be performed!\n";
# Sentinel of 2 can never be reached by a rate in [0,1]: filtering disabled.
$options_set{maxMISS}=2
}
if (exists $options_set{contig} or exists $options_set{c}){ # check if contig name is given or not
# Canonicalise -c into --contig so downstream code only needs one key.
if (not exists $options_set{contig}){
$options_set{contig}=$options_set{c};
delete $options_set{c}
}
}
(-f $ARGV[0]) or die "ERROR: could not find the bam file ${ARGV[0]}!\n";
(-f $ARGV[1]) or die "ERROR: could not find the VCF file ${ARGV[1]}!\n";
if (exists $options_set{contig}){ # extract contig reads from the original BAM file
if (! -f "$ARGV[0].bai"){ # samtools view on a region requires the BAM index
print STDERR "WARNING: Cannot find the index of the input bam file: $ARGV[0].bai. samtools index is being run...\n";
my $indx=system("samtools index $ARGV[0]");
$indx ? die "ERROR: Indexing failed!\n$!\n" : print STDERR "Successfully indexed the bam file...\n"
}
# BUG FIX: anchor the contig name at the start of the idxstats line, followed
# by a blank. The original unanchored "grep -m 1 chr1" also matched chr10,
# chr11, ... (substring match), extracting reads from the wrong contig.
my $get_reads = system("samtools idxstats $ARGV[0]|grep -m 1 '^$options_set{contig}[[:blank:]]' |awk '{print \"\\\"\"\$1\":0-\"\$2\"\\\"\"}'|xargs samtools view -bh $ARGV[0] > $contig_bam");
!$get_reads or die "Cannot extract $options_set{contig} reads from $ARGV[0]: $!\n";
$ARGV[0] = $contig_bam;
} else {
print STDERR "WARNING: No contig name has been specified! The BAM and VCF files must contain only one contig!\n"
}
if ($options_set{maxMISS}<2 or exists $options_set{contig}) { # filter VCF file if contig and/or maxMISS are given
my $ctg_found = 0; # number of records seen for the desired contig so far
open($fh2,">",$contig_vcf); # extract contig variants from the original VCF file
open(my $fh_vcf, "<", $ARGV[1]) or die "Cannot open $ARGV[1]: $!\n";
SEARCH: {
while (my $line = <$fh_vcf>){
chomp $line;
if (grep {/^#/} $line){
# Header lines are copied through unchanged.
printf $fh2 "%s\n", $line;
} else {
my $vcf_ctg = (split /\t/, $line)[0]; # CHROM is the first VCF column
if (!(exists $options_set{contig}) or $vcf_ctg eq $options_set{contig}) {
$ctg_found+=1;
my $missing = 0;
if ($options_set{maxMISS}<2) { # filter SNPs using maxMISS rate
my @genos = split /\t/, $line;
# Sample columns start at index 9; the GT subfield is the first
# colon-separated field. A genotype counts as called when it
# contains digit+separator runs (e.g. 0/1, 1|0); "./." does not match.
# NOTE(review): this regex assumes separator-delimited GT fields --
# a bare haploid call like "0" would be counted as missing; confirm.
my @non_missing_genos = grep {/([0-9](\/|\|))+/} map {(split /:/, $_)[0]} @genos[9..$#genos];
# missing rate = (#samples - #called) / #samples
$missing = ($#{genos}-9-$#{non_missing_genos})/($#{genos}+1-9);
}
# maxMISS was nudged by 1e-06 above, so "<" acts as "<=" here.
if ($missing < $options_set{maxMISS}){
printf $fh2 "%s\n", $line
}
} elsif ($ctg_found>0) { # avoid reading the rest of the VCF file after finding the desired contig
last SEARCH;
}
}
}
}
close($fh_vcf);
close($fh2);
$ARGV[1] = $contig_vcf;
}
if (exists $options_set{contig}) {delete $options_set{contig}} # not passed to PopPoly main
if (exists $options_set{maxMISS}) {delete $options_set{maxMISS}} # not passed to PopPoly main
# Re-attach the leading dash(es): single-letter options get "-", long ones "--".
my %new_options_set;
for (keys %options_set){
if (length($_)==1){
$new_options_set{"-$_"}=$options_set{$_}
} else {
$new_options_set{"--$_"}=$options_set{$_}
}
}
undef %options_set;
# Build and run the PopPoly command line: positional arguments first, then the
# options (flags alone, value options as "option value" pairs).
# NOTE(review): the command goes through the shell unquoted, so option values
# or file names containing shell metacharacters would be misinterpreted.
my $phasing = system((join " ","${PopPoly_Path}/main.py", @ARGV)." ".(join " ", map { $new_options_set{$_} eq "" ? "$_" : "$_ $new_options_set{$_}" } (keys %new_options_set)));
!$phasing or die "ERROR: failed to run PopPoly: $!\n";
END { # always remove the temporary files, on normal exit as well as on die
unlink "${contig_bam}" if (-f "${contig_bam}");
unlink "${contig_vcf}" if (-f "${contig_vcf}");
}
PopPoly, version 2.2, Ehsan Motazedi (ehsan.motazedi@gmail.com), last modified: July 31 2018
Introduction
============
**PopPoly** \(executable *main.py*\) is a tool for haplotype estimation in polyploid F1 populations. PopPoly uses all of the population reads to estimate the maximum likelihood parental haplotypes, from which offspring haplotypes are selected using the MEC criterion.
The algorithm is based on a Bayesian framework to choose the most likely parental haplotypes using sequence reads, and applies Mendelian inheritance rules, assuming no recombination between the SNPs, to weight each candidate phasing for the parents and to later choose the offspring phasings from the parental haplotypes. It can also re-assign the dosages for all or missing SNPs using the Mendelian rules and sequence reads. The name of the parents in the F1 population must be provided as input parameter, with the possibility to specify just one parent in self-fertilizing populations. Otherwise the first two samples registered in the BAM header will be counted as parents.
PopPoly is written in *Python* and is compatible with both Python 2 and Python 3. There is also a perl wrapper available *PopPoly_wrapped.pl* to directly use input files that contain multiple contigs, in which case one has to specify the name of the desired contig for haplotyping through the wrapper option. The wrapper also allows to filter SNPs based upon their genotype missing rate in the population (see PopPoly_wrapped.pl -h).
It is also possible to exclude some of the individuals from the analysis, for example if several replicates of the same sample are present under different names and one wishes to use only one of them. The space-separated names of the individuals to be excluded must be stored in the shell environment in a variable called *exclude* before running PopPoly, as follows:
export exclude="sample_1_replicate_2 sample_2_replicate_1 sample_3_replicate_2 sample_3_replicate_3"
Note that PopPoly relies on *pysam* and *numpy*, which must have been properly installed before PopPoly is called. Some users have reported problems with *numpy undefined symbol*, a discussion of which can be found [here](https://stackoverflow.com/questions/36190757/numpy-undefined-symbol-pyfpe-jbuf).
Input parameters:
=====================
For the input parameters, see the help of the software:
./main.py --help, -h
***Example:***
./main.py bamfile vcffile output_dirname --kappa 0.85 -a 0.8 --rho 0.2 --impute
Citation:
=====================
To cite PopPoly, please refer to *Family-based haplotype estimation and allele dosage correction for polyploids using short sequence reads*, Ehsan Motazedi, Chris Maliepaard, Richard Finkers and Dick de Ridder, 2018, bioRxiv 318196; *doi: https://doi.org/10.1101/318196*
Download:
=====================
The software is available at gitlab, Wageningen UR:
git clone git@git.wageningenur.nl:motaz001/PopPoly.git --recursive
Copyright and License:
=====================
Copyright (c) 2017, Ehsan Motazedi, Wageningen UR.
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files \(the "Software"\), to deal in the Software without restriction, including without limitation the rights to use, copy, share and modify the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. This permission is only valid for academic and personal use, i.e. not for commercial purposes.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
This diff is collapsed.
File added
This diff is collapsed.
File added
This diff is collapsed.
This diff is collapsed.
File added
This diff is collapsed.
File added
This diff is collapsed.
# Written by Ehsan Motazedi, Wageningen UR, 05-02-2017.
# Last Updated: 01-09-2017
import math
import sys
cpdef int _xor(x, y):
    """Return 1 when the string forms of x and y differ, else 0."""
    return 1 if str(x) != str(y) else 0
cpdef Hamming(a, b, null_lst = [".", "-"]):
    """ Hamming distance and similarity between two strings or lists or tuples.
    (Hamming, Richard W., Error detecting and error correcting codes,
    Bell System technical journal 29.2 (1950), pp. 147-160)."""
    if isinstance(a, int):
        a = [a]
    if isinstance(b, int):
        b = [b]
    if len(a) != len(b):
        raise ValueError('The length of the two inputs must be equal for comparison!')
    if isinstance(a, str):
        a = list(a)
    if isinstance(b, str):
        b = list(b)
    # Count mismatches/matches pairwise, skipping positions where either side
    # is a null symbol ('.' or '-').
    n_diff = 0
    n_same = 0
    for left, right in zip(a, b):
        if (str(left) in null_lst) or (str(right) in null_lst):
            continue
        if _xor(left, right):
            n_diff += 1
        else:
            n_same += 1
    return n_diff, n_same
cpdef diffVec(a, b, null_lst = [".","-"]):
    """ Obtain differences between two lists. Return two vectors: one specifying positions of similarity and one positions of difference."""
    if isinstance(a, int):
        a = [a]
    if isinstance(b, int):
        b = [b]
    if len(a) != len(b):
        raise ValueError('The length of the two inputs must be equal for comparison!')
    if isinstance(a, str):
        a = list(a)
    if isinstance(b, str):
        b = list(b)
    # Collect the indices of differing and of agreeing positions, skipping any
    # position where either side is a null symbol ('.' or '-').
    diff_positions = []
    same_positions = []
    for idx, (left, right) in enumerate(zip(a, b)):
        if (str(left) in null_lst) or (str(right) in null_lst):
            continue
        if _xor(left, right) == 1:
            diff_positions.append(idx)
        else:
            same_positions.append(idx)
    return diff_positions, same_positions
File added
This diff is collapsed.
# Written by Ehsan Motazedi, Wageningen UR, 06-02-2017.
# Last Updated: 21-02-2017
import sys
from logprob import diffVec, Hamming
from haplotypes import Haplotypes
from libc.math cimport log, exp
from reads import Read
cdef extern from "math.h":
float INFINITY
cpdef double loge(double x):
    """ Return log x in base e.

    BUG FIX: this module cimports log() from libc.math, and C's log() never
    raises ValueError (unlike Python's math.log): it returns -inf for 0 and
    NaN for negative input, so the original try/except was dead code and
    negative input silently produced NaN. Check the domain explicitly so that
    non-positive input warns and returns -INFINITY, as originally intended.
    """
    if x > 0:
        return log(x)
    garbage = sys.stderr.write("WARNING: math domain error\n")
    return -1*INFINITY
def veclog(vec):
    """ Logarithm function defined on vector space.

    Returns a list of element-wise logs for an iterable input, or a single
    log for scalar input.

    BUG FIX: the original returned map(loge, vec), which under Python 3 is a
    lazy iterator rather than the list Python 2 produced, breaking callers
    that do arithmetic on the result. A list comprehension restores the eager
    Python-2 semantics on both interpreters; scalar input still raises
    TypeError on iteration and falls back to loge(vec).
    """
    try:
        return [loge(_v) for _v in vec]
    except TypeError:
        return loge(vec)
def GetlogProb(r, Vset, eps, qual = None, counts = False, min_r_length=2):
    """ Return the log probability of a read conditional on a presumed haplotype and
    variant error rate, i.e. P(r|Vset, eps) (Berger et al. 2014, p. 4). If counts is True,
    assign the read to the most compatible homologue (count 1 for that homologue and zero of
    the others) to later calculate the number of reads compatible with each homologue for a set
    of reads by the upstream functions."""
    ar = r.GetalleleSet(Vset.GetStart(), Vset.GetStop())
    # Too few called alleles overlap the variant set: the read is uninformative.
    if len([_allele for _allele in ar if _allele not in set(['-','.'])])<min_r_length:
        if counts:
            return [0, [0 for _h in range(0, len(Vset.GetVS()))]]
        return 0 # Corresponding to P[r|Vset, eps] = 1
    sr='-'*(r.GetBegin()-Vset.GetStart())+ar+'-'*(Vset.GetStop()-r.GetEnd()) # Make the lengths equal to calculate the distance
    #warn, veclogprobs = False, []
    warn, probs = False, []
    if qual:
        try:
            # Convert the Phred score of each called position into an error
            # probability; uncalled positions get '-' placeholders so that er
            # stays position-aligned with sr.
            er = []
            escores = []
            for _pos in r.GetPos():
                escores.append(10**(-float(qual[_pos])/10))
            _n = 0
            for _x in ar:
                if _x=='-':
                    er.append('-')
                else:
                    er.append(escores[_n])
                    _n+=1
            er1 = []
            er2 = []
            for _x in range(0, r.GetBegin()-Vset.GetStart()):
                er1.append('-')
            for _x in range(0, Vset.GetStop()-r.GetEnd()):
                er2.append('-')
            er = er1 + er
            er = er + er2
        except ValueError as e:
            e.args=(e.args[0]+"\nInvalid quality scores detected in the alignment file!",)+e.args[1:]
            raise
        except KeyError as e:
            # Some position lacks a quality score: fall back on the fixed eps
            # for the whole read and warn about it later.
            warn = True
            qual = None
    #garbage=sys.stderr.write("******\nER:\n "+str(er)+"\n")
    #garbage=sys.stderr.write("\nSR:\n "+str(sr)+"\n")
    if not qual:
        for _homolo in Vset.GetVS(): # _homolo corresponds to v in Vset in Berger et al. 2014 p. 4
            D, A = Hamming(sr, _homolo) # D corresponds to D(r,v) and A correspond to A(r,v) in Berger et al. 2014 p. 4
            #veclogprobs.append(A*veclog((1-eps)/(1-2./3*eps))+D*veclog(eps/3.))
            probs.append(((1-eps)/(1-2./3*eps))**A*(eps/3.)**D) # in case of error, any of the three other bases could have been called. So the probability for a specific erroneous base-call is eps./3. As the probability of no error is 1-eps, we have to normalize by 1-eps+eps/3 to get probs for dichotomy no-error/specific erroneous base-call.
    else: # Use eps(s) as a function of SNP position in the read instead of a fixed eps.
        for _homolo in Vset.GetVS():
            D, A = diffVec(sr, _homolo)
            #garbage=sys.stderr.write("\nD:\n "+str(D)+"\n")
            #garbage=sys.stderr.write("\nA:\n "+str(A)+"\n")
            #_veclogprob = 0
            _prob = 1
            for _s in A:
                #_veclogprob+=loge((1-er[_s])/(1-2./3*er[_s]))
                _prob*=(1-er[_s])/(1-2./3*er[_s])
            for _s in D:
                #_veclogprob+=loge(er[_s]/3.)
                _prob*=er[_s]/3.
            #veclogprobs.append(_veclogprob)
            probs.append(_prob)
    #garbage=sys.stderr.write("******\n")
    if warn:
        garbage = sys.stderr.write("WARNING: Quality scores specified for none or only some positions of the read! The fixed given or deafult error rate was used for all the positions!\n")
    # Split the assignment credit equally among identical homologues (1/j each
    # for j identical copies).
    unique_homolos = set(Vset.GetVS())
    dict_homolos = {_dh:1./len([_h for _h in Vset.GetVS() if _h == _dh]) for _dh in unique_homolos} # the counts are set equally to 1/j for j similar homologues
    h_counts=[0 for _h in Vset.GetVS()]
    for _k, _h in enumerate(Vset.GetVS()):
        #if _h==Vset.GetVS()[veclogprobs.index(min(veclogprobs))]:
        # NOTE(review): min(probs) selects the homologue with the LOWEST
        # probability, while the docstring says the read is assigned to the
        # MOST compatible homologue (which would be max(probs)) -- confirm
        # whether min is intended here.
        if _h==Vset.GetVS()[probs.index(min(probs))]:
            h_counts[_k]=dict_homolos[_h]
    if counts:
        #print([loge(sum(probs)/float(len(Vset.GetVS()))), h_counts])
        return [loge(sum(probs)/float(len(Vset.GetVS()))), h_counts] # corresponds to log(P[r|Vset, eps]) = log(1/k * sum(P(r|v in Vset))) in Berger et al. 2014 p. 4, assign the read to the most compatible homologue if counts of the reads assigned to each homologue are needed.
    #return [loge(sum(map(exp,veclogprobs)))-loge(len(Vset.GetVS())), h_counts]
    return loge(sum(probs)/float(len(Vset.GetVS()))) # corresponds to log(P[r|Vset, eps]) = log(1/k * sum(P(r|v in Vset))) in Berger et al. 2014 p. 4
    #return veclog(sum(map(exp,veclogprobs)))-veclog(len(Vset.GetVS()))
File added
This diff is collapsed.
# Written by Ehsan Motazedi, Wageningen UR, 27-07-2016.
# Last Updated: 22-12-2017
import ast
import networkx as nx
def remove_last_SNP(r):
    """ Remove the last SNP from a read."""
    snp_dict = r.GetDict()
    try:
        del snp_dict[max(snp_dict.keys())]
    except ValueError:
        # max() on an empty dict raises ValueError: nothing to remove.
        pass
    return Read(snp_dict)
def Extract_Components(fragmentlist):
    """ Make the connected Read_Graphs from the list of Read objects, so that each connected Read_Graph is phased separately."""
    Read_Graph = nx.Graph() # The SNP-fragment graph to be grown from the input fragment file
    _fragdict, _edges, _ls = (dict(), [], []) # Dictionary and lists to add each fragment to the SNP-fragment graph
    for _frag in fragmentlist: # Add the fragments to the graph nodes, and add an edge between every two variants that lie on the same fragment
        _fragdict = _frag.GetDict() # Convert each fragment Read object to a conventional dictionary
        Read_Graph.add_nodes_from(_fragdict.keys()) # Adding the variant positions as nodes to the graph
        # Cartesian product of the fragment's positions with itself; self-pairs
        # are filtered out on the next line before the edges are added.
        _ls = list((x,y) for x in _fragdict.keys() for y in _fragdict.keys()) # Considering an edge between each two variants within the same fragment
        _edges = list(edge for edge in _ls if edge[0]!=edge[1]) # Adding the edges to the graph, avoiding self-loops
        Read_Graph.add_edges_from(_edges)
        _edges, _ls, _fragdict = (None, None, None)
    # Each connected component is an independently phasable set of positions;
    # sort the components by their smallest position for deterministic output.
    return sorted(nx.connected_components(Read_Graph), key=lambda x: min(x))
def getReads(fragfile, type=0):
    """Extract the reads from a HapTree fragment file containing a dictionary for each (paired-end) fragment (type=0),\n
    as produced by 'fragmentpoly.py' or a HapCut (Bansal & Bafina 2008) format file containing a tab-delimited row for each\n
    fragment (type=1). Return the reads as Read objects in a list."""
    Rout=[]
    # BUG FIX: the 'U' (universal newlines) open mode was deprecated since
    # Python 3.4 and removed in Python 3.11; plain 'r' gives universal-newline
    # handling on Python 3 and is accepted by Python 2 as well.
    with open(fragfile, 'r') as ffile:
        if type==0:
            for _strdict in ffile:
                _dict = ast.literal_eval(_strdict)
                if len(_dict)>1:
                    Rout.append(Read(_dict)) # Add the read if it has more than one variant
        else:
            for _rec in ffile:
                _reclst = _rec.split()
                del _reclst[1], _reclst[-1] # get rid of the fragment name and quality scores
                # Even-index fields (after the deletions) hold allele strings,
                # odd-index fields the 1-based start positions of the subreads.
                if len(''.join(_reclst[_x] for _x in range(2, len(_reclst), 2)))>1:
                    Rout.append(Read([int(_reclst[_x])-1 for _x in range(1, len(_reclst)-1, 2)], # convert 1, 2, ... numbering in HapCut to 0, 1, ...
                                     [_reclst[_x] for _x in range(2, len(_reclst), 2)]))
    return Rout
class Read(object):
    """ A (paired-end) sequence read over SNP positions.

    Internally the read is stored as a dense allele list covering every
    position from the first to the last SNP of the read, with '-' marking the
    positions the read does not call.
    """
    def __init__(self, *args):
        """ Two ways to construct a read object:\n 1) Give a fragment dictionary as input.\n 2) Give a vector input:\n
        [start positions (single int or list/tuple: first part, second part(maybe None)),\n
        alleles (single string or list/tuple: first part, second part (maybe None))].\nNo input corresponds to an empty dictionary."""
        if len(args)==0:
            args = [dict()]
        if len(args)==1:
            # Dictionary form: {position: allele}.
            if not isinstance(args[0], dict):
                raise TypeError("The single input must be a dictionary!")
            rdict = args[0]
            self.pos = sorted((int(_x) for _x in rdict.keys()), key=lambda p: int(p))
            self.alleles = []
        else:
            # Vector form: (start(s), allele string(s)) for one or two subreads.
            self.start, self.alleles = args
            if (not isinstance(self.start, int)) and (not all(isinstance(_start, int) for _start in self.start)):
                raise TypeError("The given start positions of the (sub)reads must be integers!")
            self.pos = []
            if isinstance(self.start, int):
                self.start = [self.start,]
            if isinstance(self.alleles, str):
                self.alleles = [self.alleles,]
            for _n, _start in enumerate(self.start):
                self.pos+=range(_start, _start+len(self.alleles[_n]))
            rdict = dict(zip(self.pos, list(''.join(self.alleles))))
            self.alleles=[]
        # self.start/self.end default to 0/-1 for a NULL (empty) read.
        try:
            self.start = self.pos[0]
        except IndexError: # This exception occurs if the position list, self.pos, is empty
            self.start = 0
        try:
            self.end = self.pos[-1]
        except IndexError:
            self.end = -1
        # Fill the allele list from start to end, inserting '-' at the
        # positions not covered by the read; self.pos is consumed head-first
        # while filling and rebuilt as the dense range afterwards.
        for _pos in range(self.start, self.end+1):
            if _pos==self.pos[0]:
                self.alleles.append(rdict[_pos])
                del(self.pos[0])
            else:
                self.alleles.append('-')
        self.pos = list(range(self.start, self.end+1))
    def __eq__(self, other):
        # BUG FIX: the original compared against other.allele, a non-existent
        # attribute, so every equality test raised AttributeError.
        return isinstance(other, Read) and self.alleles == other.alleles and self.pos == other.pos
    def __repr__(self):
        # The repr is the dictionary of called positions, e.g. "{14: '1'}".
        return str(self.GetDict())
    def __str__(self):
        if self.pos==[]:
            return 'None'
        return "begin={0}, end={1}, alleles={2}".format(self.start, self.end, ''.join(str(_x) for _x in self.alleles))
    def __len__(self):
        # Length counts only the called alleles, not '-'/'.' gaps.
        return len([_allele for _allele in self.alleles if _allele not in set(['.','-'])])
    def isNULL(self):
        """ Return True when the read calls no position at all."""
        return self.GetDict()=={}
    def GetalleleSet(self, start, stop):
        """ Return the allele string covering [start, stop], clipped to the read's own span."""
        if self.pos==[]:
            return ''
        return ''.join(str(_x) for _x in self.alleles[min(self.end-self.start+1, max(0,start-self.start)):max(0, min(stop+1-self.start, self.end-self.start+1))])
    def GetBegin(self):
        """ Return the first called position, or the string 'None' for a NULL read."""
        if self.pos==[]:
            return 'None'
        return self.start
    def GetEnd(self):
        """ Return the last called position, or the string 'None' for a NULL read."""
        if self.pos==[]:
            return 'None'
        return self.end
    def GetDict(self):
        """ Return the read as a {position: allele} dictionary of the called positions."""
        rdict = dict()
        for _pos, _allele in zip(self.pos, self.alleles):
            if _allele!='-':
                rdict[_pos] = _allele
        return rdict
    def GetPos(self):
        """ Return the sorted list of called positions."""
        return sorted(_pos for _pos in (self.GetDict()).keys())
def SemiRead(R, s):
    """ Make a semi-read relative to the SNP position 's' from a read 'R' (Berger et al. 2014 p. 4)."""
    if not isinstance(R, Read):
        raise TypeError("The argument to SemiRead must be a Read object!")
    d = R.GetDict()
    # A semi-read exists only if the read actually calls position s.
    if s not in d.keys():
        return Read(dict())
    rdict = dict()
    # BUG FIX: dict.iteritems() exists only in Python 2, while this project is
    # advertised as Python 2/3 compatible; items() works on both.
    for _s, _allele in d.items():
        if _s<s+1: # keep only the positions up to and including s
            rdict[_s]=str(_allele)
    # A semi-read with fewer than two variants is useless for phasing.
    if len(rdict)<2:
        return Read(dict())
    else:
        return Read(rdict)
def SemiRead_old(R, s):
    """ Make a semi-read relative to the SNP position 's' from a read 'R' (Berger et al. 2014 p. 4)."""
    if not isinstance(R, Read):
        raise TypeError("The argument to SemiRead must be a Read object!")
    if s<R.GetBegin()+1 or R.GetPos()==[]:
        return Read(dict())
    # Collect the called alleles from the read's start up to position s.
    trimmed = dict()
    for offset, allele in enumerate(R.GetalleleSet(R.start, s)):
        if allele != '-':
            trimmed[R.start+offset] = allele
    # The semi-read must call s itself and at least two variants overall.
    if s in trimmed.keys() and len(trimmed) >= 2:
        return Read(trimmed)
    return Read(dict())
def SubReads(Rset, Sset, min_subread_length=2):
    """Obtain the sub-reads of a read set, i.e. those reads used
    for phasing of Sset and only those read positions relevant to phasing Sset
    (Berger et al. 2014 p. 4)."""
    if not all(isinstance(r, Read) for r in Rset):
        raise TypeError("The argument to SubReads must be a list of Read objects!")
    subreads = []
    for read in Rset:
        remaining = set(_s for _s in Sset)
        kept = dict()
        # Keep only the read positions that belong to the target SNP set.
        for position, allele in read.GetDict().items():
            if position in remaining:
                remaining.remove(position)
                kept[position] = allele
        # Sub-reads that are too short to help phasing become NULL reads,
        # preserving the one-to-one correspondence with Rset.
        if len(kept) >= min_subread_length:
            subreads.append(Read(kept))
        else:
            subreads.append(Read(dict()))
    return subreads
#diclst=[{1:1, 2:1, 3:1, 4:1}, {3:1, 4:1, 5:0, 6:0}, {4:0, 5:1, 6:1},
#{4:0, 5:1,6:1, 7:0}, {5:0, 6:0, 7:1}, {5:1, 6:1, 7:0}]
#a=Read({14:1, 15:1, 16:0, 21:0, 22:1})
#b=Read({14:1, 15:1, 16:0})
#Read(ast.literal_eval(repr(a)))
#str(a)
#str(b)
#a.GetalleleSet(16,22)
#c=Read(dict())
#nlst=map(lambda rdict: Read(rdict), diclst)
#for n in range(1,8):
# for r in nlst:
# print("{0}->{1}".format(n, repr(SemiRead(r, n))))
#SubReads(nlst, {1,2,3,4,5})
#a='1 HWI-ST163_0386:6:27:3566:6857#P7PEM12p 14 01010 ;:;78'.split()
#b='2 HWI-ST163_0386:6:6:10650:188444#P7PEM12p_MP 5 0100 17 000 ;;7<779'.split()
#del a[1], b[1], a[-1], b[-1]
#Read(a[0], int(a[1]), a[2])
#Read(b[0], (int(b[1]), int(b[3])), (b[2], b[4]))
#Read(b[0], (b[1], b[3]), (b[2], b[4]))
File added
#!/usr/bin/env python
import numpy
from distutils.core import setup
from Cython.Build import cythonize
#from distutils.extension import Extension
#from Cython.Distutils import build_ext
#ext_modules = [
# Extension("c_haplotypes", ["c_haplotypes.pyx"]),
# Extension("c_reads", ["c_reads.pyx"]),
# Extension("c_branchprune", ["c_branchprune.pyx"]),
# Extension("HapTreePop", ["HapTreePop.pyx"])
#]
#setup(
#name = 'TriPoly',
# ext_modules = ext_modules,
# cmdclass = {'build_ext': build_ext}
#)
setup(
    #name = 'TriPoly',
    # Compile the Cython source to a C extension; the numpy include path is
    # supplied because the generated C uses numpy's headers.
    ext_modules = cythonize("logprob2.pyx"),
    include_dirs=[numpy.get_include()]
)
#python setup.py build_ext --inplace
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment