vcfmerger.py 37 KB
Newer Older
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#!/usr/bin/python
"""

"""
import os, sys
import gzip
import datetime
import time
import pprint
import math
import copy
from pprint import pprint as pp

sys.path.insert(0, '.')
from filemanager import getFh,openvcffile,openfile,checkfile,makeIndexFile,readIndex

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
17
18
#ulimit -n 4096

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#try:
#    sys.path.insert(0, 'aux/pypy-2.0/pypy-2.0-beta2/pypy-pypy-4b60269153b5/')
#    from rpython.rlib.jit import JitDriver, purefunction
#except:
#    print "no pypy"


#./vcfmerger.py short.lst
#./vcfsimplify.py short.lst.vcf.gz
#./pypy ./vcffiltergff.py --input=short2.lst.vcf.gz.simplified.vcf.gz -g ITAG2.3_gene_models.gff3.exon.gff3
#./pypy ./vcffiltergff.py --input=short2.lst.vcf.gz.simplified.vcf.gz -g ITAG2.3_gene_models.gff3.exon.gff3 --protein S_lycopersicum_scaffolds.2.40.fa

#./pypy ./vcfconcat.py short2.lst.vcf.gz.simplified.vcf.gz.filtered.vcf.gz
#./pypy ./alnmerger.py short2.lst.vcf.gz.simplified.vcf.gz.filtered.vcf.gz.aln *ch??.aln
#for aln in *.aln; do echo $aln; clustalw -TREE -QUICKTREE -INFILE=$aln &\ ; done


#./pypy ./vcfconcat.py --fasta -i short2.lst.vcf.gz.simplified.vcf.gz.filtered.vcf.gz --ignore SL2.40ch00
#rm short2.lst.vcf.gz.simplified.vcf.gz.filtered.vcf.gz.fa
#./pypy alnmerger.py short2.lst.vcf.gz.simplified.vcf.gz.filtered.vcf.gz.fa *ch??.fasta
#for fa in *ch??.fasta; do echo $fa;  bash runFastTree.sh $fa; done
#bash runFastTree.sh short2.lst.vcf.gz.simplified.vcf.gz.filtered.vcf.gz.fa

#./FastTreeMP -fastest -gtr -gamma -nt -bionj -boot 100 short2.lst.vcf.gz.simplified.vcf.gz.SL2.40ch04_42945354-42947959.fasta | tee short2.lst.vcf.gz.simplified.vcf.gz.SL2.40ch04_42945354-42947959.fasta.tree
#./FastTreeMP -fastest -gtr -gamma -nt -bionj -boot 100 short2.lst.vcf.gz.simplified.vcf.gz.SL2.40ch04.fasta | tee short2.lst.vcf.gz.simplified.vcf.gz.SL2.40ch04.fasta.tree 2>&1 | tee short2.lst.vcf.gz.simplified.vcf.gz.SL2.40ch04.fasta.tree.log

#./pypy alnmerger.py short2.lst.vcf.gz.simplified.vcf.gz.fasta *ch??.fasta


#./vcfstats.py short2.lst.vcf.gz
#./vcftree.py short2.lst.vcf.gz.report.csv
#sumtrees.py --to-newick --no-meta-comments --no-summary-metadata --output short2.lst.vcf.gz.gt1.vcf.gz.simplified.vcf.gz.report.csv.nj *.nj



#grep -P "\tgene\t" ITAG2.3_gene_models.gff3 > ITAG2.3_gene_models.gff3.gene.gff3
#grep -P "\texon\t" ITAG2.3_gene_models.gff3 > ITAG2.3_gene_models.gff3.exon.gff3

#python
#{'count total': 1307102,
# 'count unique': 1205820,
# 'ela': datetime.timedelta(0, 94, 947935),
# 'ela_str': '0:01:34.947935',
# 'end time': '2013-04-24T18:59:54.507488',
# 'first': 280,
# 'last': 21805703,
# 'snp_per_kb_t': 59.943125887755144,
# 'snp_per_kb_u': 55.29837767670228,
# 'speed_t': 13766.513194836729,
# 'speed_u': 12699.802265315197,
# 'start time': '2013-04-24T18:58:19.559448'}
#PYPY purefunction
#{'count total': 1307102,
# 'count unique': 1205820,
# 'ela': datetime.timedelta(0, 31, 721854),
# 'ela_str': '0:00:31.721854',
# 'end time': '2013-04-24T19:01:22.663991',
# 'first': 280,
# 'last': 21805703,
# 'snp_per_kb_t': 59.943125887755144,
# 'snp_per_kb_u': 55.29837767670228,
# 'speed_t': 41205.09475896333,
# 'speed_u': 38012.28011452294,
# 'start time': '2013-04-24T19:00:50.942005'}
#PYPY vanilla
#{'count total': 1307102,
# 'count unique': 1205820,
# 'ela': datetime.timedelta(0, 32, 109961),
# 'ela_str': '0:00:32.109961',
# 'end time': '2013-04-24T19:03:42.957995',
# 'first': 280,
# 'last': 21805703,
# 'snp_per_kb_t': 59.943125887755144,
# 'snp_per_kb_u': 55.29837767670228,
# 'speed_t': 40707.056604646765,
# 'speed_u': 37552.832904406205,
# 'start time': '2013-04-24T19:03:10.847895'}



FHDOPEN       = 0
FHDJUSTCLOSED = 1
FHDCLOSED     = 2

"""
Constants
"""
SIMP_NO_SIMPLIFICATION =    0 # no simplification
SIMP_SNP               = 2**1 # simplify SNPs
SIMP_EXCL_HETEROZYGOUS = 2**2 # exclude heterozygous
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
109
110
111
SIMP_EXCL_HOMOZYGOUS   = 2**3 # exclude homozygous
SIMP_EXCL_INDEL        = 2**4 # exclude indels
SIMP_EXCL_SINGLETON    = 2**5 # exclude singletons
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148


def getBits(val):
    """
    Split the bits from the configuration
    """
    #print "getting bits from %3d" % val
    #print "size of value     %3d" % sys.getsizeof(val)
    try:
        sizeofval = sys.getsizeof(val)
    except: # pypy
        sizeofval = 24

    res = []
    for offset in range(sizeofval * 8):
        mask = 1 << offset
        bitv = val & mask

        #print "offset %3d" % offset
        #print "mask   %3d" % mask
        #print "bit    %3d" % bitv
        #print

        if bitv > 0: res.append( True  )
        else       : res.append( False )

    return res



class vcfResult(object):
    """
    Main class controlling the merging and filtering of vcf files
    """
    simplifybits     = None
    simplifySNP      = None
    excludeHET       = None
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
149
    excludeHOMO      = None
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
150
151
152
153
    excludeINDEL     = None
    excludeSingleton = None
    noncarefiles     = None
    simpliStats      = None
154
155
156
157
158
    prints           =    0
    printsReal       =    0
    printsRealLast   =    0
    print_every      = 1000
    linelen          =  100
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
159

160
    def __init__(self, simplify=SIMP_NO_SIMPLIFICATION, noncarefiles=[], translation=None ):
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
161
162
163
164
165
166
        self.result       = None
        self.chom         = None
        self.pos          = None
        self.chrCount     = None
        self.posCount     = None
        self.simplify     = simplify
167
        self.translation  = translation
168

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
169
170
171
172
        if vcfResult.simpliStats is None:
            vcfResult.simpliStats = {
                'Heterozygous Dest' : 0,
                'Heterozygous Indel': 0,
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
173
                'Homozygous'        : 0,
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
174
175
176
177
178
179
180
181
182
183
184
185
186
187
                'Source Indel'      : 0,
                'Singleton 1'       : 0,
                'Singleton 2'       : 0,
                'Singleton 3'       : 0,
                'Ok'                : 0,
            }


        if vcfResult.simplifybits is None:
            #print "simplifying"
            vcfResult.simplifybits     = getBits(simplify)
            #print "simplifying :: simplify bits  ", self.simplifybits
            vcfResult.simplifySNP      = vcfResult.simplifybits[ int( math.log( SIMP_SNP               , 2 ) ) ] # simplify SNP
            vcfResult.excludHET        = vcfResult.simplifybits[ int( math.log( SIMP_EXCL_HETEROZYGOUS , 2 ) ) ] # exclude heterozygous
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
188
            vcfResult.excludHOMO       = vcfResult.simplifybits[ int( math.log( SIMP_EXCL_HOMOZYGOUS   , 2 ) ) ] # exclude homozygous
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
189
190
            vcfResult.excludeINDEL     = vcfResult.simplifybits[ int( math.log( SIMP_EXCL_INDEL        , 2 ) ) ] # exclude indels (len > 1)
            vcfResult.excludeSingleton = vcfResult.simplifybits[ int( math.log( SIMP_EXCL_SINGLETON    , 2 ) ) ] # exclude single species SNPs
191

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
192
193
            print "simplifying :: SNP          [%3d, %3d] %s" % ( SIMP_SNP              , math.log(SIMP_SNP               , 2 ), str(vcfResult.simplifySNP     ) )
            print "simplifying :: Heterozygous [%3d, %3d] %s" % ( SIMP_EXCL_HETEROZYGOUS, math.log(SIMP_EXCL_HETEROZYGOUS , 2 ), str(vcfResult.excludHET       ) )
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
194
            print "simplifying :: Homozygous   [%3d, %3d] %s" % ( SIMP_EXCL_HOMOZYGOUS  , math.log(SIMP_EXCL_HOMOZYGOUS   , 2 ), str(vcfResult.excludHOMO      ) )
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
195
196
            print "simplifying :: Indel        [%3d, %3d] %s" % ( SIMP_EXCL_INDEL       , math.log(SIMP_EXCL_INDEL        , 2 ), str(vcfResult.excludeINDEL    ) )
            print "simplifying :: Singleton    [%3d, %3d] %s" % ( SIMP_EXCL_SINGLETON   , math.log(SIMP_EXCL_SINGLETON    , 2 ), str(vcfResult.excludeSingleton) )
197
198
199
200

            print 'Progress Legend:'
            print ' print every       :', vcfResult.print_every
            print ' Heterozygous Dest : h'
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
201
            print ' Homozygous        : o'
202
203
204
205
206
207
208
209
            print ' Heterozygous Indel: I'
            print ' Homozygous Indel  : i'
            print ' Source Indel      : s'
            print ' Singleton 1       : 1'
            print ' Singleton 2       : 2'
            print ' Singleton 3       : 3'
            print ' Ok                : .'

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
            #sys.exit(0)

        #TODO: IMPLEMENT
        if vcfResult.noncarefiles is None:
            vcfResult.noncarefiles = noncarefiles

    def simplifier(self, srcs):
        """
        Reads each register and simplifies it
        """
        #if len(sourc) > 1:
        #todo: if len(src) != len(tgt)
        #{'G': {'A': ['S neorickii (056)']},  'GTAT': {'GTATATACCTATCTTTTCTTTCTAT': ['Moneymaker (001)', 'S cheesemaniae (053)']}}
        #{'G': {'A': ['S neorickii (056)']},  'GTAT': {'GTATATACCTATCTTTTCTTTCTAT': ['Moneymaker (001)', 'S cheesemaniae (053)']}}

        #SL2.40ch00      23317   .       T       TC      .       PASS    NV=3;NW=2;NS=13;NT=10;NU=9      FI      S chilense (064),S chilense (065),S corneliomulleri lycopersicon so (025),S huaylasense (062),S huaylasense (063),S pennellii (073),S peruvianum (060),S pimpinellifolium (049),Unknown (075)
        #SL2.40ch00      23317   .       TAAAAAA TAAAAAAA        .       PASS    NV=3;NW=1;NS=13;NT=3;NU=3       FI      S cheesemaniae (054),S pimpinellifolium (044),S pimpinellifolium (045)


        if len(srcs) == 0: return srcs
        simpl = {}
        for src in srcs:
            dsts = srcs[ src ]
            if src not in simpl: simpl[src] = {}
            simplsrc = simpl[src]

            for dst in dsts:
                files = dsts[ dst ]

239
240
                if dst.find('|') != -1:
                    for dstl in dst.split('|'):
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
                        if dstl not in simplsrc: simplsrc[dstl] = []
                        simplsrc[dstl].extend(files)
                else:
                    if dst not in simplsrc: simplsrc[dst] = []
                    simplsrc[dst].extend(files)

        for src in simpl:
            for dst in simpl[src]:
                simpl[src][dst] = sorted(list(set(simpl[src][dst])))

        #print
        #print "SIMPLIFIER :: ORIGINAL   :", pprint.pformat(srcs ).replace("\n", " ")
        #print "SIMPLIFIER :: SIMPLIFIED :", pprint.pformat(simpl).replace("\n", " ")

        return simpl
        #return srcs

258
    def printprogress(self, msg, key=None, skip=0):
259
260
261
262
263
264
265
266
267
268
269
270
271
272
        vcfResult.prints += 1

        if skip != 0:
            if key is None:
                if vcfResult.prints % skip == 0:
                    sys.stderr.write(msg)
                    vcfResult.printsReal += 1

            else:
                if vcfResult.simpliStats[key] % skip == 0:
                    sys.stderr.write(msg)
                    vcfResult.printsReal += 1
        else:
            sys.stderr.write(msg)
273
            vcfResult.printsReal += 1
274

275
276
        if vcfResult.printsReal % vcfResult.linelen == 0 and vcfResult.printsReal != vcfResult.printsRealLast:
            vcfResult.printsRealLast = vcfResult.printsReal
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
277
            sys.stderr.write('\nLast {:14,d}\n'.format( vcfResult.prints ) )
278
279
280

        sys.stderr.flush()

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
    def __str__(self):
        #SL2.40ch01      2118853 .       T       G       222     .       DP=40;AF1=1;CI95=1,1;DP4=0,0,16,23;MQ=60;FQ=-144        GT:PL:DP:GQ     1/1:255,117,0:39:99

        restr = ""
        if self.result is None:
            print "current result is none"
            return restr


        #print "exporting chr %s pos %d with %d results" % ( self.chr, self.pos, len( self.result ) )
        #outFileHandle << val.chr << "\t" << val.pos;
        #
        #for ( int fileNum = 0; fileNum < numFiles; ++fileNum ) {
        #    //outFileHandle << " cov " << val.result[fileNum].cov << " chr " << val.result[fileNum].chr << " pos " << val.result[fileNum].pos;
        #    outFileHandle << "\t" << val.result[fileNum].cov;
        #    if ( val.result[fileNum].cov > 0 ) {
        #        statistics.add( val.result[fileNum].cov );
        #    }
        #}
        #outFileHandle << "\n";


        srcs   = {}
        rcount = 0
        #print "RESULTS", self.result
        for register in self.result:
            rcount += 1
            if register is None:
                print "register #%d is empty" % register
                sys.exit( 1 )
311

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
312
313
            #print register

314
315
316
            #if rcount % 100 == 0:
            #    sys.stderr.write("\n")
            #    sys.stderr.flush()
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336

            chrom    = register['chrom'   ]
            posit    = register['pos'     ]
            sourc    = register['src'     ]
            desti    = register['dst'     ]
            descr    = register['desc'    ]
            state    = register['state'   ]
            filedesc = register['filedesc']

            if chrom != self.chrom:
                print "wrong chromosome %s vs %s" % ( chrom, self.chrom )
                sys.exit( 1 )

            if posit != self.pos:
                print "wrong position %d vs %d" % ( posit , self.pos )
                sys.exit( 1 )


            if ( vcfResult.excludHET    ) and ( desti.find(',') != -1 ):
                vcfResult.simpliStats['Heterozygous Dest'] += 1
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
337
                print "heterozygous: %s" % desti
338
                self.printprogress('h', key='Heterozygous Dest', skip=vcfResult.print_every)
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
339
340
                #return ""
                continue
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
341
342
343
            
            if ( vcfResult.excludHOMO   ) and ( set(sourc.split(',')) == set(desti.split(',')) ):
                vcfResult.simpliStats['Homozygous'] += 1
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
344
                print "homozygous  : %s" % desti
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
345
346
347
                self.printprogress('o', key='Homozygous', skip=vcfResult.print_every)
                #return ""
                continue
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
348
349

            if ( vcfResult.excludeINDEL ):
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
350
                if desti.find(',') != -1: # is heterozygous
351
                    destis = desti.split('|')
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
352
353
354
355
                    for alt in destis:
                        if len(alt) > 1:
                            vcfResult.simpliStats['Heterozygous Indel'] += 1
                            #print "has het indel: %s > %s" % ( desti, alt )
356
                            self.printprogress('I', key='Heterozygous Indel', skip=vcfResult.print_every)
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
357
358
359
360
361
362
363
                            #return ""
                            continue

                else: # homozygous
                    if ( len( desti ) > 1 ):
                        vcfResult.simpliStats['Homozygous Indel'] += 1
                        #print "has hom indel: %s" % desti
364
                        self.printprogress('i', key='Homozygous Indel', skip=vcfResult.print_every)
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
365
366
367
368
369
370
371
                        #return ""
                        continue

                if len(sourc) > 1:
                    vcfResult.simpliStats['Source Indel'] += 1
                    # todo: if len(src) != len(tgt)
                    #print "not single nucleotide source: %s" % str(register)
372
                    self.printprogress('s', key='Source Indel', skip=vcfResult.print_every)
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
373
374
375
376
377
378
379
380
381
382
383
384
385
386
                    #return ""
                    continue



            if sourc not in srcs:
                srcs[ sourc ]          = {}
            dsts = srcs[ sourc ]

            if desti not in dsts:
                dsts[ desti ] = []
            files = dsts[ desti ]

            if vcfResult.simplifySNP:
387
                files.extend( descr.split('|') )
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410

            else:
                if filedesc in files:
                    print "source already present"
                    sys.exit(1)

                files.append( filedesc )

        if len(srcs) == 0: return ""

        if vcfResult.simplifySNP:
            srcs = self.simplifier(srcs)

        nv    = 0
        ns    = 0



        allspps = []
        for src in srcs:
            for dst in srcs[src]:
                nv += 1
                allspps.extend( srcs[src][dst] )
411

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
412
413
414
415
416
        ns = len( set(allspps) )

        if (vcfResult.excludeSingleton) and (ns == 1):
            vcfResult.simpliStats['Singleton 1'] += 1
            #print "singleton ns: %s" % str(srcs)
417
            self.printprogress('1', key='Singleton 1', skip=vcfResult.print_every)
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
418
419
420
421
422
423
424
425
426
            return ""

        for src in sorted( srcs ):
            dsts = srcs[ src ]
            nt   = 0

            ntspps = []
            for dst in dsts:
                ntspps.extend( dsts[ dst ] )
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
427

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
428
429
430
431
432
433
434
435
436
437
438
            nt = len(set(ntspps))

            nw = len(dsts)

            for dst in sorted( dsts ):
                files = dsts[ dst ]
                nu    = len( files )

                if (vcfResult.excludeSingleton) and (nu == 1):
                    vcfResult.simpliStats['Singleton 2'] += 1
                    #print "singleton 1: %s" % str(srcs)
439
                    self.printprogress('2', key='Singleton 2', skip=vcfResult.print_every)
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
440
441
                    continue

442
                files_str = "|".join( files )
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
443
444
445
                info_str  = "NV=%d;NW=%d;NS=%d;NT=%d;NU=%d" % ( nv, nw, ns, nt, nu )

                #                      chr  pos         id   ref  alt  qual filter  info      FORMAT filenames
446
447
448
449
450
451
                chrom_name = chrom
                if self.translation:
                    if chrom in self.translation:
                        chrom_name = self.translation[ chrom_name ]

                restr += "\t".join([ chrom_name, str(posit), '.', src, dst, '.', 'PASS', info_str, 'FI',  files_str]) + "\n"
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
452
453
454
455

        if len(restr) == 0:
            vcfResult.simpliStats['Singleton 3'] += 1
            #print "singleton l: %s" % str(srcs)
456
            self.printprogress('3', key='Singleton 3', skip=vcfResult.print_every)
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
457
458
459

        else:
            vcfResult.simpliStats['Ok'] += 1
460
            self.printprogress('.', key='Ok', skip=vcfResult.print_every)
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507

        return restr



class vcfRegister(dict):
    """
    Class containg all information of a given position
    """
    def __init__(self, *args, **kwargs):
        #if 'defaultres' in kwargs:
        #    self.defaultres = kwargs['defaultres']
        #else:
        #    self.defaultres = None

        super(vcfRegister, self).__init__(*args, **kwargs)
        self.__dict__ = self

    #def __getattr__(self, attr):
    #    if attr in self:
    #        return self[attr]
    #    else:
    #        return self.defaultres
    #
    #__setattr__ = dict.__setitem__

    def __repr__(self):
        res = "CHROM %s POS %s SRC %s DST %s DESC '%s' STATE %s FILENAME '%s' FILEDESC '%s' FILECARE %s" % (
                self['chrom'   ],
            str(self['pos'     ]),
                self['src'     ],
                self['dst'     ],
                self['desc'    ],
            str(self['state'   ]),
                self['filename'],
                self['filedesc'],
            str(self['filecare'])
        )
        return res



class vcfFile(object):
    """
    Reads a VCF file and keeps the positions until NEXT is called.
    Used to facilitate the parallel reading of VCF files where all files have to be in the same coordinate.
    """
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
508
509
    ignores = ['0|0', '0/0', './.'] # reference, nocov
    def __init__(self, infile, filedesc, filecare, fileCol):
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
510
511
512
        self.infile   = infile
        self.filedesc = filedesc
        self.filecare = filecare
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
513
        self.fileCol  = fileCol
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
514

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
515
        checkfile(infile)
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
516
517

        #print "  opening %s" % infile
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
518
519
520
521
522
        self.names                = []
        self.infhd                = openvcffile(infile, 'r')
        self.state                = FHDOPEN
        self.currLine             = ""
        self.register             = vcfRegister()
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
523
524
525
526
527
528
529
530
        self.register['filename'] = infile
        self.register['filedesc'] = filedesc
        self.register['filecare'] = filecare

    def next(self):
        """
        Requests the next line in the VCF file as a register
        """
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
531
532
533
534
535
536
537
538
        while self.state == FHDOPEN:
            self.register['chrom'] = None
            self.register['pos'  ] = None
            self.register['src'  ] = None
            self.register['dst'  ] = None
            self.register['desc' ] = None
            self.register['state'] = self.state

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
539
            currLine = ""
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
540
541
            again    = False

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
542
543
544
            while currLine is not None and len(currLine) == 0:
                currLine = self.infhd.readline()
                if len( currLine ) == 0 or currLine[-1] != "\n": # EOF
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
545
546
                    currLine   = None
                    again      = True
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
547
                    self.state = FHDJUSTCLOSED
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
548
549
550
                    print 'f',
                    break

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
551
                elif len(currLine) == 1: # EMPTY LINE
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
552
553
554
555
                    again = True
                    print 'e',
                    break

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
556
557
558
                else: #normal line
                    pass

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
559
560
561
562
            if again:
                print 'a',
                continue

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
563
564
565
566
567
568
            #0               1       2       3       4       5       6       7                                                       8               9
            #SL2.40ch01      2118853 .       T       G       222     .       DP=40;AF1=1;CI95=1,1;DP4=0,0,16,23;MQ=60;FQ=-144        GT:PL:DP:GQ     1/1:255,117,0:39:99
            currLine = currLine.strip()
            cols     = currLine.split("\t")

            if len(cols) == 0:
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
569
570
                print 'n',
                continue
571

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
572
573
            elif currLine[0] == "#":
                if currLine.startswith('##sources='):
574
                    self.names = currLine[10:].split('|')
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
575
                    pp(self.names)
576

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
577
578
579
580
581
582
                elif currLine.startswith('##numsources='):
                    numsources = int(currLine[13:])
                    if numsources != len(self.names):
                        print "error parsing sources"
                        print "num sources", numsources,"!=",len(self.names),sorted(self.names)
                        sys.exit(1)
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
583

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
584
585
586
                    else:
                        print "num sources:",numsources

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
587
                continue
588

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
589
590
591
592
593
            assert len( cols ) > 9, str(cols)

            descIndex = 9 + self.fileCol - 1
            info      = cols[8]
            desc      = cols[descIndex] # 1 based
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
594
            #print cols
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632

            self.register['chrom'] =     cols[0]
            self.register['pos'  ] = int(cols[1])
            self.register['src'  ] =     cols[3]
            self.register['dst'  ] =     cols[4]
            self.register['desc' ] =     desc

            if ':'  in desc and ':'  in info and 'GT' in info:
                #assert ':'  in info, info
                #assert 'GT' in info, info
                #assert ':'  in desc, desc
    
                #print "has desc"
                infoC  = info.split(':')
                assert len(infoC) > 1, info
                #print "  info" , info
                #print "  infoC", infoC
    
                descC = desc.split(":")
                assert len(descC) > 1, desc
                #print "  desc" , desc
                #print "  descC", descC
    
                gtpos = info.index('GT')
                #print "  GT pos", gtpos
    
                assert len(infoC) == len(descC), info + "|" + desc
                #print "   len infoC == len descC", infoC, descC
    
                gtDesc   = descC[gtpos]
                gt0, gt1 = (None, None)
    
                if   '/' in gtDesc:
                    gt0, gt1 = gtDesc.split('/')
    
                elif '|' in gtDesc:
                    gt0, gt1 = gtDesc.split('|')
    
633
                else:
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
634
635
636
637
638
639
640
641
642
643
644
                    assert False, 'unknown info fomat: %s (%s, %s)' % (gtDesc, info, desc)
    
                #print "   gtinfo", gtinfo
                #print "    gt1", gt0
                #print "    gt2", gt1
                s = set([gt0, gt1])
    
                if gt0 == '.':                      # skip no coverage
                    #print '.',
                    continue
    
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
645
646
647
648
649
650
                if (gt0 == '0'): # homozygous identical to reference
                    if len(s) == 1:
                        #print '0',
                        continue
                    else:
                        self.register['dst'  ] = ",".join(sorted(list(set(self.register['src'  ].split(",") + self.register['dst'  ].split(",")))))
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
    
                #print 'v'
                #if any([gt == '0' for gt in (gt0, gt1)]): #
                #print "has desc"
                #print " info", info
                #print "  GT pos", gtpos
                #print "  desc", desc
                #print "   len info == len desc", info, desc
                #print "   gtinfo", gtinfo
                #print "    gt1", gtinfo[ 0]
                #print "    gt2", gtinfo[-1]
                #print "   adding src to dst", self.register['src'  ], self.register['dst'  ]
    
                #if gt0 == '0' or gt1 == '0': # if heretozygous and has reference, make it explicit
                #    print gtinfo, 'h'
                #    self.register['dst'  ] = ",".join(sorted(list(set(self.register['src'  ].split(",") + self.register['dst'  ].split(",")))))
                #    #print "   added  src to dst", self.register['dst'  ]

            #print 'V',
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
670

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
671
            break
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
672

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
673
        self.register['state'] = self.state
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703

    def close( self ):
        """
        Close filehandle
        """
        self.state = FHDCLOSED
        self.infhd.close()

    def currRegister(self):
        """
        Get current register
        """
        if self.state == FHDOPEN:
            return self.register
        else:
            return None

    def getstate(self):
        """
        Get file state
        """
        return self.state



class vcfHeap(object):
    """
    Maintain the latest line in all VCF files at the same time
    """

704
705
706
707
708
    def __init__(self, simplify=SIMP_NO_SIMPLIFICATION, translation=None):
        self.translation  = translation

        print "VCF Heap Translation", translation

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
        self.fileNames    = [] #vecString   fileNames;
        self.filehandle   = [] #vecIfstream filehandle;
        self.fileStates   = [] #vecBool     fileStates;
        self.filePos      = {} #map<   string, int          > filePos;
        self.numOpenFiles = 0 #int         numOpenFiles;

        self.lastPos      = 0  #int            lastPos;
        self.lastChr      = "" #string         lastChr;
        self.heapHead     = [] #vecCovRegister heapHead;
        self.ctime        = datetime.datetime.now().isoformat()
        self.stats        = {}
        self.stats        = {
            'chroms'    : {},
            'start time': self.ctime
        }
        self.simplify     = simplify
        self.noncarefiles = []
726
        self.currResult   = vcfResult( simplify=self.simplify, noncarefiles=self.noncarefiles, translation=translation )
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
727
        self.currResult   = None
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
728
        
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
729
730
731
732
733
734
735
736
        print self.ctime

    def addFile( self, filecare, fileName, filedesc ):
        """
        Adds file to be taken track of
        """
        print "adding file to heap"
        filecare = filecare == '1'
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
        fileCol  = 1

        if '|' in fileName:
            cols     = fileName.split('|')
            assert len(cols) == 2
            fileName =     cols[0]
            fileCol  = int(cols[1])
            print "FILENAME %s COL %s" % (fileName, fileCol)

        if not os.path.exists( fileName ):
            print "vcf file %s does not exists" % fileName
            sys.exit( 1 )

        else:
            print "vcf file %s does exists" % fileName

        assert len(filedesc) > 0

        self.filePos[ (fileName, filedesc, filecare, fileCol) ] = len( self.fileNames )
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
756
757
758
759
760
761
762

        if not filecare:
            self.noncarefiles.append(filedesc)

        self.fileNames.append( (fileName, filedesc, filecare) )

        print "creating vcf handler"
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
763
        self.filehandle.append( vcfFile( fileName, filedesc, filecare, fileCol ) );
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
764
765
766
767
768
        print "created vcf handler"
        self.fileStates.append( True );
        self.numOpenFiles += 1

        print "getting first register"
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
769
        firstRegister = self.getRegister( self.filePos[ ( fileName, filedesc, filecare, fileCol ) ] );
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
        print "got first register"

        self.heapHead.append( firstRegister );
        print "added to heap"

    def getFileNames(self):
        """
        Lists all files
        """
        return [ x[0] for x in self.fileNames ]

    def getFileDesc(self):
        """
        Returns list of file descriptions (pretty name)
        """
        if self.simplify:
            return self.filehandle[0].names
        else:
            return [ x[1] for x in self.fileNames ]

    def getCurrChr(self):
        """
        Returns current chromosome
        """
        return self.lastChr

    def getCurrpos(self):
        """
        Returns current position
        """
        return self.lastPos

    def getNumFiles(self):
        """
        Returns number of files
        """
        if self.simplify:
            return len( self.filehandle[0].names )
        else:
            return len( self.fileNames )

    def getNumOpenFiles(self):
        """
        Returns number of files still open
        """
        if self.simplify:
            return len( self.filehandle[0].names )
        else:
            return self.numOpenFiles

    def isempty( self ):
        """
        Returns if there's still open files
        """
        #print "is empty? %d\n" % self.numOpenFiles
        return self.numOpenFiles == 0

    def getRegister( self, fileNum ):
        """
        Gets the current register of a file
        """
        infilehandle = self.filehandle[ fileNum ]
        infilehandle.next()
        currRegister = infilehandle.currRegister()

        if currRegister is None:
            infilehandle.close()
            self.fileStates[ fileNum ] = False
            self.numOpenFiles -= 1
            return None

        return currRegister;

    def getFilterStats(self):
        """
        Gets the number of simplifications performed in the VCF files
        """
        return pprint.pformat(vcfResult.simpliStats)

    def next( self ):
        """
        Gets the next position shared by all files
        """
853
        self.currResult = vcfResult( translation=self.translation )
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
854
855
        response        = []

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
856
857
        chromCount      = 0
        posCount        = 0
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
858
859
860
861
862
863
864
865
866
867
868
869

        has_more_chr = True
        if self.lastChr == "":
            poses = [ heap.chrom for heap in self.heapHead if (heap is not None) and (heap.state == FHDOPEN) ]
            if len(poses) > 0:
                self.lastChr = sorted( poses )[0]

                self.stats['chroms'][ self.lastChr ] = {
                    'start time'  : datetime.datetime.now().isoformat(),
                    'count unique': 1,
                    'count total' : 0
                }
870

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
871
                print
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
872
                print "   STARTING CHROMOSOME %s" % self.lastChr
873

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
874
875
876
            else:
                has_more_chr = False
                return None
877

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
        else:
            self.stats['chroms'][ self.lastChr ][ 'count unique' ] += 1

        poses = []
        if has_more_chr:
            poses = [ heap.pos for heap in self.heapHead if ((heap is not None) and (heap.chrom == self.lastChr ) and ( heap.state == FHDOPEN )) ]

        if (len( poses ) > 0):
            lastChromData = self.stats['chroms'][ self.lastChr ]
            self.lastPos  = min( poses )

            for fileNum in xrange( len( self.fileNames ) ):
                #print "filenum %d last chr %s lasPos %d len %d" % ( fileNum, self.lastChr, self.lastPos, len( self.fileNames ) )

                if self.fileStates[ fileNum ]: # if file not closed
                    #print " file open"
                    heapdata = self.heapHead[ fileNum ]

                    if heapdata.state == FHDOPEN: # is valid
                        #print "  heap open"

                        if heapdata.chrom == self.lastChr: # if chrom is correct
                            #print "    heap head chr ok %s" % self.lastChr
                            chromCount += 1 # chrom still exists

                            while heapdata.pos == self.lastPos: # if pos is correct
                                #print "    heap head pos ok %d" % self.lastPos
                                #print "!"*200
                                lastChromData[ 'count total' ] += 1

                                posCount += 1
                                #print heapdata
                                response.append( copy.copy( heapdata ) ) # add to response

                                self.heapHead[ fileNum ] = self.getRegister( fileNum ) # add to response
                                heapdata = self.heapHead[ fileNum ]
914
                                if heapdata       is None        : break
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
                                if heapdata.state != FHDOPEN     : break
                                if heapdata.chrom != self.lastChr: break
                            #print "    heap head pos NOT ok %d" % self.heapHead[ fileNum ].pos
                        else:
                            #print "    heap head chr NOT ok %s" % self.heapHead[ fileNum ].chr
                            pass
                    else:
                        #print "    file is closed"
                        pass
                else:
                    #print "    state is closes"
                    pass

            #print "    end for loop"

            if 'first' not in lastChromData:
                lastChromData[ 'first' ] = self.lastPos
                lastChromData[ 'ela'   ] = datetime.datetime.now()

        else: # no more positions to current chromosome
            #print "    len poses == 0"
            chromData = self.stats['chroms'][ self.lastChr ]
            chromData[ 'last'         ] = self.lastPos
            chromData[ 'end time'     ] = datetime.datetime.now().isoformat()
            chromData[ 'ela'          ] = datetime.datetime.now() - chromData[ 'ela' ]
            chromData[ 'ela_str'      ] = str( chromData[ 'ela' ] )
            chromData[ 'snp_per_kb_t' ] = ( chromData[ 'count total'  ] * 1.0 ) / self.lastPos * 1000.0
            chromData[ 'snp_per_kb_u' ] = ( chromData[ 'count unique' ] * 1.0 ) / self.lastPos * 1000.0
            chromData[ 'speed_t'      ] = ( chromData[ 'count total'  ] * 1.0 ) / chromData[ 'ela' ].total_seconds()
            chromData[ 'speed_u'      ] = ( chromData[ 'count unique' ] * 1.0 ) / chromData[ 'ela' ].total_seconds()
            pprint.pprint( chromData )

            self.lastPos = 0
            self.lastChr = ""
            #sys.exit(0)
            print "empty. nexting"
            return self.next()

        self.currResult.result   = response
        self.currResult.chrom    = self.lastChr
        self.currResult.pos      = self.lastPos
        self.currResult.chrCount = chromCount
        self.currResult.posCount = posCount

        self.lastPos += 1

        return self.currResult

    def getVcfHeader(self):
        """
        Returns a VCF header
        """

        header = """\
##fileformat=VCFv4.1
##fileDate=%s
##source=vcfmerger
##sources=%s
##numsources=%d
##INFO=<ID=NV,Number=1,Type=Integer,Description=\"Number of Unique SNPs Variants in Position\">
##INFO=<ID=NW,Number=1,Type=Integer,Description=\"Number of Unique Source Nucleotides\">
##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Species in Position\">
##INFO=<ID=NT,Number=1,Type=Integer,Description=\"Number of Species Having Source Nucleotide\">
##INFO=<ID=NU,Number=1,Type=Integer,Description=\"Number of Species Having Source and Target Nucleotides\">
##FORMAT=<ID=FI,Number=1,Type=String,Description=\"Source Files\">
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tFILENAMES
981
""" % (self.ctime, "|".join( self.getFileDesc() ), self.getNumFiles() )
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
982
983
984
985
        return header



986
def main(incsv, translation_str):
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
987
988
    outfile    = incsv + '.vcf.gz'

989

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
990
991
992
    if not os.path.exists( incsv ):
        print "input file does not exists. quitting like a whimp"
        sys.exit( 1 )
993

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
994
995
996
    print "reading %s" % incsv


997
    translation = {}
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
998
999
    if translation_str is not None:
        for pair in translation_str.split(';'):
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
1000
            src, dst = pair.split(':')
For faster browsing, not all history is shown. View entire blame