vcfmerger_multicolumn.py 17.5 KB
Newer Older
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
1
2
3
4
5
#!/usr/bin/python
import os
import sys
import csv
import datetime
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
6
import argparse
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
7

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
8
from copy        import deepcopy
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
9
10
11
12
13
14
15
16
17
from filemanager import checkfile, openfile, openvcffile
from csv_renamer import get_translation

"""
EX1=vcfmerger/vcfmerger_multicolumn.py
VCF=1001genomes_snp-short-indel_only_ACGTN.vcf.gz
LST=A_thaliana_master_accession_list_1135_20151008.csv

${EX1} ${VCF}
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
18
19
20
21
22
${EX1} --input ${VCF} --table ${LST} --keys tg_ecotypeid --table-values name,othername,CS_number --chromosome-translation "1:Chr1;2:Chr2;3:Chr3;4:Chr4;5:Chr5" --keep-no-coverage


ln -s 1001genomes_snp-short-indel_only_ACGTN.vcf.gz.vcf.gz arabidopsis.csv.vcf.gz
ln -s 1001genomes_snp-short-indel_only_ACGTN.vcf.gz.vcf.gz arabidopsis.csv.vcf.gz.simplified.vcf.gz
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
23
24
25
26
27
28
29
30
31
32
33
"""


class vcf(object):
    def __init__(self, fhd=None, printEvery=100000):
        self.ctime        = datetime.datetime.now().isoformat()
        self.fhd          = fhd
        self.printEvery   = printEvery
        self.lastChrom    = None
        self.numRegs      = 0
        self.numRegsChrom = 0
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
34
        self.stats        = {}
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
35
36
37
38
39
40
41
42
43
44
45
46
47

    def setFhd(self, fhd):
        self.fhd          = fhd

    def printRegister(self, register):
        self.numRegs      += 1
        self.numRegsChrom += 1
        self.printString( self.parseRegister(register) )

        if register['chrom'] != self.lastChrom:
            self.numRegsChrom -= 1
            if self.lastChrom is not None:
                sys.stdout.write(' Chrom {:14,d} Total {:14,d}\n'.format(self.numRegsChrom, self.numRegs))
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
48
49
                print self.stats
                self.fhd.flush()
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80

            sys.stdout.write('\nChromosome {0}\n'.format(register['chrom']))
            sys.stdout.flush()

            self.numRegsChrom = 1
            self.lastChrom    = register['chrom']

        if self.numRegs % (self.printEvery / 100) == 0:
            sys.stdout.write('.')
            if self.numRegs % self.printEvery == 0:
                sys.stdout.write(' Chrom {:14,d} Total {:14,d}\n'.format(self.numRegsChrom, self.numRegs))
            sys.stdout.flush()

    def parseRegister(self, register):
        #print register

        stats = register['stats']

        #"UP=%{UP}d;PH=%{PH}d;UN=%{UN}d;NT=%{NT}d;HO=%{HO}d;HE=%{HE}d;NS=%{NS}d"
        ks = {
                'NV': 1,
                'NW': 1,
                'NS': 1,
                'NT': stats['ref'     ],
                'NU': len(register['desc' ].split('|')),
                'UP': stats['unphased'],
                'PH': stats['phased'  ],
                'UN': stats['uncalled'],
                'HO': stats['homo'    ],
                'HE': stats['het'     ]
            }
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
81
82
83
84
        
        for k in stats:
            self.stats[k] = self.stats.get(k, 0) + stats[k]
        
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
        info_str  = ";".join( [ "%s=%d" % ( x, ks[x] ) for x in sorted(ks) ] )

        restr = "\t".join([ register['chrom'], str(register['pos'  ]), '.', register['src'  ], register['dst'  ], '.', 'PASS', info_str, 'FI',  register['desc' ]])

        return restr

    def printString(self, text):
        if self.fhd is not None:
            #print text
            self.fhd.write(text + '\n')

    def printVcfHeader(self, fileDesc):
        fileDesc     = fileDesc
        numFiles     = len(fileDesc)

        """
        Returns a VCF header
        """

        header = """\
##fileformat=VCFv4.1
##fileDate=%s
##source=vcfmerger
##sources=%s
##numsources=%d
##INFO=<ID=NV,Number=1,Type=Integer,Description=\"Number of Unique SNPs Variants in Position\">
##INFO=<ID=NW,Number=1,Type=Integer,Description=\"Number of Unique Source Nucleotides\">
##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Species in Position\">
##INFO=<ID=NT,Number=1,Type=Integer,Description=\"Number of Species Having Source Nucleotide\">
##INFO=<ID=NU,Number=1,Type=Integer,Description=\"Number of Species Having Source and Target Nucleotides\">
##INFO=<ID=UP,Number=1,Type=Integer,Description=\"Number of UnPhased calls\">
##INFO=<ID=PH,Number=1,Type=Integer,Description=\"Number of Phased calls\">
##INFO=<ID=UN,Number=1,Type=Integer,Description=\"Number of Uncalled spps\">
##INFO=<ID=HO,Number=1,Type=Integer,Description=\"Number of Homozygous calls\">
##INFO=<ID=HE,Number=1,Type=Integer,Description=\"Number of Heterozygous calls\">
##FORMAT=<ID=FI,Number=1,Type=String,Description=\"Source Files\">
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tFILENAMES""" % (self.ctime, "|".join( fileDesc ), numFiles )

        self.printString(header)
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
124
125
126
        self.fhd.flush()


Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
127
128
129
130
131
132






Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
133
134
135
136
137
138
139
140
141
142
143
def main(args):
    parser  = argparse.ArgumentParser(description='Simplify merged VCF file.')
    parser.add_argument('-i', '--input'                 , dest='input'            , required=True,                                    nargs='?', type=str, help='Input file'                                                 )
    parser.add_argument('-o', '--output'                , dest='output'           ,                default=None,                      nargs='?', type=str, help='Output file'                                                 )
    parser.add_argument('-t', '--table'                 , dest='table'            ,                default=None,                      nargs='?', type=str, help='Input table'                                                )
    parser.add_argument('-k', '--keys'                  , dest='keys'             ,                default=None,                      nargs='?', type=str, help='Input keys'                                                 )
    parser.add_argument('-v', '--table-values'          , dest='table_vs'         ,                default=None,                      nargs='?', type=str, help='Input table values'                                         )
    parser.add_argument('-c', '--chromosome-translation', dest='translation'      ,                default=None,                      nargs='?', type=str, help='Translation table to chromosome names [e.g.: 1:Chr1;2:Chr2' )
    parser.add_argument('-s', '--samples'               , dest='samples'          ,                default=None,                      nargs='?', type=str, help='Samples (Columns) to keep [e.g.: Spp1;Spp3;Spp5' )
    parser.add_argument('-n', '--keep-no-coverage'      , dest='keep_no_coverage' ,                              action='store_true',                      help='Keep rows containing no coverage'                           )
    parser.add_argument('-e', '--keep-heterozygous'     , dest='keep_heterozygous',                              action='store_true',                      help='Keep rows hoterozygosity'                           )
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
144

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
145
    options = parser.parse_args(args)
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
146

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
147
148
149
150
    print "Options", options

    invcf   = options.input
    
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
151
152
153
154
155
    try:
        checkfile(invcf)
        print "input vcf:              %s" % invcf

    except:
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
156
157
158
        parser.print_usage()
        #print "%s --input <invcf>" % sys.argv[0]
        print "EG.: %s --input 1001genomes_snp-short-indel_only_ACGTN.vcf.gz" % sys.argv[0]
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
159
160
161
        sys.exit(1)


Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
162
163
164
165
166
167
168
169
170
    outbn = invcf
    if options.output is not None:
        outbn = options.output

    outbn      += (".nc" if options.keep_no_coverage else "") + (".het" if options.keep_heterozygous else "")
    listFile    = outbn + '.list.csv'
    vcfFile     = outbn + '.list.csv.vcf.gz'
    outFile     = outbn + '.list.csv.vcf.gz.simplified.vcf.gz'
    outFileTmp  = outbn + '.list.csv.vcf.gz.simplified.tmp.vcf.gz'
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
171

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
172
173
    if not os.path.exists(vcfFile):
        os.symlink(invcf, vcfFile)
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
174
175
176
177
178
179
180
181
182
183

    if os.path.exists( outFile ):
        print "Out File (%s) EXISTS. quitting" % outFile
        sys.exit(1)


    print "Out File:               %s" % outFile


    try:
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
184
        intbl  = options.table
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
185
        checkfile(intbl)
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
186
        print "Input Table: %s" % intbl
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
187
188
189
190
191

    except:
        intbl  = None


Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
192
193
194
195
    tbl_k  = None
    if options.keys is not None:
        tbl_k  = options.keys
        print "Input Table keys: %s" % tbl_k
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
196
197


Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
198
199
200
201
    tbl_vs = None
    if options.table_vs is not None:
        tbl_vs = options.table_vs.split(',')
        print "Table values: %s" % options.table_vs
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
202
203
204
205
206
207
208
209
210


    data, atad = ( None, None )
    if intbl:
        data, atad = get_translation(intbl, tbl_k, tbl_vs)
        print 'DATA', data
        print 'ATAD', atad


Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
    translation = {}
    if options.translation is not None:
        for pair in options.translation.split(';'):
            src, dst = pair.split(':')
            assert src not in translation
            translation[ src ] = dst

        print "Translation", translation
    else:
        translation = None


    columns = None
    if options.samples is not None:
        columns = options.samples.split(';')
        assert len(columns) > 0, "No Columns %s" % str(columns)

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256

    vcf_holder = vcf()
    names      = None
    with openfile(invcf, 'r') as fhdi:
        with openvcffile(outFileTmp, 'w', compresslevel=1) as fhdv:
            vcf_holder.setFhd(fhdv)
            for line in fhdi:
                line = line.strip()

                if len(line) == 0:
                    continue

                if line.startswith("#"): # header
                    print "HEADER", line

                    if line.startswith("##"): # definition lines
                        print "HEADER :: DEF", line

                    else: # column description
                        print "HEADER :: COL", line

                        cols     = line.split("\t")
                        num_cols = len(cols)
                        shared   = cols[:9 ] #CHROM    POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT
                        names    = cols[ 9:]

                        print "HEADER :: COL :: SHARED", shared
                        #print "HEADER :: COL :: NAMES" , names

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
257
258
259
260
261
                        if columns is not None:
                            cdiff = list( set(columns) - set(names) )
                            assert len( cdiff ) == 0, "Unknown column name: %s" % ( str(cdiff) )

                        with open(listFile, 'wb') as fhdl:
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
262
263
                            writer = csv.writer(fhdl, delimiter='\t', quotechar='"')
                            for ln, name in enumerate(names):
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
264
265
266
267
                                if columns is not None:
                                    if name not in columns:
                                        continue

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
                                cols = ["1", "%s|%d" % (invcf, ln+1), name]

                                if data is not None:
                                    assert name in data, "name %s not in db" % name

                                    #print "converting %s to %s" % (name, data[name])
                                    cols[2]   = data[name]
                                    names[ln] = data[name]

                                #print "COLS", cols
                                writer.writerow(cols)

                        print "HEADER :: COL :: NAMES" , names

                        vcf_holder.printVcfHeader(names)

                else:
                    cols     = line.split("\t")

                    assert len( cols ) > 9

                    info      = cols[8]
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
290
291
                    assert ':'  in info, line
                    assert 'GT' in info, line
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
292
293
294
295
296
297
298
299
300
301

                    #print "has desc"
                    infoC  = info.split(':')
                    assert len(infoC) > 1
                    #print "  info" , info
                    #print "  infoC", infoC

                    gtpos = info.index('GT')
                    #print "  GT pos", gtpos

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
302
303
304
                    if len(cols[3]) > 1:
                        continue

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
305
306
307
308
309
                    register = {
                        'chrom':     cols[0] ,
                        'pos'  : int(cols[1]),
                        'src'  :     cols[3] ,
                        'dst'  :     cols[4] ,
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
310
                        'desc' :         {}  ,
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
311
312
313
314
315
316
317
318
319
320
                        'stats':         {
                            'unphased' : 0,
                            'phased'   : 0,
                            'uncalled' : 0,
                            'ref'      : 0,
                            'homo'     : 0,
                            'het'      : 0
                        }
                    }
                    descs             =     cols[9:]
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
321
322
                    has_gap           = False
                    is_het            = False
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
323
324

                    for colNum, desc in enumerate(cols[9:]):
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
                        colname = names[colNum]

                        if columns is not None:
                            if colname not in columns:
                                continue

                        if (desc == './.'):
                            register['stats']['uncalled'] += 1
                            if not options.keep_no_coverage:
                                has_gap = True
                                break
                            else:
                                continue

                        assert ':'  in desc, desc + " " + str(cols[9:])
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
340
341
342
343
344
345

                        descC = desc.split(":")
                        assert len(descC) > 1
                        #print "  desc" , desc
                        #print "  descC", descC

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
346
347
348
349
350
351
352
353
354
                        #assert len(infoC) == len(descC), str(infoC) + " " + str(descC) + " " + str(cols[9:])
                        if len(infoC) != len(descC):
                            register['stats']['uncalled'] += 1
                            if not options.keep_no_coverage:
                                has_gap = True
                                break
                            else:
                                continue

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
                        #print "   len infoC == len descC", infoC, descC

                        gtDesc   = descC[gtpos]
                        gt0, gt1 = (None, None)

                        if   '/' in gtDesc:
                            gt0, gt1 = gtDesc.split('/')
                            register['stats']['unphased'] += 1

                        elif '|' in gtDesc:
                            gt0, gt1 = gtDesc.split('|')
                            register['stats']['phased'  ] += 1

                        else:
                            assert False, 'unknown info fomat: %s (%s, %s)' % (gtDesc, info, desc)



                        if gt0 == '.' or gt1 == '.':                      # skip no coverage
                            #sys.stdout.write('.')
                            register['stats']['uncalled'] += 1
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
376
377
378
                            if not options.keep_no_coverage:
                                has_gap = True
                                break
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
379
380
381

                        else:
                            if len(set([gt0, gt1])) == 1:
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
382
                                #sys.stdout.write('o')
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
383
                                register['stats']['homo'] += 1
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
384
                                
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
385
386
                                if (gt0 == '0'): # homozygous identical to reference
                                    register['stats']['ref'] += 1
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
387
388
                                    continue
                                    #register['desc' ].append( names[colNum] )
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
389
390

                            else:
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
391
                                #sys.stdout.write('e')
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
392
                                register['stats']['het'] += 1
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
                                if not options.keep_heterozygous:
                                    is_het = True
                                    break
                                else:
                                    continue
                            
                            dstC = register['dst'].split(',')
                            nuc0 = register['src'] if gt0 == '0' else dstC[ int(gt0) - 1 ]
                            nuc1 = register['src'] if gt1 == '0' else dstC[ int(gt1) - 1 ]
                            nucK = (nuc0, nuc1)
                            
                            if nucK not in register['desc' ]:
                                register['desc' ][nucK] = []
                                
                            register['desc' ][nucK].append( names[colNum] )

                            #if gt0 == '0' or gt1 == '0': # if heretozygous and has reference, make it explicit
                            #    #sys.stdout.write('H')
                            #    alts = sorted(list(set(register['src'  ].split(",") + register['dst'  ].split(","))))
                            #    alts = [ a for a in alts if a != '.' ]
                            #    register['dst'  ] = ",".join(alts)
                            #    #print "   added  src to dst", self.register['dst'  ]

                            
                            #register['desc' ].append( names[colNum] )

                    #sys.stdout.flush()
                    
                    if has_gap:
                        continue

                    if is_het:
                        continue
                    
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
427
428
                    if len(register['desc' ]) > 0:
                        #print '+\n'
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
429
430
431
432
433
434
435
436
437
438
439
440
441
                        
                        if translation:
                            register['chrom'] = translation.get(register['chrom'], register['chrom'])
                        
                        descs = deepcopy(register['desc' ])
                        for desc in descs:
                            register['desc' ] = '|'.join(descs[desc])

                            if len(set(desc)) == 1:
                                desc = desc[0]

                            register['dst'  ] = ','.join(sorted(list(set(desc))))
                            vcf_holder.printRegister(register)
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
442
443
444
445
446
447
448

                    else:
                        #print '-'
                        pass

            fhdv.flush()

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
449
450
451
        print "GLOBAL STATS"
        print vcf_holder.stats

Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
452
453
454
        os.rename(outFileTmp, outFile)

if __name__ == '__main__':
Aflitos, Saulo Alves's avatar
Aflitos, Saulo Alves committed
455
    main(sys.argv[1:])