Commit d228c226 authored by Aflitos, Saulo Alves's avatar Aflitos, Saulo Alves
Browse files

vcf multicolumn

parent 73bb4780
......@@ -1279,6 +1279,58 @@ def getManager(db_name):
def load_database():
if app.config['DEBUG']:
print "IN DEBUG MODE. MAIN TREAD: ", (os.environ.get('WERKZEUG_RUN_MAIN') is None)
if os.environ.get('WERKZEUG_RUN_MAIN') is None:
print " STILL RELOADING. NOT LOADING DATABASE"
return
else:
print " HAS RELOADED. LOADING DATABASE"
print "GLOBING", app.config["INFOLDER"]
files = glob.glob( os.path.join( app.config["INFOLDER"], '*.pickle.gz') )
files.extend( glob.glob( os.path.join( app.config["INFOLDER"], '*.sqlite' ) ) )
#print "GLOBBED", files
files.sort()
if "DATABASES" not in app.config:
app.config["DATABASES" ] = []
for db_name in files:
db_nfo = db_name + ".nfo"
db_name_base = db_name
if db_name.endswith( '.sqlite' ):
db_name_base = db_name[:-7]
elif db_name.endswith( '.pickle.gz' ):
db_name_base = db_name[:-10]
db_title = db_name_base
db_conf = {}
if os.path.exists( db_nfo ):
db_title, db_conf = read_nfo( db_title, db_nfo, path=app.config["INFOLDER"] )
if len( db_title ) == 0:
print "db %s has no title. using file name", db_name
db_title = db_name_base
else:
#print "database %s has no nfo file (%s), skipping" % (db_name, db_nfo)
print "database %s has no nfo file (%s). skipping" % (db_name, db_nfo)
continue
app.config["DATABASES" ].append( [db_title, db_name_base, db_conf] )
print "appending database %s (base %s) with description '%s'" % (db_name, db_name_base, db_title)
app.config["DATABASES" ].sort(key=itemgetter(0))
print app.config["DATABASES" ]
init_db()
def init_db():
"""
Load global/shared database
......@@ -1290,6 +1342,7 @@ def init_db():
If no database defined in config.py. quit
"""
print '!!!!!!!!! NO DATABASES GIVEN. PLEASE ADD DATABASES !!!!!!!!!'
sys.exit(1)
else:
with app.app_context():
......@@ -1326,51 +1379,6 @@ def init_db():
print "db loaded"
def load_database():
print "GLOBING", app.config["INFOLDER"]
files = glob.glob( os.path.join( app.config["INFOLDER"], '*.pickle.gz') )
files.extend( glob.glob( os.path.join( app.config["INFOLDER"], '*.sqlite' ) ) )
#print "GLOBBED", files
files.sort()
if "DATABASES" not in app.config:
app.config["DATABASES" ] = []
for db_name in files:
db_nfo = db_name + ".nfo"
db_name_base = db_name
if db_name.endswith( '.sqlite' ):
db_name_base = db_name[:-7]
elif db_name.endswith( '.pickle.gz' ):
db_name_base = db_name[:-10]
db_title = db_name_base
db_conf = {}
if os.path.exists( db_nfo ):
db_title, db_conf = read_nfo( db_title, db_nfo, path=app.config["INFOLDER"] )
if len( db_title ) == 0:
print "db %s has no title. using file name", db_name
db_title = db_name_base
else:
#print "database %s has no nfo file (%s), skipping" % (db_name, db_nfo)
print "database %s has no nfo file (%s). skipping" % (db_name, db_nfo)
continue
app.config["DATABASES" ].append( [db_title, db_name_base, db_conf] )
print "appending database %s (base %s) with description '%s'" % (db_name, db_name_base, db_title)
app.config["DATABASES" ].sort(key=itemgetter(0))
print app.config["DATABASES" ]
init_db()
def read_nfo( db_title, db_nfo, path='.' ):
print "reading nfo %s title %s" % ( db_nfo, db_title )
title = db_title
......
......@@ -72,7 +72,7 @@ window.onload = function () {
var username = document.getElementById('username').value;
//console.log("username : ", username);
get_json( "/salt/"+username, function(data) {
get_json( "salt/"+username, function(data) {
console.log("salt ", data);
var salt = data.salt;
......
#!/usr/bin/python
import os
import sys
import csv
import re
import unicodedata
from unidecode import unidecode
from filemanager import checkfile, openfile
"""
EX1=vcfmerger/csv_list_multicolumn.py
EX2=vcfmerger/csv_renamer.py
VCF=1001genomes_snp-short-indel_only_ACGTN.vcf.gz
LST=A_thaliana_master_accession_list_1135_20151008.csv
${EX1} ${VCF}
${EX2} ${VCF}.list.csv ${LST} tg_ecotypeid name,othername,CS_number
"""
def main():
try:
invcf = sys.argv[1]
except:
print "<invcf>"
print "EG.: csv_list_multicolumn.py.py 1001genomes_snp-short-indel_only_ACGTN.vcf.gz"
sys.exit(1)
print "input vcf %s" % invcf
checkfile(invcf)
names = None
with openfile(invcf, 'r') as fhdi:
with open(invcf + '.list.csv', 'wb') as fhdo:
writer = csv.writer(fhdo, delimiter='\t', quotechar='"')
for line in fhdi:
line = line.strip()
if len(line) == 0:
continue
if line.startswith("#"): # header
print "HEADER", line
if line.startswith("##"): # definition lines
print "HEADER :: DEF", line
else: # column description
print "HEADER :: COL", line
cols = line.split("\t")
num_cols = len(cols)
shared = cols[:9] #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
names = cols[9:]
print "HEADER :: COL :: SHARED", shared
print "HEADER :: COL :: NAMES" , names
for ln, name in enumerate(names):
cols = ["1", "%s|%d" % (invcf, ln+1), name]
writer.writerow(cols)
break
if __name__ == '__main__':
main()
......@@ -7,27 +7,23 @@ import re
import unicodedata
from unidecode import unidecode
def main():
try:
inlst = sys.argv[1]
intbl = sys.argv[2]
tbl_k = sys.argv[3]
tbl_vs = sys.argv[4].split(',')
"""
EX1=vcfmerger/csv_list_multicolumn.py
EX2=vcfmerger/csv_renamer.py
VCF=1001genomes_snp-short-indel_only_ACGTN.vcf.gz
LST=A_thaliana_master_accession_list_1135_20151008.csv
except:
print "<inlist> <in table> <column key name> <column val names>"
sys.exit(1)
print "input list %s" % inlst
print "input table %s" % intbl
print "table column key name %s" % tbl_k
print "table column val names %s" % tbl_vs
${EX1} ${VCF}
${EX2} ${VCF}.list.csv ${LST} tg_ecotypeid name,othername,CS_number
"""
def get_translation(intbl, tbl_k, tbl_vs):
data = {}
atad = {}
col_names = None
tbl_k_id = None
tbl_v_ids = None
with open(intbl, 'rb') as fhd:
reader = csv.reader(fhd, delimiter=',', quotechar='"')
for ln, cols in enumerate(reader):
......@@ -78,6 +74,27 @@ def main():
data[k] = v
atad[v] = k
return data, atad
def main():
try:
inlst = sys.argv[1]
intbl = sys.argv[2]
tbl_k = sys.argv[3]
tbl_vs = sys.argv[4].split(',')
except:
print "<inlist> <in table> <column key name> <column val names>"
sys.exit(1)
print "input list %s" % inlst
print "input table %s" % intbl
print "table column key name %s" % tbl_k
print "table column val names %s" % tbl_vs
data, atad = get_translation(intbl, tbl_k, tbl_vs)
print "data"
for k,v in data.iteritems():
print " %-7s %s" % (k, v)
......
......@@ -19,7 +19,8 @@ from filemanager import loads_data
def getsession(db_file, echo=False):
print "CREATING SESSSION"
engine = create_engine('sqlite:///'+db_file, echo=echo )
#http://blog.miguelgrinberg.com/post/the-flask-mega-tutorial-part-xvii-deployment-on-linux-even-on-the-raspberry-pi
engine = create_engine('sqlite:///'+db_file + '?check_same_thread=False', echo=echo )
Session = sessionmaker(bind=engine)
session = Session()
session._model_changes = {}
......
......@@ -109,6 +109,10 @@ def checkfile(infile):
print "input file %s does not exists" % infile
sys.exit(1)
if os.path.isdir( infile ):
print "input file %s is a folder" % infile
sys.exit(1)
return infile
......
#!/usr/bin/python
#ulimit -n 4000
import os
import sys
import string
from filemanager import checkfile, openfile
ignores = ['0/0', './.'] # reference, nocov
ignores = ['0|0', '0/0', './.'] # reference, nocov
"""
EX1=vcfmerger/split_multicolumn_vcf.py
EX2=vcfmerger/csv_renamer.py
VCF=1001genomes_snp-short-indel_only_ACGTN.vcf.gz
LST=A_thaliana_master_accession_list_1135_20151008.csv
${EX1} ${VCF}
${EX2} ${VCF}.lst ${LST} tg_ecotypeid name,othername,CS_number
"""
valid_chars = frozenset("_%s%s" % (string.ascii_letters, string.digits))
def sanitize(name):
return ''.join(c if c in valid_chars else '_' for c in name)
......@@ -19,21 +32,18 @@ def main():
print sys.argv[0], "<INPUT MULTICOLUMN CSV>"
sys.exit(1)
if not os.path.exists( infile ):
print "input file %s does not exists" % infile
sys.exit(1)
if os.path.isdir( infile ):
print "input file %s is a folder" % infile
sys.exit(1)
checkfile(infile)
print "splitting %s" % infile
defs = []
names = []
outfiles = []
lastCol = ""
num_cols = None
with open(infile) as fhd:
defs = []
names = []
outfiles = []
valid = 0
skipped = 0
lastCol = ""
num_cols = None
line_count = 0
with openfile(infile, 'r') as fhd:
for line in fhd:
line = line.strip()
......@@ -41,26 +51,29 @@ def main():
continue
if line.startswith("#"): # header
#print "HEADER", line
print "HEADER", line
if line.startswith("##"): # definition lines
#print "HEADER :: DEF", line
print "HEADER :: DEF", line
defs.append( line )
else: # column description
#print "HEADER :: COL", line
print "HEADER :: COL", line
cols = line.split("\t")
num_cols = len(cols)
shared = cols[:9] #CHROM POS ID REF ALT QUAL FILTER INFO FORMA
names = cols[9:]
#print "HEADER :: COL :: SHARED", shared
#print "HEADER :: COL :: NAMES" , names
print "HEADER :: COL :: SHARED", shared
print "HEADER :: COL :: NAMES" , names
outfiles = [None]*len(names)
outlist = open("%s.lst" % infile, 'w')
for np, name in enumerate(names):
nof = ("%s_%0"+str(len("%d"%len(names)))+"d_%s.vcf") % (infile, np+1, sanitize(name))
nof = ("%s_%0"+str(len("%d"%len(names)))+"d_%s.vcf.gz") % (infile, np+1, sanitize(name))
print ("creating %"+str(len("%d"%len(names)))+"d %-"+str(max([len(x) for x in names]))+"s to %s") % (np+1, name, nof)
nop = open( nof, 'w' )
nop = openfile( nof, 'w' )
# skipped valid
outfiles[np] = [name, nof, nop, 0 , 0]
......@@ -75,6 +88,17 @@ def main():
continue
line_count += 1
if line_count % 1000 == 0:
sys.stdout.write('.')
if line_count % 100000 == 0:
sys.stdout.write(' lines %12d valid %12d skipped %12d\n' % (line_count, valid, skipped) )
for nop, ndata in enumerate(outfiles):
ndata[2].flush()
sys.stdout.flush()
#print "DATA", line
cols = line.split("\t")
assert len(cols) == num_cols
......@@ -83,7 +107,7 @@ def main():
data = cols[9:]
if cols[0] != lastCol:
print cols[0]
print '\nChromosome', cols[0]
lastCol = cols[0]
#print "shared", shared
......@@ -91,8 +115,11 @@ def main():
for pos, ndata in enumerate(data):
#outfiles[np] = [name, nof, 0, 0, nop]
if any([ndata.startswith(x) for x in ignores]):
skipped += 1
outfiles[pos][3] += 1 # skipped
continue
valid += 1
outfiles[pos][4] += 1 # valid
outfiles[pos][2].write(shared_str + "\t" + ndata + "\n")
......
......@@ -126,7 +126,7 @@ def main(args):
print "IDX", config['idx']
print "GFF", config['ingffreader'].index
assert set(config['idx'].keys()) >= set(config['ingffreader'].index.keys()), "VCF chromosomes (%s) are not a subset from GFF (%s)" % (", ".join(config['idx'].keys()), ", ".join(config['ingffreader'].index.keys()) )
assert set(config['idx'].keys()) <= set(config['ingffreader'].index.keys()), "VCF chromosomes (%s) are not a subset from GFF (%s)" % (", ".join(config['idx'].keys()), ", ".join(config['ingffreader'].index.keys()) )
if config['inchr'] is not None:
......
......@@ -14,6 +14,8 @@ from pprint import pprint as pp
sys.path.insert(0, '.')
from filemanager import getFh,openvcffile,openfile,checkfile,makeIndexFile,readIndex
#ulimit -n 4096
#try:
# sys.path.insert(0, 'aux/pypy-2.0/pypy-2.0-beta2/pypy-pypy-4b60269153b5/')
# from rpython.rlib.jit import JitDriver, purefunction
......@@ -104,8 +106,9 @@ Constants
SIMP_NO_SIMPLIFICATION = 0 # no simplification
SIMP_SNP = 2**1 # simplify SNPs
SIMP_EXCL_HETEROZYGOUS = 2**2 # exclude heterozygous
SIMP_EXCL_INDEL = 2**3 # exclude indels
SIMP_EXCL_SINGLETON = 2**4 # exclude singletons
SIMP_EXCL_HOMOZYGOUS = 2**3 # exclude homozygous
SIMP_EXCL_INDEL = 2**4 # exclude indels
SIMP_EXCL_SINGLETON = 2**5 # exclude singletons
def getBits(val):
......@@ -143,6 +146,7 @@ class vcfResult(object):
simplifybits = None
simplifySNP = None
excludeHET = None
excludeHOMO = None
excludeINDEL = None
excludeSingleton = None
noncarefiles = None
......@@ -166,7 +170,7 @@ class vcfResult(object):
vcfResult.simpliStats = {
'Heterozygous Dest' : 0,
'Heterozygous Indel': 0,
'Homozygous Indel' : 0,
'Homozygous' : 0,
'Source Indel' : 0,
'Singleton 1' : 0,
'Singleton 2' : 0,
......@@ -181,17 +185,20 @@ class vcfResult(object):
#print "simplifying :: simplify bits ", self.simplifybits
vcfResult.simplifySNP = vcfResult.simplifybits[ int( math.log( SIMP_SNP , 2 ) ) ] # simplify SNP
vcfResult.excludHET = vcfResult.simplifybits[ int( math.log( SIMP_EXCL_HETEROZYGOUS , 2 ) ) ] # exclude heterozygous
vcfResult.excludHOMO = vcfResult.simplifybits[ int( math.log( SIMP_EXCL_HOMOZYGOUS , 2 ) ) ] # exclude homozygous
vcfResult.excludeINDEL = vcfResult.simplifybits[ int( math.log( SIMP_EXCL_INDEL , 2 ) ) ] # exclude indels (len > 1)
vcfResult.excludeSingleton = vcfResult.simplifybits[ int( math.log( SIMP_EXCL_SINGLETON , 2 ) ) ] # exclude single species SNPs
print "simplifying :: SNP [%3d, %3d] %s" % ( SIMP_SNP , math.log(SIMP_SNP , 2 ), str(vcfResult.simplifySNP ) )
print "simplifying :: Heterozygous [%3d, %3d] %s" % ( SIMP_EXCL_HETEROZYGOUS, math.log(SIMP_EXCL_HETEROZYGOUS , 2 ), str(vcfResult.excludHET ) )
print "simplifying :: Homozygous [%3d, %3d] %s" % ( SIMP_EXCL_HOMOZYGOUS , math.log(SIMP_EXCL_HOMOZYGOUS , 2 ), str(vcfResult.excludHOMO ) )
print "simplifying :: Indel [%3d, %3d] %s" % ( SIMP_EXCL_INDEL , math.log(SIMP_EXCL_INDEL , 2 ), str(vcfResult.excludeINDEL ) )
print "simplifying :: Singleton [%3d, %3d] %s" % ( SIMP_EXCL_SINGLETON , math.log(SIMP_EXCL_SINGLETON , 2 ), str(vcfResult.excludeSingleton) )
print 'Progress Legend:'
print ' print every :', vcfResult.print_every
print ' Heterozygous Dest : h'
print ' Homozygous : o'
print ' Heterozygous Indel: I'
print ' Homozygous Indel : i'
print ' Source Indel : s'
......@@ -267,7 +274,7 @@ class vcfResult(object):
if vcfResult.printsReal % vcfResult.linelen == 0 and vcfResult.printsReal != vcfResult.printsRealLast:
vcfResult.printsRealLast = vcfResult.printsReal
sys.stderr.write(' {:14,d}\n'.format( vcfResult.prints ) )
sys.stderr.write('\nLast {:14,d}\n'.format( vcfResult.prints ) )
sys.stderr.flush()
......@@ -329,9 +336,16 @@ class vcfResult(object):
self.printprogress('h', key='Heterozygous Dest', skip=vcfResult.print_every)
#return ""
continue
if ( vcfResult.excludHOMO ) and ( set(sourc.split(',')) == set(desti.split(',')) ):
vcfResult.simpliStats['Homozygous'] += 1
#print "heretozygous: %s" % desti
self.printprogress('o', key='Homozygous', skip=vcfResult.print_every)
#return ""
continue
if ( vcfResult.excludeINDEL ):
if desti.find('|') != -1: # is heterozygous
if desti.find(',') != -1: # is heterozygous
destis = desti.split('|')
for alt in destis:
if len(alt) > 1:
......@@ -408,6 +422,7 @@ class vcfResult(object):
ntspps = []
for dst in dsts:
ntspps.extend( dsts[ dst ] )
nt = len(set(ntspps))
nw = len(dsts)
......@@ -488,20 +503,21 @@ class vcfFile(object):
Reads a VCF file and keeps the positions until NEXT is called.
Used to facilitate the parallel reading of VCF files where all files have to be in the same coordinate.
"""
def __init__(self, infile, filedesc, filecare):
ignores = ['0|0', '0/0', './.'] # reference, nocov
def __init__(self, infile, filedesc, filecare, fileCol):
self.infile = infile
self.filedesc = filedesc
self.filecare = filecare
self.fileCol = fileCol
infile = checkfile(infile)
fhd = openvcffile(infile, 'r')
checkfile(infile)
#print " opening %s" % infile
self.names = []
self.infhd = fhd
self.state = FHDOPEN
self.currLine = ""
self.register = vcfRegister()
self.names = []
self.infhd = openvcffile(infile, 'r')
self.state = FHDOPEN
self.currLine = ""
self.register = vcfRegister()
self.register['filename'] = infile
self.register['filedesc'] = filedesc
self.register['filecare'] = filecare
......@@ -510,28 +526,46 @@ class vcfFile(object):
"""
Requests the next line in the VCF file as a register
"""
if self.state == FHDOPEN:
while self.state == FHDOPEN:
self.register['chrom'] = None
self.register['pos' ] = None
self.register['src' ] = None
self.register['dst' ] = None
self.register['desc' ] = None
self.register['state'] = self.state
currLine = ""
again = False
while currLine is not None and len(currLine) == 0:
currLine = self.infhd.readline()
if len( currLine ) == 0 or currLine[-1] != "\n": # EOF
currLine = None
currLine = None
again = True
self.state = FHDJUSTCLOSED
return
print 'f',
break
elif len(currLine) == 1: # EMPTY LINE
self.next()
return
again = True
print 'e',
break
else: #normal line
pass
if again:
print 'a',
continue
#0 1 2 3 4 5 6 7 8 9
#SL2.40ch01 2118853 . T G 222 . DP=40;AF1=1;CI95=1,1;DP4=0,0,16,23;MQ=60;FQ=-144 GT:PL:DP:GQ 1/1:255,117,0:39:99
currLine = currLine.strip()
cols = currLine.split("\t")
if len(cols) == 0:
self.next()
return
print 'n',
continue
elif currLine[0] == "#":
if currLine.startswith('##sources='):
......@@ -544,77 +578,94 @@ class vcfFile(object):
print "error parsing sources"
print "num sources", numsources,"!=",len(self.names),sorted(self.names)
sys.exit(1)
else:
print "num sources:",numsources
self.next()
return
try: