Commit dc04c981 authored by Aflitos, Saulo Alves

added unicode support; fixed bug with species names containing slashes; fixed vcf_walk.py bug when calling cluster; changed the CSV separator to '|' to support XML-encoded unicode species names; allowed renaming of chromosomes in vcfmerger.py; added assertions to verify that the VCF and GFF contain the same chromosomes
parent ccd41803
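
Context for the separator change described above (an illustration, not part of the commit; the species name is made up): names encoded as XML numeric character references contain ';', so the old ';' and ',' separators tear the entities apart, while '|' never occurs in the encoding.

    name = u'Solanum \u00d7 lycopersicum'.encode('ascii', 'xmlcharrefreplace')
    print name              # Solanum &#215; lycopersicum
    print name.split(';')   # ['Solanum &#215', ' lycopersicum'] -- entity torn apart
    print name.split('|')   # ['Solanum &#215; lycopersicum']    -- intact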
@@ -847,14 +847,16 @@ def get_cluster(db_name, ref, chrom):
 @app.route("/api/data/<db_name>/<ref>/<chrom>", methods=['GET'])
 def get_data(db_name, ref, chrom):
-    print "get data"
+    print "get data", db_name, ref, chrom
     man = getManager( db_name )
     if man is None:
         print "no such manager"
         abort(404)
-    ref = urllib.unquote(ref)
+    ref = urllib.unquote(urllib.unquote(urllib.unquote(ref)))
+    print "get data", db_name, ref, chrom
     data = check_get_data(man, ref, chrom, request)
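
A standalone sketch (not part of the commit) of the decoding behaviour the tripled unquote relies on: urllib.unquote is a no-op on strings without '%' escapes, so stacking it tolerates species names that arrive single-, double- or triple-encoded.

    import urllib

    name   = 'Solanum sp./hybrid'                                # made-up species name
    once   = urllib.quote(name, safe='')                         # single-encoded
    thrice = urllib.quote(urllib.quote(once, safe=''), safe='')  # triple-encoded
    for s in (once, thrice):
        # both decode back to the original; extra unquote calls change nothing
        print urllib.unquote(urllib.unquote(urllib.unquote(s))) == name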
@@ -1063,8 +1065,9 @@ def get_clusters_raw( man ):
 def check_get_data(man, ref, chrom, request):
+    ref = unicode(ref)
     if ref not in man.getSpps():
-        print "no such species %s" % ref
+        print "no such species %s. possible are: %s" % (ref, ", ".join(man.getSpps()))
         abort(404)
     if chrom not in man.getChroms():
......
@@ -155,7 +155,7 @@ var mainController = function ( $scope, $http, mySharedService ) {
     .success(
         function(data) {
             console.log('spps %o', data);
-            $scope.species = data.spps;
+            $scope.species = list_to_utf(data.spps);
         }
     );
 };
@@ -281,9 +281,11 @@ var mainController = function ( $scope, $http, mySharedService ) {
     $scope.updateSizes();
-    console.log('REQUESTING DATA');
+    console.log('REQUESTING DATA', $scope.databaseQry, $scope.specieQry, $scope.chromosomeQry);
+    //console.log(encodeURIComponent($scope.specieQry), encodeURIComponent(encodeURIComponent($scope.specieQry)));
+    var q = encodeURIComponent(escape(encodeURIComponent(escape($scope.specieQry))));
-    $http.get('api/data/'+$scope.databaseQry+'/'+encodeURIComponent(encodeURIComponent($scope.specieQry))+'/'+$scope.chromosomeQry)
+    $http.get('api/data/'+$scope.databaseQry+'/'+q+'/'+$scope.chromosomeQry)
     .success( $scope.receiveData )
     .error( $scope.receiveDataError );
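
Why the species name is percent-encoded twice before entering the URL (a Python sketch, not from the repository; routing decodes the path once, so a '/' inside a name must still read '%2F' after that first decode to stay within one URL segment — the JavaScript escape() calls, which cover non-ASCII code points, are left out here):

    import urllib

    name  = 'Solanum sp./hybrid'          # made-up name containing a slash
    once  = urllib.quote(name, safe='')   # 'Solanum%20sp.%2Fhybrid'
    twice = urllib.quote(once, safe='')   # '%2F' becomes '%252F' and survives routing
    print urllib.unquote(urllib.unquote(twice)) == name   # True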
@@ -2588,7 +2590,16 @@ var mainController = function ( $scope, $http, mySharedService ) {
 };
+function numeric_to_unicode(s) {
+    return s.replace(/&#(\d+);/g, function (m, n) { return String.fromCharCode(n); });
+}
+function list_to_utf(lst) {
+    return lst;
+    var l = $.map( lst, numeric_to_unicode );
+    //console.log('list_to_utf', lst, l);
+    return l;
+}
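
numeric_to_unicode decodes XML numeric character references (e.g. '&#215;') back into characters. Note that list_to_utf as committed returns its input unchanged: the unconditional return lst; makes the $.map call below it unreachable. A Python 2 counterpart of the round trip (standalone sketch, made-up species name):

    import re

    encoded = u'Solanum \u00d7 lycopersicum'.encode('ascii', 'xmlcharrefreplace')
    print encoded        # Solanum &#215; lycopersicum
    decoded = re.sub(r'&#(\d+);', lambda m: unichr(int(m.group(1))), encoded)
    print repr(decoded)  # u'Solanum \xd7 lycopersicum'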
@@ -2818,7 +2829,7 @@ function loadReport(data) {
     var gene    = data['gene' ];
     var chrom   = data['chrom' ];
-    var spps    = data['spps' ];
+    var spps    = list_to_utf(data['spps' ]);
     var matrix  = data['LINE' ];
     var treeStr = data['TREE' ].newick;
......
@@ -34,11 +34,11 @@ def main(args):
     parser.add_argument('-t' , '--thr'   , '--threads'  , dest='threads'  , default=2        , nargs='?' , type=int, help='[optional] number of threads. [default: 5]')
     parser.add_argument('-e' , '--ext'   , '--extension', dest='extension', default='.matrix', nargs='?' , type=str, help='[optional] extension to search. [default: .matrix]')
-    parser.add_argument('-p' , '--nopng' , dest='dopng' , action='store_false', help='do not export png')
-    parser.add_argument('-s' , '--nosvg' , dest='dosvg' , action='store_false', help='do not export svg')
-    parser.add_argument('-n' , '--notree', dest='dotree', action='store_false', help='do not export tree. precludes no png and no svg')
-    parser.add_argument('-r' , '--norows', dest='dorows', action='store_false', help='do not export rows')
-    parser.add_argument('-c' , '--nocols', dest='docols', action='store_false', help='do not export cols')
+    parser.add_argument('-p' , '--nopng' , dest='dopng' , action='store_false', help='do not export png')
+    parser.add_argument('-s' , '--nosvg' , dest='dosvg' , action='store_false', help='do not export svg')
+    parser.add_argument('-n' , '--notree', dest='dotree', action='store_false', help='do not export tree. precludes no png and no svg')
+    parser.add_argument('-r' , '--norows', dest='dorows', action='store_false', help='do not export rows')
+    parser.add_argument('-c' , '--nocols', dest='docols', action='store_false', help='do not export cols')
     parser.add_argument('infiles' , nargs=argparse.REMAINDER, help='[optional] input files')
@@ -59,7 +59,7 @@ def main(args):
     numfiles = len(infiles)
-    print options
+    print "Cluster Options", options
     if numfiles == 0:
......
#!/usr/bin/python
# rename the species column of a tab-separated list file using a key/value
# mapping read from a CSV table
import os
import sys
import csv
import re
import unicodedata
from unidecode import unidecode


def main():
    try:
        inlst  = sys.argv[1]
        intbl  = sys.argv[2]
        tbl_k  = sys.argv[3]
        tbl_vs = sys.argv[4].split(',')
    except:
        print "<inlist> <in table> <column key name> <column val names>"
        sys.exit(1)

    print "input list             %s" % inlst
    print "input table            %s" % intbl
    print "table column key name  %s" % tbl_k
    print "table column val names %s" % tbl_vs

    data      = {}
    atad      = {}   # reverse mapping, used to detect duplicated values
    col_names = None
    tbl_k_id  = None
    tbl_v_ids = None
    with open(intbl, 'rb') as fhd:
        reader = csv.reader(fhd, delimiter=',', quotechar='"')
        for ln, cols in enumerate(reader):
            #print "ln", ln, "cols", cols
            if len(cols) == 0:
                continue
            if ln == 0: # header
                print cols
                assert tbl_k in cols, "key %s not in header %s" % (tbl_k, ", ".join(cols))
                tbl_k_id  = cols.index(tbl_k)
                tbl_v_ids = []
                for tbl_v in tbl_vs:
                    assert tbl_v in cols, "value %s not in header %s" % (tbl_v, ", ".join(cols))
                    tbl_v_ids.append( cols.index(tbl_v) )
            else:
                k  = cols[tbl_k_id]
                vs = []
                for tbl_v_id in tbl_v_ids:
                    try:
                        c = cols[tbl_v_id]
                        if c not in vs:
                            # strip any earlier value embedded in c; drop c
                            # entirely if it is contained in an earlier value
                            for s in vs:
                                if s in c:
                                    c = c.replace(s, '')
                                if c in s:
                                    c = c.replace(c, '')
                            vs.append( c )
                    except:
                        print "id", tbl_v_id, "cols", ", ".join(cols), "len", len(cols)
                        sys.exit(1)
                #k  = unidecode(k)#unicodedata.normalize('NFKD', unidecode(k)).encode('ascii','ignore')
                #vs = [ unidecode(v) for v in vs ]
                v = "_".join(vs)
                k = sanitize(k, ' -.,:()=#&;')
                v = sanitize(v, ' -.,:()=#&;')
                assert k not in data, "key %s found more than once" % ( k )
                assert v not in atad, "val %s found more than once" % ( v )
                data[k] = v
                atad[v] = k

    print "data"
    for k, v in data.iteritems():
        print " %-7s %s" % (k, v)

    with open(inlst, 'rb') as fhdi:
        reader = csv.reader(fhdi, delimiter='\t', quotechar='"')
        with open(inlst + '.renamed.csv', 'wb') as fhdo:
            writer = csv.writer(fhdo, delimiter='\t', quotechar='"')
            for ln, cols in enumerate(reader):
                if len(cols) == 0:
                    continue
                if cols[0][0] == "#":
                    continue
                assert len(cols) == 3, "wrong number of columns: %d %d %s" % (len(cols), ln, str(cols))
                name = cols[2]
                assert name in data, "name %s not in db" % name
                cols[2] = data[name]
                writer.writerow(cols)


def sanitize(s, k, v="_"):
    # replace each character of k with v, squash runs of v, trim, then encode
    # non-ASCII characters as XML numeric character references
    for r in k:
        s = s.replace(r, v)
    s = re.sub(v+'+', v, s)
    s = s.strip(v)
    s = s.decode('utf8').encode('ascii', 'xmlcharrefreplace')
    return s


if __name__ == '__main__':
    main()
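
What the new script does, in brief, with a hypothetical invocation (file and column names below are placeholders, not from the repository):

    # python rename_list.py samples.lst species.csv ID Genus,Species
    #
    # reads species.csv (comma-separated, quoted), builds a sanitized
    # 'Genus_Species' value for each ID, then rewrites column 3 of the
    # tab-separated samples.lst into samples.lst.renamed.csv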
@@ -6,6 +6,8 @@ from sqlalchemy.orm import sessionmaker
 #from sqlalchemy import select, create_engine, Column, Integer, String, Float, ForeignKey, PickleType, LargeBinary, Sequence, and_
 from sqlalchemy import create_engine, Column, Integer, String, LargeBinary, and_
+from HTMLParser import HTMLParser
+
+htmlparser = HTMLParser()

 Base = declarative_base()
 chromNameLength = 1024
@@ -96,7 +98,7 @@ class spp_db(Base):
         return "<SPP('*%s', '%d')>" % ( self.spp, self.pos )

     def raw(self):
-        return { self.spp: self.pos }
+        return { htmlparser.unescape(self.spp): self.pos }

 class fasta_db(Base):
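
A standalone sketch of the unescape step introduced above (Python 2): HTMLParser.unescape turns the XML character references stored for species names back into unicode.

    from HTMLParser import HTMLParser

    htmlparser = HTMLParser()
    print repr(htmlparser.unescape('Solanum &#215; lycopersicum'))
    # u'Solanum \xd7 lycopersicum'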
......
@@ -131,7 +131,9 @@ def openvcffile(infile, method, **kwargs):
     """
     Open vcf file checking for extension and handling compression
     """
+    fhd = openfile(infile, method, **kwargs)
+
     if infile.endswith('.vcf.gz'):
         pass
@@ -141,6 +143,7 @@ def openvcffile(infile, method, **kwargs):
     else:
         print "unknown file type"
         sys.exit(1)
+
     return fhd
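
A self-contained sketch of the open-by-extension pattern (openfile is the repository's own helper, whose internals are not shown in this diff; gzip.open stands in for the compressed branch here):

    import gzip

    def open_by_extension(path, mode='r'):
        # dispatch on the file extension, mirroring openvcffile's structure
        if path.endswith('.vcf.gz'):
            return gzip.open(path, mode)   # transparent decompression
        elif path.endswith('.vcf'):
            return open(path, mode)
        else:
            raise ValueError('unknown file type: %s' % path)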
......
@@ -5,6 +5,7 @@ import sys
 import string

-ignores = ['0/0', './.']        # reference, nocov
+ignores = ['0|0', '0/0', './.'] # reference, nocov

 valid_chars = frozenset("_%s%s" % (string.ascii_letters, string.digits))

 def sanitize(name):
@@ -30,6 +31,7 @@ def main():
     defs     = []
     names    = []
     outfiles = []
+    lastCol  = ""
     num_cols = None
     with open(infile) as fhd:
         for line in fhd:
@@ -74,10 +76,16 @@ def main():
             continue

         #print "DATA", line
         cols = line.split("\t")
         assert len(cols) == num_cols
-        shared = cols[:9] #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
-        data   = cols[9:]
+        shared     = cols[:9] #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
+        shared_str = "\t".join(shared) + "\t"
+        data       = cols[9:]
+
+        if cols[0] != lastCol:
+            print cols[0]
+            lastCol = cols[0]

         #print "shared", shared
         #print "data"  , data
         for pos, ndata in enumerate(data):
@@ -86,7 +94,7 @@ def main():
                 outfiles[pos][3] += 1 # skipped
                 continue
             outfiles[pos][4] += 1 # valid
-            outfiles[pos][2].write("\t".join(shared) + "\t%s\n" % ndata)
+            outfiles[pos][2].write(shared_str + ndata + "\n") # shared_str already ends with a tab

     for nop, ndata in enumerate(outfiles):
         ndata[2].close()
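
The point of the cached prefix: the nine shared VCF columns are written once per sample column, so joining them a single time per input line saves a join per sample. A reduced sketch (made-up record; note shared_str already carries the trailing tab):

    shared     = ['chr1', '100', '.', 'A', 'T', '.', 'PASS', '.', 'GT']  # made-up VCF record
    data       = ['0/1', './.', '1/1']                                   # per-sample genotypes
    shared_str = "\t".join(shared) + "\t"
    for ndata in data:
        line = shared_str + ndata + "\n"   # one concatenation per sample, no re-join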
......
@@ -776,9 +776,9 @@ class walker(object):
         if not cluster_dotree:
             clusterParams.append( '--notree' )
         if not cluster_dorows:
-            clusterParams.append( ' --norows' )
+            clusterParams.append( '--norows' )
         if not cluster_docols:
-            clusterParams.append( ' --nocols' )
+            clusterParams.append( '--nocols' )
         if cluster_extension:
             clusterParams.extend( ['--extension', cluster_extension] )
         if cluster_threads:
@@ -933,6 +933,9 @@ class walker(object):
         cparams = [ '--output', cluster_name, '--indir', chromPath ]
         cparams.extend( clusterParams )
+
+        print "walking chromosome: %s :: cluster : %s . CREATING : PARAMS %s" % (chromosome, cluster_name, str(cparams))
         cluster.main( cparams )
+        print "walking chromosome: %s :: cluster : %s . CREATED" % (chromosome, cluster_name)
@@ -955,6 +958,9 @@ class walker(object):
         cparams = [ '--output', cluster_name, '--indir', self.db_name ]
         cparams.extend( clusterParams )
+
+        print "walking chromosome: %s :: cluster : %s . CREATING : PARAMS %s" % ('__global__', cluster_name, str(cparams))
         cluster.main( cparams )
+        print "walking chromosome: %s :: cluster : %s . CREATED" % ('__global__', cluster_name)
@@ -1380,7 +1386,7 @@ def process_matrix( chromosome, bn, chromPath, matrix_name ):
     with open( coord_path, 'r' ) as fhd:
         lines     = fhd.read().split("\n")
         coordsNum = int(lines[0])
-        coords    = [ int(x) for x in lines[1].split(',') ]
+        coords    = [ int(x) for x in lines[1].split('|') ]
         if coordsNum != len(coords):
             print "length of coords %s differs from number of coords %d" % (coordsNum, len(coords))
......
@@ -240,7 +240,7 @@ def readSources(config):
         if line[0] == '#':
             header += line + "\n"
             if line.startswith('##sources='):
-                names = line[10:].split(';')
+                names = line[10:].split('|')
                 for name in names:
                     sources[name] = []
                 #pp(sources)
@@ -334,7 +334,7 @@ def readData(config):
         posi = int(cols[1])
         src  = cols[3]
         dst  = cols[4]
-        spps = cols[9].split(",")
+        spps = cols[9].split("|")
@@ -1032,7 +1032,7 @@ def printfilecoords(config, chro):
     with open(outfile, 'w') as fhd:
         fhd.write( str( len( coords ) ) )
         fhd.write( "\n" )
-        fhd.write( ",".join( [ str(x) for x in sorted(coords) ] ) )
+        fhd.write( "|".join( [ str(x) for x in sorted(coords) ] ) )
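
Round trip of the coords file format after the separator change (a sketch with made-up coordinates): the first line holds the count, the second the '|'-joined sorted positions, matching the reader in process_matrix above.

    coords  = [973, 42, 150]   # made-up coordinates
    payload = "%d\n%s" % (len(coords), "|".join(str(x) for x in sorted(coords)))
    lines   = payload.split("\n")
    parsed  = [int(x) for x in lines[1].split('|')]
    assert int(lines[0]) == len(parsed) and parsed == sorted(coords)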
......
@@ -80,7 +80,9 @@ def main(args):
     config['negative' ] = options.negative
     config['knife'    ] = options.knife

     verbose = options.verbose

     if not config['outfolder']:
         config['outfolder'] = 'walk_out'
@@ -114,31 +116,42 @@ def main(args):
     config['idx'] = filemanager.readIndex(indexFile)

     if config['ingff'] is not None:
         """
         Creates a gff reader instance
         """
         config['ingffreader'] = gffReader( config['ingff'], negative=config['negative'], protein=config['protein'], verbose=verbose )
+        print "IDX", config['idx']
+        print "GFF", config['ingffreader'].index
+        assert set(config['idx'].keys()) >= set(config['ingffreader'].index.keys()), "VCF chromosomes (%s) do not include all GFF chromosomes (%s)" % (", ".join(config['idx'].keys()), ", ".join(config['ingffreader'].index.keys()))

     if config['inchr'] is not None:
         """
         If a specific chromosome has been requested, get the file position of the chromosome
         """
         config['insekpos'] = config['idx'][config['inchr']]

     """
     Creates a VCFCONCAT instance
     """
     vcfconcat.readSources(config)

     """
     Read the data and filter
     """
     readData(config, verbose=verbose)

     with open(config['outfolder']+'.ok', 'w') as fhd:
         fhd.write(str(config))

     return config
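
The new assertion, reduced to its core (chromosome names below are hypothetical): every chromosome annotated in the GFF must also appear in the VCF index, i.e. the GFF chromosome set must be a subset of the VCF's.

    vcf_chroms = {'SL2.40ch01': 0, 'SL2.40ch02': 183011}   # hypothetical VCF index
    gff_chroms = {'SL2.40ch01': []}                        # hypothetical GFF index
    assert set(vcf_chroms.keys()) >= set(gff_chroms.keys()), \
        "GFF chromosomes missing from the VCF: %s" % ", ".join(set(gff_chroms) - set(vcf_chroms))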
@@ -149,12 +162,14 @@ def readData(config, verbose=False):
     print "reading data"

     """
     Creates a VCFMERGER instance to read the VCF file
     """
     config['infhd'] = filemanager.openvcffile(config['infile'], 'r')

     if config['outfile'] is None:
         """
         Creates output file filename if not defined
@@ -177,6 +192,8 @@ def readData(config, verbose=False):
     else:
         config['outfn'] = config['outfile']

     """
     Creates a VCFMERGER to save the output
     """
@@ -197,6 +214,8 @@ def readData(config, verbose=False):
     """
     print "reading data :: has idx :: seeking chrom %s" % config['inchr']
+    assert config['inchr'] in config['idx'], "requested chromosome %s not in vcf file: %s" % ( config['inchr'], config['idx'].keys() )
+
     config['infhd'].seek( config['insekpos'] )
     runName = config['inchr']
@@ -266,13 +285,16 @@ def readData(config, verbose=False):
             #print cols
             if lastChro != chro:
-                print chro
+                print "Chromosome", chro
+                assert chro in config['idx'], "chromosome %s not in vcf file: %s" % ( chro, config['idx'].keys() )

                 if lastChro is not None:
                     if inchr is not None:
                         if inchr != lastChro:
                             print "reading data :: %s :: %s :: skipping exporting" % (runName, lastChro)
                             lastChro = chro
                             continue
                         else:
                             finalChro = True
@@ -300,7 +322,7 @@ def readData(config, verbose=False):
         if ( inchr is not None ):
             """
-            If a specific chromosome mas requested
+            If a specific chromosome was requested
             """
             if ( inchr != chro ):
@@ -474,11 +496,15 @@ def readData(config, verbose=False):
     keys = sorted(list(set(valids.keys() + numSnps.keys())))
     for chro in keys:
         vals = 0
         if chro in valids:
             vals = valids[chro]
         tot = 0
         if chro in numSnps:
             tot = numSnps[chro]
         print " CHROM %s TOTAL %12d VALID %12d" % ( chro, tot, vals )
     return 0
......
@@ -153,14 +153,14 @@ class vcfResult(object):
     print_every = 1000
     linelen     = 100

-    def __init__(self, simplify=SIMP_NO_SIMPLIFICATION, noncarefiles=[]):
+    def __init__(self, simplify=SIMP_NO_SIMPLIFICATION, noncarefiles=[], translation=None):
         self.result      = None
         self.chom        = None
         self.pos         = None
         self.chrCount    = None
         self.posCount    = None
         self.simplify    = simplify
+        self.translation = translation

         if vcfResult.simpliStats is None:
             vcfResult.simpliStats = {
@@ -183,6 +183,7 @@ class vcfResult(object):
         vcfResult.excludHET        = vcfResult.simplifybits[ int( math.log( SIMP_EXCL_HETEROZYGOUS, 2 ) ) ] # exclude heterozygous
         vcfResult.excludeINDEL     = vcfResult.simplifybits[ int( math.log( SIMP_EXCL_INDEL       , 2 ) ) ] # exclude indels (len > 1)
         vcfResult.excludeSingleton = vcfResult.simplifybits[ int( math.log( SIMP_EXCL_SINGLETON   , 2 ) ) ] # exclude single species SNPs
+
         print "simplifying :: SNP          [%3d, %3d] %s" % ( SIMP_SNP              , math.log(SIMP_SNP              , 2 ), str(vcfResult.simplifySNP ) )
         print "simplifying :: Heterozygous [%3d, %3d] %s" % ( SIMP_EXCL_HETEROZYGOUS, math.log(SIMP_EXCL_HETEROZYGOUS, 2 ), str(vcfResult.excludHET   ) )
         print "simplifying :: Indel        [%3d, %3d] %s" % ( SIMP_EXCL_INDEL       , math.log(SIMP_EXCL_INDEL       , 2 ), str(vcfResult.excludeINDEL) )
@@ -228,8 +229,8 @@ class vcfResult(object):
         for dst in dsts:
             files = dsts[ dst ]
-            if dst.find(',') != -1:
-                for dstl in dst.split(','):
+            if dst.find('|') != -1:
+                for dstl in dst.split('|'):
                     if dstl not in simplsrc: simplsrc[dstl] = []
                     simplsrc[dstl].extend(files)
             else:
@@ -330,8 +331,8 @@ class vcfResult(object):
                 continue

             if ( vcfResult.excludeINDEL ):
-                if desti.find(',') != -1: # is heterozygous
-                    destis = desti.split(',')
+                if desti.find('|') != -1: # is heterozygous
+                    destis = desti.split('|')
                     for alt in destis:
                         if len(alt) > 1:
                             vcfResult.simpliStats['Heterozygous Indel'] += 1
@@ -367,7 +368,7 @@ class vcfResult(object):
             files = dsts[ desti ]

             if vcfResult.simplifySNP:
-                files.extend( descr.split(',') )
+                files.extend( descr.split('|') )
             else:
                 if filedesc in files:
@@ -421,11 +422,16 @@ class vcfResult(object):
                 self.printprogress('2', key='Singleton 2', skip=vcfResult.print_every)
                 continue

-            files_str = ",".join( files )
+            files_str = "|".join( files )
             info_str  = "NV=%d;NW=%d;NS=%d;NT=%d;NU=%d" % ( nv, nw, ns, nt, nu )

             # chr pos id ref alt qual filter info FORMAT filenames
-            restr += "\t".join([ chrom, str(posit), '.', src, dst, '.', 'PASS', info_str, 'FI', files_str]) + "\n"
+            chrom_name = chrom
+            if self.translation:
+                if chrom in self.translation:
+                    chrom_name = self.translation[ chrom_name ]
+            restr += "\t".join([ chrom_name, str(posit), '.', src, dst, '.', 'PASS', info_str, 'FI', files_str]) + "\n"

         if len(restr) == 0:
             vcfResult.simpliStats['Singleton 3'] += 1
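
How the translation hook renames chromosomes on output, reduced to a sketch (the mapping is a made-up example; the committed code falls back to the original name when no translation applies):

    translation = {'SL2.40ch01': 'chr01'}   # hypothetical rename table
    chrom       = 'SL2.40ch01'
    chrom_name  = translation.get(chrom, chrom) if translation else chrom
    print chrom_name                        # chr01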
@@ -529,7 +535,7 @@ class vcfFile(object):
             elif currLine[0] == "#":
                 if currLine.startswith('##sources='):
-                    self.names = currLine[10:].split(';')
+                    self.names = currLine[10:].split('|')
                     pp(self.names)
                 elif currLine.startswith('##numsources='):
@@ -589,7 +595,7 @@ class vcfFile(object):
                 #print "  gt2", gtinfo[-1]
                 #print "  adding src to dst", self.register['src' ], self.register['dst' ]
-                self.register['dst' ] = ",".join(sorted(list(set([self.register['src' ]] + self.register['dst' ].split(",")))))
+                self.register['dst' ] = "|".join(sorted(list(set([self.register['src' ]] + self.register['dst' ].split("|")))))
                 #print "  added src to dst", self.register['dst' ]
             else:
@@ -639,7 +645,11 @@ class vcfHeap(object):
     Maintain the latest line in all VCF files at the same time
     """
-    def __init__(self, simplify=SIMP_NO_SIMPLIFICATION):
+    def __init__(self, simplify=SIMP_NO_SIMPLIFICATION, translation=None):
+        self.translation = translation
+        print "VCF Heap Translation", translation
+
         self.fileNames  = [] #vecString   fileNames;
         self.filehandle = [] #vecIfstream filehandle;
         self.fileStates = [] #vecBool     fileStates;
@@ -657,7 +667,7 @@ class vcfHeap(object):
         }

         self.simplify     = simplify
         self.noncarefiles = []
-        self.currResult   = vcfResult( simplify=self.simplify, noncarefiles=self.noncarefiles )
+        self.currResult   = vcfResult( simplify=self.simplify, noncarefiles=self.noncarefiles, translation=translation )
         self.currResult   = None

         print self.ctime
@@ -765,7 +775,7 @@ class vcfHeap(object):
     """