Commit 457db65e authored by Jasper Koehorst's avatar Jasper Koehorst
Browse files

sync

parent e29354cf
"""
Author: Jasper Koehorst
Script to build genome properties output from an HDT file
"""
import os
import re
import sys
from rdflib import Graph
from rdflib_hdt import optimize_sparql, HDTStore
def export_tsv(hdt_file):
# Calling this function optimizes the RDFlib SPARQL engine for HDT documents
optimize_sparql()
graph = Graph(store=HDTStore(hdt_file))
samplesResults = graph.query("""
PREFIX gbol: <http://gbol.life/0.1/>
SELECT DISTINCT ?sample ?samplename
WHERE {
?sample a gbol:Sample .
?sample gbol:name ?samplename .
}""")
if not os.path.exists('prot_tsv/'):
os.makedirs('prot_tsv', exist_ok=True)
# subprocess.check_output('mkdir prot_tsv', shell=True)
# Multiple samples possible
sample_store = {}
for sampleRow in samplesResults:
samplename = f"{sampleRow.samplename}"
print('%s' % samplename)
sampleIRI = (f"{sampleRow.sample}")
interproResults = graph.query("""
PREFIX gbol: <http://gbol.life/0.1/>
SELECT DISTINCT ?gene ?strand ?gbeginpos ?gendpos ?cdsbeginpos ?cdsendpos ?featurebeginpos ?featureendpos ?acc ?singnaturedes ?dbname ?evalue
WHERE {
?s gbol:sample ?sample .
VALUES ?sample {<""" + sampleIRI + """>}
?s gbol:feature ?gene.
?gene gbol:location ?glocation .
?glocation gbol:begin ?gbegin .
?gbegin gbol:position ?gbeginpos .
?glocation gbol:end ?gend .
?gend gbol:position ?gendpos .
?glocation gbol:strand ?strand .
?gene gbol:transcript ?mrna .
?mrna gbol:feature ?cds .
?cds gbol:location ?cdslocation .
?cdslocation gbol:begin ?cdsbegin .
?cdsbegin gbol:position ?cdsbeginpos .
?cdslocation gbol:end ?cdsend .
?cdsend gbol:position ?cdsendpos .
?cds gbol:protein ?prot .
?prot gbol:feature ?feature .
?feature gbol:signature ?signature .
?signature gbol:accession ?acc .
?signature gbol:db ?db .
?db gbol:id ?dbname .
?feature gbol:signatureDesc ?singnaturedes .
?feature gbol:location ?featurelocation .
?featurelocation gbol:begin ?featurebegin .
?featurebegin gbol:position ?featurebeginpos .
?featurelocation gbol:end ?featureend .
?featureend gbol:position ?featureendpos .
?feature gbol:provenance ?provenance .
?provenance gbol:annotation ?annot .
?annot gbol:evalue ?evalue .
}""")
data = []
for interproRow in interproResults:
gene = f"{interproRow.gene}"
genename = re.sub(r'gene/.*', '', gene)
strand = f"{interproRow.strand}"
strandname = re.sub(r'http://gbol\.life/0\.1/', '', strand)
gbeginpos = f"{interproRow.gbeginpos}"
gendpos = f"{interproRow.gendpos}"
cdsbeginpos = f"{interproRow.cdsbeginpos}"
cdsendpos = f"{interproRow.cdsendpos}"
featurebeginpos = f"{interproRow.featurebeginpos}"
featureendpos = f"{interproRow.featureendpos}"
acc = f"{interproRow.acc}"
singnaturedes = f"{interproRow.singnaturedes}"
dbname = f"{interproRow.dbname}"
evalue = f"{interproRow.evalue}"
data += [(samplename, gene, strandname, int(gbeginpos), int(gendpos), int(cdsbeginpos), int(cdsendpos),
int(featurebeginpos), int(featureendpos), acc, singnaturedes, dbname, float(evalue))]
data.sort()
sample_store[samplename] = data
for sample_name in sample_store:
outFileName = 'prot_tsv/' + sample_name + ".tsv"
outFile = open(outFileName, 'w')
for element in sample_store[sample_name]:
outFile.write('\t'.join(str(x) for x in element) + '\n')
outFile.close()
return outFileName
def main(hdt_file):
prot_tsv_file = export_tsv(hdt_file)
print(prot_tsv_file)
if __name__ == "__main__":
hdt_file = "sys.argv[1]
main(hdt_file)
......@@ -8,6 +8,7 @@ import os, subprocess, sys, re
from rdflib import Graph
from rdflib_hdt import HDTStore, optimize_sparql
def main(hdtfile):
# Calling this function optimizes the RDFlib SPARQL engine for HDT documents
optimize_sparql()
......@@ -24,7 +25,7 @@ def main(hdtfile):
""")
if not os.path.exists('prot_tsv/'):
os.makedirs('prot_tsv',exist_ok=True)
os.makedirs('prot_tsv', exist_ok=True)
# subprocess.check_output('mkdir prot_tsv', shell=True)
# Multiple samples possible
......@@ -91,11 +92,12 @@ def main(hdtfile):
dbname = f"{interproRow.dbname}"
evalue = f"{interproRow.evalue}"
data += [(samplename, gene, strandname, int(gbeginpos), int(gendpos), int(cdsbeginpos), int(cdsendpos),
int(featurebeginpos), int(featureendpos), acc, singnaturedes, dbname, float(evalue))]
int(featurebeginpos), int(featureendpos), acc, singnaturedes, dbname, float(evalue))]
data.sort()
sample_store[samplename] = data
return sample_store
if __name__ == "__main__":
sample_store = main(sys.argv[1])
for sample_name in sample_store:
......@@ -105,5 +107,3 @@ if __name__ == "__main__":
for element in sample_store[sample_name]:
outFile.write('\t'.join(str(x) for x in element) + '\n')
outFile.close()
......@@ -33,7 +33,6 @@ def main(hdt_file):
for result_sample in results_sample:
samplename = f"{result_sample.samplename}"
print(samplename)
# outFileName = 'prot_tsv_nostrand/' + samplename + ".tsv"
# outFile = open(outFileName, 'w')
......
......@@ -26,7 +26,8 @@ def main(input_file, kmer_size):
#open method will open a file on your system and write the contents
with open('genomeProperties.txt', 'wb') as f:
f.write(req.content)
print("Loading genome properties")
content = open('genomeProperties.txt').readlines()
tree = parse_genome_properties_flat_file(content)
......@@ -38,8 +39,8 @@ def main(input_file, kmer_size):
i = 0
tsv_reader = csv.reader(interproscan_file, delimiter='\t')
for index, row in enumerate(tsv_reader):
if index % 100 == 0:
print (index)
# if index % 100 == 0:
# print (index)
current_strand = row[2]
if current_strand != strand:
strand = current_strand
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment