Commit d838944b authored by Nijsse, Bart
Browse files

changed to rdflib and rdflib-hdt to read from HDT file directly instead of endpoint

parent 25ddb525
#!/usr/bin/env python3
"""
Author: Wasin
Author: Wasin Poncheewin, Bart Nijsse, Jasper Koehorst
Script to build protein tsv file with all info
"""
import os, subprocess, sys, re
from rdflib import Graph
from rdflib_hdt import HDTStore, optimize_sparql
from SPARQLWrapper import SPARQLWrapper, JSON
import re
import os
import subprocess
# Calling this function optimizes the RDFlib SPARQL engine for HDT documents
optimize_sparql()
sparql = SPARQLWrapper("http://nvme1.wurnet.nl:7201/repositories/pseudomonasDB2")
sparql.setQuery("""
hdtfile = sys.argv[1]
graph = Graph(store=HDTStore(sys.argv[1]))
samplesResults = graph.query("""
PREFIX gbol: <http://gbol.life/0.1/>
SELECT DISTINCT ?sample ?samplename
WHERE {
......@@ -19,76 +22,79 @@ sparql.setQuery("""
?sample gbol:name ?samplename .
}
""")
sparql.setReturnFormat(JSON)
results_sample = sparql.query().convert()
if not os.path.exists('prot_tsv/'):
subprocess.check_output('mkdir prot_tsv', shell=True)
for result_sample in results_sample["results"]["bindings"]:
print('%s' % (result_sample["samplename"]["value"]))
outFileName = 'prot_tsv/' + result_sample["samplename"]["value"] + ".tsv"
for sampleRow in samplesResults:
samplename = f"{sampleRow.samplename}"
print('%s' % samplename)
outFileName = 'prot_tsv/' + samplename + ".tsv"
outFile = open(outFileName, 'w')
sampleIRI = result_sample["sample"]["value"]
sparql.setQuery("""
PREFIX gbol: <http://gbol.life/0.1/>
SELECT DISTINCT ?gene ?strand ?gbeginpos ?gendpos ?cdsbeginpos ?cdsendpos ?featurebeginpos ?featureendpos ?acc ?singnaturedes ?dbname ?evalue
WHERE {
?s gbol:sample ?sample .
VALUES ?sample {<"""+sampleIRI+""">}
?s gbol:feature ?gene.
?gene gbol:location ?glocation .
?glocation gbol:begin ?gbegin .
?gbegin gbol:position ?gbeginpos .
?glocation gbol:end ?gend .
?gend gbol:position ?gendpos .
?glocation gbol:strand ?strand .
?gene gbol:transcript ?mrna .
?mrna gbol:feature ?cds .
?cds gbol:location ?cdslocation .
?cdslocation gbol:begin ?cdsbegin .
?cdsbegin gbol:position ?cdsbeginpos .
?cdslocation gbol:end ?cdsend .
?cdsend gbol:position ?cdsendpos .
?cds gbol:protein ?prot .
?prot gbol:feature ?feature .
?feature gbol:signature ?signature .
?signature gbol:accession ?acc .
?signature gbol:db ?db .
?db gbol:id ?dbname .
?feature gbol:signatureDesc ?singnaturedes .
?feature gbol:location ?featurelocation .
?featurelocation gbol:begin ?featurebegin .
?featurebegin gbol:position ?featurebeginpos .
?featurelocation gbol:end ?featureend .
?featureend gbol:position ?featureendpos .
?feature gbol:provenance ?provenance .
?provenance gbol:annotation ?annot .
?annot gbol:evalue ?evalue .
}
""")
sparql.setReturnFormat(JSON)
results_interpro = sparql.query().convert()
sampleIRI = (f"{sampleRow.sample}")
interproResults = graph.query("""
PREFIX gbol: <http://gbol.life/0.1/>
SELECT DISTINCT ?gene ?strand ?gbeginpos ?gendpos ?cdsbeginpos ?cdsendpos ?featurebeginpos ?featureendpos ?acc ?singnaturedes ?dbname ?evalue
WHERE {
?s gbol:sample ?sample .
VALUES ?sample {<""" + sampleIRI + """>}
?s gbol:feature ?gene.
?gene gbol:location ?glocation .
?glocation gbol:begin ?gbegin .
?gbegin gbol:position ?gbeginpos .
?glocation gbol:end ?gend .
?gend gbol:position ?gendpos .
?glocation gbol:strand ?strand .
?gene gbol:transcript ?mrna .
?mrna gbol:feature ?cds .
?cds gbol:location ?cdslocation .
?cdslocation gbol:begin ?cdsbegin .
?cdsbegin gbol:position ?cdsbeginpos .
?cdslocation gbol:end ?cdsend .
?cdsend gbol:position ?cdsendpos .
?cds gbol:protein ?prot .
?prot gbol:feature ?feature .
?feature gbol:signature ?signature .
?signature gbol:accession ?acc .
?signature gbol:db ?db .
?db gbol:id ?dbname .
?feature gbol:signatureDesc ?singnaturedes .
?feature gbol:location ?featurelocation .
?featurelocation gbol:begin ?featurebegin .
?featurebegin gbol:position ?featurebeginpos .
?featurelocation gbol:end ?featureend .
?featureend gbol:position ?featureendpos .
?feature gbol:provenance ?provenance .
?provenance gbol:annotation ?annot .
?annot gbol:evalue ?evalue .
}
""")
data = []
for result_interpro in results_interpro["results"]["bindings"]:
samplename = result_sample["samplename"]["value"]
gene = result_interpro["gene"]["value"]
for interproRow in interproResults:
gene = f"{interproRow.gene}"
genename = re.sub(r'gene/.*', '', gene)
strand = result_interpro["strand"]["value"]
strand = f"{interproRow.strand}"
strandname = re.sub(r'http://gbol\.life/0\.1/', '', strand)
gbeginpos = result_interpro["gbeginpos"]["value"]
gendpos = result_interpro["gendpos"]["value"]
cdsbeginpos = result_interpro["cdsbeginpos"]["value"]
cdsendpos = result_interpro["cdsendpos"]["value"]
featurebeginpos = result_interpro["featurebeginpos"]["value"]
featureendpos = result_interpro["featureendpos"]["value"]
acc = result_interpro["acc"]["value"]
singnaturedes = result_interpro["singnaturedes"]["value"]
dbname = result_interpro["dbname"]["value"]
evalue = result_interpro["evalue"]["value"]
data += [(samplename,genename,strandname,int(gbeginpos),int(gendpos),int(cdsbeginpos),int(cdsendpos),int(featurebeginpos),int(featureendpos),acc,singnaturedes,dbname,float(evalue))]
gbeginpos = f"{interproRow.gbeginpos}"
gendpos = f"{interproRow.gendpos}"
cdsbeginpos = f"{interproRow.cdsbeginpos}"
cdsendpos = f"{interproRow.cdsendpos}"
featurebeginpos = f"{interproRow.featurebeginpos}"
featureendpos = f"{interproRow.featureendpos}"
acc = f"{interproRow.acc}"
singnaturedes = f"{interproRow.singnaturedes}"
dbname = f"{interproRow.dbname}"
evalue = f"{interproRow.evalue}"
data += [(samplename, gene, strandname, int(gbeginpos), int(gendpos), int(cdsbeginpos), int(cdsendpos),
int(featurebeginpos), int(featureendpos), acc, singnaturedes, dbname, float(evalue))]
data.sort()
for element in data:
outFile.write('\t'.join(str(x) for x in element) + '\n')
outFile.close()
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment