Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
UNLOCK
MGnify
Commits
d838944b
Commit
d838944b
authored
Jun 15, 2021
by
Nijsse, Bart
Browse files
changed to rdflib and rdflib-hdt to read from HDT file directly instead of endpoint
parent
25ddb525
Changes
1
Hide whitespace changes
Inline
Side-by-side
GPs/query_export_tsv_sparql.py
View file @
d838944b
#!/usr/bin/env python3
"""
Author: Wasin
Author: Wasin
Poncheewin, Bart Nijsse, Jasper Koehorst
Script to build protein tsv file with all info
"""
import
os
,
subprocess
,
sys
,
re
from
rdflib
import
Graph
from
rdflib_hdt
import
HDTStore
,
optimize_sparql
from
SPARQLWrapper
import
SPARQLWrapper
,
JSON
import
re
import
os
import
subprocess
# Calling this function optimizes the RDFlib SPARQL engine for HDT documents
optimize_sparql
()
sparql
=
SPARQLWrapper
(
"http://nvme1.wurnet.nl:7201/repositories/pseudomonasDB2"
)
sparql
.
setQuery
(
"""
hdtfile
=
sys
.
argv
[
1
]
graph
=
Graph
(
store
=
HDTStore
(
sys
.
argv
[
1
]))
samplesResults
=
graph
.
query
(
"""
PREFIX gbol: <http://gbol.life/0.1/>
SELECT DISTINCT ?sample ?samplename
WHERE {
...
...
@@ -19,76 +22,79 @@ sparql.setQuery("""
?sample gbol:name ?samplename .
}
"""
)
sparql
.
setReturnFormat
(
JSON
)
results_sample
=
sparql
.
query
().
convert
()
if
not
os
.
path
.
exists
(
'prot_tsv/'
):
subprocess
.
check_output
(
'mkdir prot_tsv'
,
shell
=
True
)
for
result_sample
in
results_sample
[
"results"
][
"bindings"
]:
print
(
'%s'
%
(
result_sample
[
"samplename"
][
"value"
]))
outFileName
=
'prot_tsv/'
+
result_sample
[
"samplename"
][
"value"
]
+
".tsv"
for
sampleRow
in
samplesResults
:
samplename
=
f
"
{
sampleRow
.
samplename
}
"
print
(
'%s'
%
samplename
)
outFileName
=
'prot_tsv/'
+
samplename
+
".tsv"
outFile
=
open
(
outFileName
,
'w'
)
sampleIRI
=
result_
sample
[
"
sample
"
][
"value"
]
sparql
.
setQuery
(
"""
PREFIX gbol: <http://gbol.life/0.1/>
SELECT DISTINCT ?gene ?strand ?gbeginpos ?gendpos ?cdsbeginpos ?cdsendpos ?featurebeginpos ?featureendpos ?acc ?singnaturedes ?dbname ?evalue
WHERE {
?s gbol:sample ?sample .
VALUES ?sample {<"""
+
sampleIRI
+
""">}
?s gbol:feature ?gene.
?gene gbol:location ?glocation
.
?glocation gbol:begin ?gbegi
n .
?gbegin gbol:positio
n ?gbegin
pos
.
?glocation gbol:end ?gend
.
?gend gbol:position
?gend
pos
.
?glocation gbol:strand ?strand
.
?gene
gbol:tran
script ?mrna
.
?mrna gbol:feature ?cds
.
?cds gbol:location ?cdslocation
.
?cdslocation gbol:begin ?cdsbegi
n .
?cdsbegin gbol:positio
n ?cdsbegin
pos
.
?cdslocation gbol:end ?cdsend
.
?cdsend gbol:position
?cdsend
pos
.
?cds gbol:p
rotein ?prot
.
?prot gbol:feature ?feature
.
?feature
gbol:
sign
ature ?
sign
ature .
?sign
ature gbol:
accession ?acc
.
?signature gbol:
db ?db
.
?db
gbol:
i
d ?db
name
.
?feature gbol:signatureDesc ?singnaturedes
.
?feature gbol:
location ?featurelocation
.
?featurelocatio
n gbol:begi
n ?feature
begi
n .
?feature
begin gbol:positio
n ?featurebegin
pos
.
?feature
location gbol:end
?feature
end
.
?feature
end gbol:position
?featureend
pos
.
?feature gbol:p
rovenance ?provenance
.
?provenance gbol:annotation ?annot
.
?annot gbol:evalue ?evalue
.
}
"""
)
sparql
.
setReturnFormat
(
JSON
)
results_interpro
=
sparql
.
query
().
convert
()
sampleIRI
=
(
f
"
{
sample
Row
.
sample
}
"
)
interproResults
=
graph
.
query
(
"""
PREFIX gbol: <http://gbol.life/0.1/>
SELECT DISTINCT ?gene ?strand ?gbeginpos ?gendpos ?cdsbeginpos ?cdsendpos ?featurebeginpos ?featureendpos ?acc ?singnaturedes ?dbname ?evalue
WHERE {
?s gbol:sample ?sample .
VALUES ?sample {<"""
+
sampleIRI
+
""">}
?s gbol:feature ?gene
.
?gene gbol:location ?glocatio
n .
?glocation gbol:begi
n ?gbegin .
?gbegin gbol:position ?gbeginpos
.
?glocation gbol:end
?gend .
?gend gbol:position ?gendpos
.
?glocation
gbol:
s
tran
d ?strand
.
?gene gbol:transcript ?mrna
.
?mrna gbol:feature ?cds
.
?cds gbol:location ?cdslocatio
n .
?cdslocation gbol:begi
n ?cdsbegin .
?cdsbegin gbol:position ?cdsbeginpos
.
?cdslocation gbol:end
?cdsend .
?cds
end
gbol:p
osition ?cdsendpos
.
?cds gbol:protein ?prot
.
?prot
gbol:
fe
ature ?
fe
ature .
?fe
ature gbol:
signature ?signature
.
?signature gbol:
accession ?acc
.
?signature
gbol:d
b
?db .
?db gbol:id ?dbname
.
?feature gbol:
signatureDesc ?singnaturedes
.
?feature
gbol:
location ?feature
locatio
n .
?feature
location gbol:begi
n ?featurebegin .
?feature
begin gbol:position
?feature
beginpos
.
?feature
location gbol:end
?featureend .
?feature
end
gbol:p
osition ?featureendpos
.
?feature gbol:provenance ?provenance
.
?provenance gbol:annotation ?annot
.
?annot gbol:evalue ?evalue .
}
"""
)
data
=
[]
for
result_interpro
in
results_interpro
[
"results"
][
"bindings"
]:
samplename
=
result_sample
[
"samplename"
][
"value"
]
gene
=
result_interpro
[
"gene"
][
"value"
]
for
interproRow
in
interproResults
:
gene
=
f
"
{
interproRow
.
gene
}
"
genename
=
re
.
sub
(
r
'gene/.*'
,
''
,
gene
)
strand
=
result_
interpro
[
"
strand"
][
"value"
]
strand
=
f
"
{
interpro
Row
.
strand
}
"
strandname
=
re
.
sub
(
r
'http://gbol\.life/0\.1/'
,
''
,
strand
)
gbeginpos
=
result_interpro
[
"gbeginpos"
][
"value"
]
gendpos
=
result_interpro
[
"gendpos"
][
"value"
]
cdsbeginpos
=
result_interpro
[
"cdsbeginpos"
][
"value"
]
cdsendpos
=
result_interpro
[
"cdsendpos"
][
"value"
]
featurebeginpos
=
result_interpro
[
"featurebeginpos"
][
"value"
]
featureendpos
=
result_interpro
[
"featureendpos"
][
"value"
]
acc
=
result_interpro
[
"acc"
][
"value"
]
singnaturedes
=
result_interpro
[
"singnaturedes"
][
"value"
]
dbname
=
result_interpro
[
"dbname"
][
"value"
]
evalue
=
result_interpro
[
"evalue"
][
"value"
]
data
+=
[(
samplename
,
genename
,
strandname
,
int
(
gbeginpos
),
int
(
gendpos
),
int
(
cdsbeginpos
),
int
(
cdsendpos
),
int
(
featurebeginpos
),
int
(
featureendpos
),
acc
,
singnaturedes
,
dbname
,
float
(
evalue
))]
gbeginpos
=
f
"
{
interproRow
.
gbeginpos
}
"
gendpos
=
f
"
{
interproRow
.
gendpos
}
"
cdsbeginpos
=
f
"
{
interproRow
.
cdsbeginpos
}
"
cdsendpos
=
f
"
{
interproRow
.
cdsendpos
}
"
featurebeginpos
=
f
"
{
interproRow
.
featurebeginpos
}
"
featureendpos
=
f
"
{
interproRow
.
featureendpos
}
"
acc
=
f
"
{
interproRow
.
acc
}
"
singnaturedes
=
f
"
{
interproRow
.
singnaturedes
}
"
dbname
=
f
"
{
interproRow
.
dbname
}
"
evalue
=
f
"
{
interproRow
.
evalue
}
"
data
+=
[(
samplename
,
gene
,
strandname
,
int
(
gbeginpos
),
int
(
gendpos
),
int
(
cdsbeginpos
),
int
(
cdsendpos
),
int
(
featurebeginpos
),
int
(
featureendpos
),
acc
,
singnaturedes
,
dbname
,
float
(
evalue
))]
data
.
sort
()
for
element
in
data
:
outFile
.
write
(
'
\t
'
.
join
(
str
(
x
)
for
x
in
element
)
+
'
\n
'
)
outFile
.
close
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment