Commit 57ae99d6 authored by Jasper Koehorst
Browse files

working on import functionality

parent a0b006cb
......@@ -8,93 +8,101 @@ import os, subprocess, sys, re
from rdflib import Graph
from rdflib_hdt import HDTStore, optimize_sparql
# Calling this function optimizes the RDFlib SPARQL engine for HDT documents
optimize_sparql()
def main(hdtfile):
    """Collect protein signature hits for every sample stored in an HDT file.

    The HDT file is opened through ``HDTStore`` and queried with SPARQL.
    First every sample that has a ``gbol:name`` is listed; then, for each
    sample, one query walks gene -> transcript -> CDS -> protein -> feature
    and selects the positions, signature accession and description,
    database id and e-value of each hit.

    The ``prot_tsv`` directory is created here if it does not exist yet.

    Args:
        hdtfile: Path of the HDT file to query.

    Returns:
        A dict mapping each sample name to a sorted list of tuples
        ``(samplename, gene, strand, gene_begin, gene_end, cds_begin,
        cds_end, feature_begin, feature_end, accession,
        signature_description, db_name, evalue)``. Positions are ``int``
        and the e-value is ``float``.

    Raises:
        ValueError: If a position or e-value cannot be converted to a number.
    """
    # Calling this function optimizes the RDFlib SPARQL engine for HDT documents
    optimize_sparql()

    # Use the argument that was passed in rather than re-reading sys.argv.
    graph = Graph(store=HDTStore(hdtfile))

    samplesResults = graph.query("""
        PREFIX gbol: <http://gbol.life/0.1/>
        SELECT DISTINCT ?sample ?samplename
        WHERE {
        ?s gbol:sample ?sample .
        ?sample gbol:name ?samplename .
        }
        """)

    # No shell needed, and no error if the directory already exists.
    os.makedirs('prot_tsv', exist_ok=True)

    # Multiple samples possible
    sample_store = {}
    for sampleRow in samplesResults:
        samplename = str(sampleRow.samplename)
        print(samplename)

        interproResults = graph.query(_interpro_query(str(sampleRow.sample)))
        sample_store[samplename] = sorted(
            _interpro_record(samplename, interproRow)
            for interproRow in interproResults
        )
    return sample_store


def _interpro_query(sample_iri):
    """Return the SPARQL text that selects all signature hits of one sample."""
    return """
        PREFIX gbol: <http://gbol.life/0.1/>
        SELECT DISTINCT ?gene ?strand ?gbeginpos ?gendpos ?cdsbeginpos ?cdsendpos ?featurebeginpos ?featureendpos ?acc ?singnaturedes ?dbname ?evalue
        WHERE {
        ?s gbol:sample ?sample .
        VALUES ?sample {<""" + sample_iri + """>}
        ?s gbol:feature ?gene.
        ?gene gbol:location ?glocation .
        ?glocation gbol:begin ?gbegin .
        ?gbegin gbol:position ?gbeginpos .
        ?glocation gbol:end ?gend .
        ?gend gbol:position ?gendpos .
        ?glocation gbol:strand ?strand .
        ?gene gbol:transcript ?mrna .
        ?mrna gbol:feature ?cds .
        ?cds gbol:location ?cdslocation .
        ?cdslocation gbol:begin ?cdsbegin .
        ?cdsbegin gbol:position ?cdsbeginpos .
        ?cdslocation gbol:end ?cdsend .
        ?cdsend gbol:position ?cdsendpos .
        ?cds gbol:protein ?prot .
        ?prot gbol:feature ?feature .
        ?feature gbol:signature ?signature .
        ?signature gbol:accession ?acc .
        ?signature gbol:db ?db .
        ?db gbol:id ?dbname .
        ?feature gbol:signatureDesc ?singnaturedes .
        ?feature gbol:location ?featurelocation .
        ?featurelocation gbol:begin ?featurebegin .
        ?featurebegin gbol:position ?featurebeginpos .
        ?featurelocation gbol:end ?featureend .
        ?featureend gbol:position ?featureendpos .
        ?feature gbol:provenance ?provenance .
        ?provenance gbol:annotation ?annot .
        ?annot gbol:evalue ?evalue .
        }
        """


def _interpro_record(samplename, row):
    """Turn one result row of the signature query into a sortable tuple."""
    # Strip the ontology namespace so only the local strand name remains.
    strandname = str(row.strand).replace('http://gbol.life/0.1/', '')
    return (
        samplename,
        str(row.gene),
        strandname,
        int(str(row.gbeginpos)),
        int(str(row.gendpos)),
        int(str(row.cdsbeginpos)),
        int(str(row.cdsendpos)),
        int(str(row.featurebeginpos)),
        int(str(row.featureendpos)),
        str(row.acc),
        str(row.singnaturedes),
        str(row.dbname),
        float(str(row.evalue)),
    )
if __name__ == "__main__":
    # main() returns a dict of sample name -> sorted list of record tuples;
    # each sample is written to its own tab-separated file under prot_tsv/.
    sample_store = main(sys.argv[1])
    for sample_name, records in sample_store.items():
        outFileName = 'prot_tsv/' + sample_name + ".tsv"
        # The context manager closes the file even if a write fails.
        with open(outFileName, 'w') as outFile:
            for element in records:
                outFile.write('\t'.join(str(x) for x in element) + '\n')
......@@ -16,53 +16,57 @@ from pygenprop.results import GenomePropertiesResults
from pygenprop.database_file_parser import parse_genome_properties_flat_file
from pygenprop.assignment_file_parser import parse_interproscan_file, parse_genome_property_longform_file
genome_properties_database_url = 'https://raw.githubusercontent.com/ebi-pf-team/genome-properties/master/flatfiles/genomeProperties.txt'
with requests.Session() as current_download:
response = current_download.get(genome_properties_database_url, stream=True)
tree = parse_genome_properties_flat_file(StringIO(response.text))
if not os.path.exists('results_kmer/'):
subprocess.check_output('mkdir results_kmer', shell=True)
def main(window_size=20):
    """Compute genome-property results over sliding windows of signature hits.

    Downloads the genome properties flat file and parses it into a tree.
    Every file named in ``selected_file_list.txt`` is then read from
    ``prot_tsv/`` as a tab-separated table. For each file the per-window
    results are merged into one result per property, which is written to
    ``results_kmer/<name>.csv``.

    Args:
        window_size: Number of consecutive identifiers evaluated together.
            Defaults to 20, the value that was previously hard-coded.

    Raises:
        requests.HTTPError: If the flat file download returns an HTTP error.
        IndexError: If a row has fewer than ten columns.
    """
    genome_properties_database_url = 'https://raw.githubusercontent.com/ebi-pf-team/genome-properties/master/flatfiles/genomeProperties.txt'
    with requests.Session() as current_download:
        response = current_download.get(genome_properties_database_url, stream=True)
        # Fail loudly on an HTTP error instead of parsing an error page.
        response.raise_for_status()
        tree = parse_genome_properties_flat_file(StringIO(response.text))

    # Make sure the output directory exists before any result is written.
    os.makedirs('results_kmer', exist_ok=True)

    with open('selected_file_list.txt') as pathogenFile:
        for line in pathogenFile:
            fileName = line.strip()
            if not fileName:
                # A blank line would otherwise try to open the directory itself.
                continue
            print(fileName)

            # Used as the lookup key in the results and as the CSV column name.
            sample_label = fileName.split('.tsv')[0]
            with open('prot_tsv/' + fileName) as interproscan_file:
                final_results = _scan_windows(interproscan_file, sample_label, tree, window_size)

            outFileName = "results_kmer/" + fileName.replace(".tsv", ".csv")
            pd.DataFrame.from_dict(final_results, orient='index', columns=[sample_label]).to_csv(outFileName)


# Ordering of the property states; a higher rank replaces a lower one.
_RESULT_RANK = {'NO': 0, 'PARTIAL': 1, 'YES': 2}


def _scan_windows(interproscan_file, sample_label, tree, window_size):
    """Slide a window over one TSV file and return the merged property results."""
    final_results = {}
    identifiers = []
    strand = "ForwardStrandPosition"
    sample_name = splitext(basename(interproscan_file.name))[0]

    tsv_reader = csv.reader(interproscan_file, delimiter='\t')
    # Counting starts at 2 so the progress output matches the earlier
    # "start at 1, increment before the check" behaviour.
    for counter, row in enumerate(tsv_reader, start=2):
        if counter % 1000 == 0:
            print(counter)

        # Column 2 holds the strand; a window never spans a strand change.
        current_strand = row[2]
        if current_strand != strand:
            strand = current_strand
            identifiers = []

        # Column 9 holds the matched member-database identifier.
        identifiers.append(row[9])

        if len(identifiers) == window_size:
            assignment_cache = AssignmentCache(interpro_member_database_identifiers=identifiers,
                                               sample_name=sample_name)
            results = GenomePropertiesResults(assignment_cache, properties_tree=tree)
            gp_results_dict = results.property_results.to_dict()[sample_label]
            if not final_results:
                final_results = gp_results_dict
            else:
                _merge_property_results(final_results, gp_results_dict)
            # Drop the oldest identifier so the window advances by one row.
            identifiers.pop(0)
    return final_results


def _merge_property_results(final_results, new_results):
    """Upgrade entries of final_results in place: YES beats PARTIAL beats NO."""
    # Match entries by property key rather than by position, so the merge
    # does not depend on both dicts iterating in the same order.
    for name, new_state in new_results.items():
        old_state = final_results.get(name)
        if (new_state in _RESULT_RANK and old_state in _RESULT_RANK
                and _RESULT_RANK[new_state] > _RESULT_RANK[old_state]):
            final_results[name] = new_state


if __name__ == "__main__":
    main()
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment