Commit bf8bcf12 authored by Jasper Koehorst's avatar Jasper Koehorst
Browse files

restructured the code

parent 887d7415
genome
xml
*.gff
MGnifyParser/build
MGnifyParser/gradle
GPs/pygenprop/__pycache__
MGnifyParser/src/test/resources/output
MGnifyParser/.gradle
MGnifyParser/.idea
hdt/
.DS_Store
GPs/__pycache__
GPs/pygenprop/__pycache__
GPs/pygenprop/dump/__pycache__
GPs/pygenprop/dump
>>>>>>> e29354cfbd7fdfb2bf03a64260bf3422344b5e80
java/MGnifyParser/.gradle
python/gps
python/prot_tsv
This diff is collapsed.
#!/usr/bin/env python3
"""
Author: Wasin Poncheewin, Bart Nijsse, Jasper Koehorst
Script to build protein tsv file with all info
"""
import os, subprocess, sys, re
from rdflib import Graph
from rdflib_hdt import HDTStore, optimize_sparql
def main(hdtfile):
# Calling this function optimizes the RDFlib SPARQL engine for HDT documents
optimize_sparql()
graph = Graph(store=HDTStore(hdtfile))
samplesResults = graph.query("""
PREFIX gbol: <http://gbol.life/0.1/>
SELECT DISTINCT ?sample ?samplename
WHERE {
?s gbol:sample ?sample .
?sample gbol:name ?samplename .
}
""")
if not os.path.exists('prot_tsv/'):
os.makedirs('prot_tsv', exist_ok=True)
# subprocess.check_output('mkdir prot_tsv', shell=True)
# Multiple samples possible
sample_store = {}
for sampleRow in samplesResults:
samplename = f"{sampleRow.samplename}"
print('%s' % samplename)
sampleIRI = (f"{sampleRow.sample}")
interproResults = graph.query("""
PREFIX gbol: <http://gbol.life/0.1/>
SELECT DISTINCT ?gene ?strand ?gbeginpos ?gendpos ?cdsbeginpos ?cdsendpos ?featurebeginpos ?featureendpos ?acc ?singnaturedes ?dbname ?evalue
WHERE {
?s gbol:sample ?sample .
VALUES ?sample {<""" + sampleIRI + """>}
?s gbol:feature ?gene.
?gene gbol:location ?glocation .
?glocation gbol:begin ?gbegin .
?gbegin gbol:position ?gbeginpos .
?glocation gbol:end ?gend .
?gend gbol:position ?gendpos .
?glocation gbol:strand ?strand .
?gene gbol:transcript ?mrna .
?mrna gbol:feature ?cds .
?cds gbol:location ?cdslocation .
?cdslocation gbol:begin ?cdsbegin .
?cdsbegin gbol:position ?cdsbeginpos .
?cdslocation gbol:end ?cdsend .
?cdsend gbol:position ?cdsendpos .
?cds gbol:protein ?prot .
?prot gbol:feature ?feature .
?feature gbol:signature ?signature .
?signature gbol:accession ?acc .
?signature gbol:db ?db .
?db gbol:id ?dbname .
?feature gbol:signatureDesc ?singnaturedes .
?feature gbol:location ?featurelocation .
?featurelocation gbol:begin ?featurebegin .
?featurebegin gbol:position ?featurebeginpos .
?featurelocation gbol:end ?featureend .
?featureend gbol:position ?featureendpos .
?feature gbol:provenance ?provenance .
?provenance gbol:annotation ?annot .
?annot gbol:evalue ?evalue .
}
""")
data = []
for interproRow in interproResults:
gene = f"{interproRow.gene}"
genename = re.sub(r'gene/.*', '', gene)
strand = f"{interproRow.strand}"
strandname = re.sub(r'http://gbol\.life/0\.1/', '', strand)
gbeginpos = f"{interproRow.gbeginpos}"
gendpos = f"{interproRow.gendpos}"
cdsbeginpos = f"{interproRow.cdsbeginpos}"
cdsendpos = f"{interproRow.cdsendpos}"
featurebeginpos = f"{interproRow.featurebeginpos}"
featureendpos = f"{interproRow.featureendpos}"
acc = f"{interproRow.acc}"
singnaturedes = f"{interproRow.singnaturedes}"
dbname = f"{interproRow.dbname}"
evalue = f"{interproRow.evalue}"
data += [(samplename, gene, strandname, int(gbeginpos), int(gendpos), int(cdsbeginpos), int(cdsendpos),
int(featurebeginpos), int(featureendpos), acc, singnaturedes, dbname, float(evalue))]
data.sort()
sample_store[samplename] = data
return sample_store
if __name__ == "__main__":
sample_store = main(sys.argv[1])
for sample_name in sample_store:
outFileName = 'prot_tsv/' + sample_name + ".tsv"
outFile = open(outFileName, 'w')
for element in sample_store[sample_name]:
outFile.write('\t'.join(str(x) for x in element) + '\n')
outFile.close()
#!/usr/bin/env python3
"""
Author: Wasin
Script to build protein tsv file with out strand information
"""
import re, os, sys
from rdflib import Graph
from rdflib_hdt import HDTStore, optimize_sparql
def main(hdt_file):
# Calling this function optimizes the RDFlib SPARQL engine for HDT documents
optimize_sparql()
graph = Graph(store=HDTStore(hdt_file))
results_sample = graph.query("""
PREFIX gbol: <http://gbol.life/0.1/>
SELECT DISTINCT ?sample ?samplename
WHERE {
?s gbol:sample ?sample .
?sample gbol:name ?samplename .
}
""")
if not os.path.exists('prot_tsv_nostrand/'):
os.makedirs('mkdir prot_tsv_nostrand', exist_ok = True)
sample_store = {}
for result_sample in results_sample:
samplename = f"{result_sample.samplename}"
# outFileName = 'prot_tsv_nostrand/' + samplename + ".tsv"
# outFile = open(outFileName, 'w')
sample_iri = (f"{result_sample.sample}")
interpro_results = graph.query("""
PREFIX gbol: <http://gbol.life/0.1/>
SELECT DISTINCT ?gene ?strand ?gbeginpos ?gendpos ?cdsbeginpos ?cdsendpos ?featurebeginpos ?featureendpos ?acc ?singnaturedes ?dbname ?evalue
WHERE {
?s gbol:sample ?sample .
VALUES ?sample {<"""+sample_iri+""">}
?s gbol:feature ?gene.
?gene gbol:location ?glocation .
?glocation gbol:begin ?gbegin .
?gbegin gbol:position ?gbeginpos .
?glocation gbol:end ?gend .
?gend gbol:position ?gendpos .
?glocation gbol:strand ?strand .
?gene gbol:transcript ?mrna .
?mrna gbol:feature ?cds .
?cds gbol:location ?cdslocation .
?cdslocation gbol:begin ?cdsbegin .
?cdsbegin gbol:position ?cdsbeginpos .
?cdslocation gbol:end ?cdsend .
?cdsend gbol:position ?cdsendpos .
?cds gbol:protein ?prot .
?prot gbol:feature ?feature .
?feature gbol:signature ?signature .
?signature gbol:accession ?acc .
?signature gbol:db ?db .
?db gbol:id ?dbname .
?feature gbol:signatureDesc ?singnaturedes .
?feature gbol:location ?featurelocation .
?featurelocation gbol:begin ?featurebegin .
?featurebegin gbol:position ?featurebeginpos .
?featurelocation gbol:end ?featureend .
?featureend gbol:position ?featureendpos .
?feature gbol:provenance ?provenance .
?provenance gbol:annotation ?annot .
?annot gbol:evalue ?evalue .
}
""")
# sparql.setReturnFormat(JSON)
# results_interpro = sparql.query().convert()
data = []
for row in interpro_results:
# samplename = f"{row.samplename}"
gene = f"{row.gene}"
genename = re.sub(r'gene/.*', '', gene)
# strand = result_interpro["strand"]["value"]
# strandname = re.sub(r'http://gbol\.life/0\.1/', '', strand)
strandname = genename
# gbeginpos = result_interpro["gbeginpos"]["value"]
# gendpos = result_interpro["gendpos"]["value"]
# cdsbeginpos = result_interpro["cdsbeginpos"]["value"]
# cdsendpos = result_interpro["cdsendpos"]["value"]
# featurebeginpos = result_interpro["featurebeginpos"]["value"]
# featureendpos = result_interpro["featureendpos"]["value"]
# acc = result_interpro["acc"]["value"]
# singnaturedes = result_interpro["singnaturedes"]["value"]
# dbname = result_interpro["dbname"]["value"]
# evalue = result_interpro["evalue"]["value"]
gbeginpos = f"{row.gbeginpos}"
gendpos = f"{row.gendpos}"
cdsbeginpos = f"{row.cdsbeginpos}"
cdsendpos = f"{row.cdsendpos}"
featurebeginpos = f"{row.featurebeginpos}"
featureendpos = f"{row.featureendpos}"
acc = f"{row.acc}"
singnaturedes = f"{row.singnaturedes}"
dbname = f"{row.dbname}"
evalue = f"{row.evalue}"
data += [(samplename,genename,strandname,int(gbeginpos),int(gendpos),int(cdsbeginpos),int(cdsendpos),int(featurebeginpos),int(featureendpos),acc,singnaturedes,dbname,float(evalue))]
data.sort()
sample_store[samplename] = data
return sample_store
if __name__ == "__main__":
sample_store = main(sys.argv[1])
for sample_name in sample_store:
outFileName = 'prot_tsv/' + sample_name + ".tsv"
outFile = open(outFileName, 'w')
for element in sample_store[sample_name]:
outFile.write('\t'.join(str(x) for x in element) + '\n')
outFile.close()
#!/usr/bin/env python3
"""
Author: Wasin
Script to run modified PYGENPROP (original version)
"""
import os
import subprocess
from os import listdir
from os.path import isfile, join
import requests
from io import StringIO
from pygenprop.results import GenomePropertiesResults
from pygenprop.database_file_parser import parse_genome_properties_flat_file
from pygenprop.assignment_file_parser import parse_interproscan_file, parse_genome_property_longform_file
# The Genome Properties is a flat file database that can be fount on Github.
# The latest release of the database can be found at the following URL.
genome_properties_database_url = 'https://raw.githubusercontent.com/ebi-pf-team/genome-properties/master/flatfiles/genomeProperties.txt'
# For this tutorial we will stream the file directly into the Jupyter notebook. Alternativly,
# one could be downloaded the file with the unix wget or curl commands.
with requests.Session() as current_download:
response = current_download.get(genome_properties_database_url, stream=True)
tree = parse_genome_properties_flat_file(StringIO(response.text))
if not os.path.exists('results/'):
subprocess.check_output('mkdir results', shell=True)
onlyfiles = [f for f in listdir('prot_tsv/') if isfile(join('prot_tsv/', f)) and f.endswith(".tsv")]
for f in onlyfiles:
print (f)
with open('prot_tsv/'+f) as inputFile:
assignment = parse_interproscan_file(inputFile)
results = GenomePropertiesResults(assignment, properties_tree=tree)
outFileName = "results/" + f.replace(".tsv", ".csv")
results.property_results.to_csv(outFileName)
# Parse output file and add description?
\ No newline at end of file
#!/usr/bin/env python3
"""
Author: Wasin
Script to run modified PYGENPROP with strand information
"""
import os, sys
import csv
import pandas as pd
from os.path import basename, splitext
import requests
from io import StringIO
import sys
import pathlib
from pygenprop import assign
from pygenprop.assign import AssignmentCache
from pygenprop.results import GenomePropertiesResults
from pygenprop.database_file_parser import parse_genome_properties_flat_file
import logging
logging.basicConfig(format='%(name)s - %(levelname)s - %(message)s')
def main(input_file, kmer_size):
genome_properties_database_url = 'https://raw.githubusercontent.com/ebi-pf-team/genome-properties/master/flatfiles/genomeProperties.txt'
if not os.path.isfile('genomeProperties.txt'):
logging.info('Downloading the genome properties file')
req = requests.get(genome_properties_database_url)
#open method will open a file on your system and write the contents
with open('genomeProperties.txt', 'wb') as f:
f.write(req.content)
content = open('genomeProperties.txt').readlines()
tree = parse_genome_properties_flat_file(content)
with open(input_file) as interproscan_file:
final_results = {}
identifiers = []
strand = "ForwardStrandPosition"
i = 0
tsv_reader = csv.reader(interproscan_file, delimiter='\t')
for index, row in enumerate(tsv_reader):
if index % 100 == 0 and index > 0:
print('Processed ' + str(index) + ' entries')
current_strand = row[2]
if current_strand != strand:
strand = current_strand
identifiers = []
i = 0
matched_interpro_member_database_id = row[9]
identifiers.append(matched_interpro_member_database_id)
i += 1
if i == kmer_size:
assignment_cache = assign.AssignmentCache(interpro_signature_accessions=identifiers, sample_name=splitext(basename(interproscan_file.name))[0])
results = GenomePropertiesResults(assignment_cache, properties_tree=tree)
gp_results_dict = results.property_results.to_dict()[input_file.split('/')[-1].split('.tsv')[0]]
if not final_results:
final_results = gp_results_dict
else:
for r, f in zip(gp_results_dict.items(), final_results.items()):
if (r[1] == 'PARTIAL' and f[1] == 'NO') or (r[1] == 'YES' and f[1] == 'NO'):
final_results[r[0]] = r[1]
elif (r[1] == 'YES' and f[1] == 'PARTIAL'):
final_results[r[0]] = r[1]
identifiers.pop(0)
i -= 1
return pd.DataFrame.from_dict(final_results, orient='index', columns=None) # [input_file.split('.tsv')[0]]
if __name__ == "__main__":
# input_file = sys.argv[1]
input_file = "/Users/jasperk/gitlab/m-unlock/notebook/notebooks/data/mgnify/prot_tsv/MGYG-HGUT-00001.tsv"
pd_df = main(input_file, kmer_size=20)
print(pd_df)
\ No newline at end of file
#!/usr/bin/env python3
"""
Author: Wasin
Script to run modified PYGENPROP without strand information
"""
import os
import csv
import pandas as pd
from os.path import basename, splitext
import requests
from pygenprop import assign
from pygenprop.results import GenomePropertiesResults
from pygenprop.database_file_parser import parse_genome_properties_flat_file
from pygenprop.assignment_file_parser import parse_interproscan_file, parse_genome_property_longform_file
def main(input_file, kmer_size):
genome_properties_database_url = 'https://raw.githubusercontent.com/ebi-pf-team/genome-properties/master/flatfiles/genomeProperties.txt'
if not os.path.isfile('genomeProperties.txt'):
print('Downloading the genome properties file')
req = requests.get(genome_properties_database_url)
#open method will open a file on your system and write the contents
with open('genomeProperties.txt', 'wb') as f:
f.write(req.content)
print("Loading genome properties")
content = open('genomeProperties.txt').readlines()
tree = parse_genome_properties_flat_file(content)
with open(input_file) as interproscan_file:
final_results = {}
identifiers = []
strand = "x"
i = 0
tsv_reader = csv.reader(interproscan_file, delimiter='\t')
for index, row in enumerate(tsv_reader):
# if index % 100 == 0:
# print (index)
current_strand = row[2]
if current_strand != strand:
strand = current_strand
identifiers = []
i = 0
matched_interpro_member_database_id = row[9]
identifiers.append(matched_interpro_member_database_id)
i += 1
if i == kmer_size:
assignment_cache = assign.AssignmentCache(interpro_signature_accessions=identifiers, sample_name=splitext(basename(interproscan_file.name))[0])
results = GenomePropertiesResults(assignment_cache, properties_tree=tree)
gp_results_dict = results.property_results.to_dict()[input_file.split("/")[-1].split('.tsv')[0]]
if not final_results:
final_results = gp_results_dict
else:
for r, f in zip(gp_results_dict.items(), final_results.items()):
if (r[1] == 'PARTIAL' and f[1] == 'NO') or (r[1] == 'YES' and f[1] == 'NO'):
final_results[r[0]] = r[1]
elif (r[1] == 'YES' and f[1] == 'PARTIAL'):
final_results[r[0]] = r[1]
identifiers.pop(0)
i -= 1
return pd.DataFrame.from_dict(final_results, orient='index', columns=[input_file.split('.tsv')[0]])
if __name__ == "__main__":
# input_file = sys.argv[1]
input_file = "/Users/jasperk/gitlab/m-unlock/notebook/notebooks/data/mgnify/prot_tsv/MGYG-HGUT-00001.tsv"
pd_df = main(input_file, kmer_size=20)
print(pd_df)
\ No newline at end of file
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-6.5-bin.zip
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment