Commit 37c9743a authored by Jasper Koehorst
Browse files

reorganisation

parent d87894ff
hdt/
genome
xml
*.gff
MGnifyParser/build
MGnifyParser/gradle
GPs/pygenprop/__pycache__
MGnifyParser/src/test/resources/output
MGnifyParser/.gradle
MGnifyParser/.idea
.DS_Store
java/MGnifyParser/.gradle
GPs/__pycache__
GPs/pygenprop/__pycache__
GPs/pygenprop/dump/__pycache__
GPs/pygenprop/dump
hdt
python/gps
python/prot_tsv
java/MGnifyParser/.gradle
......@@ -49,10 +49,11 @@ def main(hdt_file):
get_genome_properties()
prot_tsv_file = export_tsv(hdt_file)
genome_properties(prot_tsv_file)
print(prot_tsv_file)
def export_tsv(hdt_file):
print("Exporting", hdt_file, "to tsv")
# Calling this function optimizes the RDFlib SPARQL engine for HDT documents
optimize_sparql()
......@@ -76,6 +77,7 @@ def export_tsv(hdt_file):
sample_name = f"{sampleRow.samplename}"
outFileName = 'prot_tsv/' + sample_name + ".tsv"
# If output file exists, skip this sample
if os.path.isfile(outFileName): continue
......@@ -153,9 +155,11 @@ def export_tsv(hdt_file):
def genome_properties(input_file):
print("Processing", input_file, "to genome properties")
if not os.path.isdir("./gps/"): os.mkdir("./gps")
gp_file = "./gps/" + input_file.split("/")[-1].split(".")[0] + ".gp.tsv"
if os.path.isfile(gp_file): return
with open(input_file) as interproscan_file:
final_results = {}
......@@ -187,10 +191,10 @@ def genome_properties(input_file):
partial = partial - yes
gp_results_dict = dict.fromkeys(yes, "YES")
gp_results_dict.update(dict.fromkeys(partial, "PARTIAL"))
print(gp_results_dict)
# print(gp_results_dict)
df = pd.DataFrame.from_dict(gp_results_dict, orient='index', columns=[input_file.split('.tsv')[0]])
print(df)
# print(df)
df.columns = [input_file.split("/")[-1].split(".")[0]]
content = df.to_csv(sep="\t")
......@@ -222,8 +226,7 @@ def parallel(hdt_files):
process.add(hdt_file)
# Parallel function
results = Parallel(n_jobs=num_cores)(delayed(main)(i) for i in process)
results = Parallel(n_jobs=num_cores)(delayed(main)(i) for i in sorted(process))
if __name__ == "__main__":
# If sys.argv[1] == a dir
......@@ -234,7 +237,42 @@ if __name__ == "__main__":
hdt_files = os.listdir(sys.argv[1])
for index, hdt_file in enumerate(hdt_files):
hdt_files[index] = sys.argv[1] + hdt_file
parallel(hdt_files)
# parallel(hdt_files)
# Collect the outputs
# Taxon lineage
mgnify_lineage = {}
for line in open("silva-lineage_to_genome/Silva138.1-Lineage_UHGG-mags_merged.tsv"):
lineage, mgygs = line.split("\t")
for mgyg in mgygs.split(","):
mgnify_lineage[mgyg.strip()] = lineage
print("mgnify_lineage", len(mgnify_lineage))
# Genome property into matrix
matrix = []
for file in os.listdir("./gps"):
lines = open("./gps/" + file).readlines()
for index, line in enumerate(lines):
if index == 0:
header = lines[index].strip()
if header in mgnify_lineage:
matrix.append([header, "taxonomy", mgnify_lineage[header]])
else:
print(header, " is unknown")
matrix.append([header, "taxonomy", "unknown"])
else:
key, value = line.strip().split()
matrix.append([header, key, value])
# Turn into a pandas DataFrame and pivot it into a matrix
df = pd.DataFrame(matrix, columns=['X','Y','Z'])
df = df.pivot(index='X', columns='Y', values='Z')
df.fillna('NO', inplace=True)
df.to_csv("matrix.tsv", sep="\t")
else:
# hdt_file = "../hdt/MGYG-HGUT-01159.interproscan.eggnog.hdt" # sys.argv[1]
main(sys.argv[1])
This diff is collapsed.
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment