Commit dd3d67d5 authored by Patino Medina, Laura's avatar Patino Medina, Laura
Browse files

Upload New File

parent c150339a
#!/usr/bin/env python3
"""Script to parse multiple topGO GeneTable into one file for all samples
Author: Anan Hu & Laura Patino Medina
Usage: python3 <this_script.py> multiple_input.txt merge_output.txt
multiple_input.txt: file containing p values from DGE for all samples
merge_output.txt: file with GO enrichment p values (raw & adjusted)
for all samples.
"""
#import module
from sys import argv
import subprocess
import os
def parse_input(file_name):
    """Collect the sample names from a DGE table and the GO terms of the first sample.

    The sample names are the tab-separated fields after the first one on the
    first line of ``file_name`` that starts with "Gene". The GO table of the
    first sample is then read from ``<first sample name>.txt`` (relative to the
    current working directory); every line starting with "GO:" contributes its
    first field (GO identifier) and second field (description).

    file_name: tab-separated txt file with p values of genes per sample.
    Returns: tuple ``(go_terms, leaves_s, annotations)`` where ``go_terms`` is
        the list of GO identifiers in file order, ``leaves_s`` is the list of
        sample names and ``annotations`` maps GO identifier -> description.
    Raises: ValueError if no header line with at least one sample column is
        found in ``file_name``.
    """
    leaves_s = None
    # "with" guarantees the DGE file is closed, even on error.
    with open(file_name, "r") as dge_output:
        for line in dge_output:
            # Header-row marker (original note on this line: "adjust AGI").
            if line.startswith("Gene"):
                leaves_s = line.strip().split("\t")[1:]
                # Stop at the first header so a later data row that happens
                # to start with "Gene" cannot overwrite the sample names.
                break
    if not leaves_s:
        raise ValueError(
            "No 'Gene' header line with sample columns found in "
            "{}".format(file_name)
        )

    go_terms = []
    annotations = {}
    go_output = leaves_s[0] + ".txt"
    with open(go_output) as first_go:
        for line in first_go:
            if line.startswith("GO:"):
                fields = line.strip().split("\t")
                # Record the identifier as well as its description; without
                # this the returned GO term list would always be empty.
                go_terms.append(fields[0])
                annotations[fields[0]] = fields[1]
    return go_terms, leaves_s, annotations
def parse_gooutput(go_terms, leaf_names, annotations):
    """Extract the p values of every known GO term from each sample's GO table.

    For each name in ``leaf_names`` the file ``<name>.txt`` (relative to the
    current working directory) is read. On every line starting with "GO:" the
    tab-separated fields at index 5 and 7 are converted to floats (the file's
    own comments describe them as raw and adjusted p values) and appended to
    the entry of that GO identifier, if it is one of ``go_terms``.

    go_terms: list of GO identifiers to collect.
    leaf_names: list of sample names; defines the order of the value tuples.
    annotations: GO descriptions; accepted for interface compatibility but
        not used here.
    Returns: dictionary {go_id: [(raw_1, adj_1), (raw_2, adj_2), ...]}.
    """
    # One (initially empty) list of p-value pairs per requested GO term.
    all_samples = {go_id: [] for go_id in go_terms}
    for leaf in leaf_names:
        go_output = leaf + ".txt"
        with open(go_output) as curr_go:
            for line in curr_go:
                if line.startswith("GO:"):
                    fields = line.strip().split("\t")
                    go = fields[0]
                    p_vals = (float(fields[5]), float(fields[7]))
                    if go in all_samples:
                        # Store the pair; previously it was computed and
                        # then discarded, leaving every list empty.
                        all_samples[go].append(p_vals)
                    # NOTE(review): a GO term missing from one sample's file
                    # gets no placeholder, so its list ends up shorter than
                    # leaf_names - confirm whether that can happen.
    # The sizes differ only when go_terms contains duplicate identifiers.
    if len(all_samples) != len(go_terms):
        print("Parsing went wrong")
    return all_samples
def write_merge_table(all_samples, leaf_names, annotations, out_name):
    """Write one merged, tab-separated GO table for all samples.

    The first line is the header: "GO.ID", "Description" and then, for every
    sample, two columns named ``<sample>`` and ``<sample>BY``. Each following
    line holds one GO term: identifier, description and the p-value pairs of
    ``all_samples[go_id]`` flattened in list order.

    all_samples: dictionary {go_id: [(raw_1, adj_1), ...]} with the raw and
        adjusted p value per sample, in the same order as ``leaf_names``.
    leaf_names: list of leaf sample names.
    annotations: dictionary mapping GO identifier -> description.
    out_name: path of the txt file to create (overwritten if it exists).
    Returns: None; the table is written to ``out_name``.
    Raises: KeyError if a GO identifier has no entry in ``annotations``.
    """
    header_fields = ["GO.ID", "Description"]
    for leaf in leaf_names:
        header_fields.append(leaf)
        header_fields.append(leaf + "BY")

    rows = ["\t".join(header_fields)]
    for go_id, vals in all_samples.items():
        # vals is a list of pairs in the same order as leaf_names. Joining
        # the numbers directly avoids the stray spaces that munging
        # str(vals) with replace() left after every comma.
        fields = [go_id, annotations[go_id]]
        fields.extend(str(p_val) for pair in vals for p_val in pair)
        rows.append("\t".join(fields))

    with open(out_name, "w") as merge_file:
        # Actually emit the table (the header and rows were built but never
        # written before); every line is newline-terminated.
        merge_file.write("\n".join(rows) + "\n")
def main():
    """Run the merge: read the inputs, collect the p values, write the table.

    Reads two command-line arguments: ``argv[1]`` is the DGE input file passed
    to ``parse_input`` and ``argv[2]`` is the path of the merged output file
    passed to ``write_merge_table``.

    Raises: SystemExit with a usage message when the argument count is wrong.
    """
    if len(argv) != 3:
        raise SystemExit(
            "Usage: python3 {} multiple_input.txt merge_output.txt".format(argv[0])
        )
    # find go terms and sample names
    go_numbers, leaves, annotate = parse_input(argv[1])
    # extracting data from the different go tables
    desire_data = parse_gooutput(go_numbers, leaves, annotate)
    # write one table for all samples (with raw and FDR p val); this call was
    # missing, so no output file was ever produced
    write_merge_table(desire_data, leaves, annotate, argv[2])


if __name__ == "__main__":
    main()
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment