Commit 004b0591 authored by Schoorlemmer, Joran
Browse files

Upload parsing py script for clustering

parent 3e1efcc9
#!/usr/bin/env python3
"""
Author: Joran Schoorlemmer
Student nr: 1004586
Description: Write log2fold and padj values from tsv files into two csv files
Usage: python3 extractdiffanalysis.py <listofsamplefiles.txt> \
<log2foldoutputfilename.txt> <padjoutputfilename.txt>
"""
# Import
from sys import argv
# Functions
def extractlogfold(csv_file):
    """Extract log2 fold changes from a tab-separated sample file.

    The first line of the file is treated as a header and skipped. For every
    remaining non-blank line, column 0 is used as the gene name and column 2
    as the log2 fold change.

    csv_file: str, name of input sample file (tab-separated)
    fold_dict: dict, dict with gene names as keys and l2f changes (float) as
        values; if a gene name occurs more than once, the last line wins

    Raises IndexError if a non-blank line has fewer than three columns and
    ValueError if column 2 cannot be parsed as a float.
    """
    fold_dict = {}  # init dict
    with open(csv_file, 'r') as logf:
        logf.readline()  # skip the header line
        for line in logf:
            # Remove only the line terminator: a full strip() would also eat
            # leading/trailing tabs and shift the column positions.
            line = line.rstrip('\r\n')
            if not line.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            line_info = line.split('\t')  # write tsv line to list
            gene_name = line_info[0]
            fold_dict[gene_name] = float(line_info[2])
    return fold_dict
def extractpadj(csv_file):
    """Extract adjusted p-values (padj) from a tab-separated sample file.

    The first line of the file is treated as a header and skipped. For every
    remaining non-blank line, column 0 is used as the gene name and column 6
    as the padj value.

    csv_file: str, name of input sample file (tab-separated)
    p_dict: dict, dict with gene names as keys and padj values (float) as
        values; if a gene name occurs more than once, the last line wins

    Raises IndexError if a non-blank line has fewer than seven columns and
    ValueError if column 6 cannot be parsed as a float.
    """
    # NOTE(review): a literal "NA" in the padj column is not accepted by
    # float() and raises ValueError -- confirm the inputs never contain it.
    p_dict = {}  # init dict
    with open(csv_file, 'r') as logf:
        logf.readline()  # skip the header line
        for line in logf:
            # Remove only the line terminator: a full strip() would also eat
            # leading/trailing tabs and shift the column positions.
            line = line.rstrip('\r\n')
            if not line.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            line_info = line.split('\t')  # write tsv line to list
            gene_name = line_info[0]
            p_dict[gene_name] = float(line_info[6].strip())
    return p_dict
def writecsv(sample_dict, out_file):
    """Write sample dict into a tab-separated table file.

    The header row is 'Gene' followed by the first four characters of each
    sample key. Each following row holds one gene name and its value in every
    sample, with 'NA' where a sample has no entry for that gene. Every row,
    including the header, ends with a trailing tab; this is kept so the
    output format stays the same for existing consumers.

    sample_dict: dict, dict with samples as keys and dicts with gene names as
        keys as values
    out_file: str, filename of output file
    """
    samples = list(sample_dict)
    # collect the union of gene names over all samples
    gene_names = set()
    for sample in samples:
        gene_names.update(sample_dict[sample])
    with open(out_file, 'w') as out_f:
        header = ['Gene'] + [str(sample[0:4]) for sample in samples]  # parse sample name
        out_f.write('\t'.join(header) + '\t\n')
        # Sort the genes so the row order is reproducible between runs; plain
        # set iteration order for strings can differ from run to run.
        for gene in sorted(gene_names, key=str):
            row = [str(gene)]
            for sample in samples:
                # if gene is not present in this sample, write NA
                row.append(str(sample_dict[sample].get(gene, 'NA')))
            out_f.write('\t'.join(row) + '\t\n')
# Main
def main(csv_names, out_file_fold, out_file_p):
    """Run functions and loop through sample files.

    Reads one sample file name per line from csv_names, extracts the log2
    fold changes and padj values of every sample file, and writes one
    combined table for each of the two measures.

    csv_names: str, filename of txt file with sample file names
    out_file_fold: str, filename of l2f output csv file
    out_file_p: str, filename of padj output csv file
    """
    # init dicts
    dict_all_fold = {}
    dict_all_padj = {}
    with open(csv_names, 'r') as f_names:
        # loop through files
        for fn in f_names:
            # Strip all surrounding whitespace (not just '\n') so a stray
            # '\r' or space does not end up in the file name.
            fn = fn.strip()
            if not fn:
                continue  # a blank line would otherwise become open('')
            dict_all_fold[fn] = extractlogfold(fn)
            dict_all_padj[fn] = extractpadj(fn)
    # write csv files
    writecsv(dict_all_fold, out_file_fold)
    writecsv(dict_all_padj, out_file_p)
# Run script
if __name__ == "__main__":
    # Fail with a usage message instead of a bare IndexError when fewer than
    # three arguments are given; extra arguments are ignored as before.
    if len(argv) < 4:
        raise SystemExit(
            "Usage: python3 extractdiffanalysis.py <listofsamplefiles.txt> "
            "<log2foldoutputfilename.txt> <padjoutputfilename.txt>"
        )
    main(argv[1], argv[2], argv[3])
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment