Commit dd6e6131 authored by Eendebak, Liset's avatar Eendebak, Liset
Browse files

Merge branch 'main' of git.wur.nl:araformatics-group/araformatics-project into main

parents 40a0c942 be6de690
......@@ -3,7 +3,7 @@
Author: Joran Schoorlemmer
Student nr: 1004586
Description: Extract counts from coverage data in gtf files and write to csv
Usage: python3 extractcounts.py <reference.gtf> <listofsamplegtffiles.txt>
Usage: python3 extractcounts.py <listofreffiles.txt> <listofsamplegtffiles.txt>
"""
......@@ -32,9 +32,10 @@ def parse_gtf(in_file, read_length = 100):
# turn into into int to do calculations
trans_len = abs(int(line_cont[4])-int(line_cont[3]))
gene_info = line_cont[8].split(' ')
gene_name = gene_info[5].lstrip('"').rstrip('";')
cov = float(gene_info[11].lstrip('"').rstrip('";'))
genes[gene_name] = round(cov * trans_len /read_length)
if len(gene_info) == 16:
gene_name = gene_info[9].lstrip('"').rstrip('";')
cov = float(gene_info[11].lstrip('"').rstrip('";'))
genes[gene_name] = round(cov * trans_len /read_length)
return genes
......@@ -50,7 +51,8 @@ def group_replicates(sample_names):
with open(sample_names) as f_names:
for f in f_names:
f = f.strip("\n")
rep_dict_key = f[:-5]
# strip redundant info out of string
rep_dict_key = f[9:-12]
if not rep_dict_key in rep_dict:
rep_dict[rep_dict_key] = {}
rep_dict[rep_dict_key][f] = parse_gtf('samples/'+ str(f))
......@@ -61,25 +63,30 @@ def group_replicates(sample_names):
def write_csv(ref_dict, group_dict, out_f):
"""write a tab delim csv file with genes and count values
ref_dict: dict, dict with gene name as keys and counts as values
ref_dict: dict, dict with replicate as keys and dict with gene names as
keys and counts as values as values
group_dict: dict, dict with replicate as keys and dict with gene names as
keys and counts as values as values
out_f: str, output file name in csv format
"""
with open(out_f, 'w') as f:
f.write("gene_name\treference\t")
f.write("gene_name\t")
# create set of gene names to make sure all rows are present and unique
gene_names = set(list(ref_dict.keys()))
gene_names = set()
for ref in ref_dict:
gene_names.update(list(ref_dict[ref].keys()))
f.write("ref_" + str(ref)+"\t")
for replicate in group_dict:
gene_names.update(list(group_dict[replicate].keys()))
f.write(str(replicate)+"\t")
f.write("\n")
for gene in gene_names:
f.write(str(gene)+"\t")
try:
f.write(str(ref_dict[gene])+'\t')
except: # if gene is not present in gtf, write 0 counts
f.write('0\t')
for ref in ref_dict:
try:
f.write(str(ref_dict[ref][gene])+'\t')
except: # if gene is not present in gtf, write 0 counts
f.write('0\t')
for replicate in group_dict:
try:
f.write(str(group_dict[replicate][gene])+'\t')
......@@ -91,20 +98,22 @@ def write_csv(ref_dict, group_dict, out_f):
def main(ref_name, sample_names):
"""Run functions and loop trough sample files
ref_name: str, filename of reference gtf file
ref_name: str, filename of txt file containing filenames of references
sample_names: str, filename of txt file containing filenames of samples
"""
ref = parse_gtf(ref_name)
ref_dict = group_replicates(ref_name)
ref_key = list(ref_dict.keys())
ref_dict = ref_dict[ref_key[0]]
print("parsed: reference")
dict_all = group_replicates(sample_names)
print("all gtf files parsed\n")
for group in dict_all: #loop trough
# write files to output folder counts/
out_file = 'counts/' + str(group) + 'counts.csv'
write_csv(ref, dict_all[group], out_file)
write_csv(ref_dict, dict_all[group], out_file)
print("csv written: "+ str(group))
# Run script
if __name__ == "__main__":
main(argv[1], argv[2])
\ No newline at end of file
main(argv[1], argv[2])
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment