Commit 4fa1914c authored by Schoorlemmer, Joran
Browse files

Replace extractcounts.py with ref_txtfile functionality

parent ceefbe8d
......@@ -33,7 +33,7 @@ def parse_gtf(in_file, read_length = 100):
trans_len = abs(int(line_cont[4])-int(line_cont[3]))
gene_info = line_cont[8].split(' ')
if len(gene_info) == 16:
gene_name = gene_info[9].lstrip('"').rstrip('";')
gene_name = gene_info[5].lstrip('"').rstrip('";')
cov = float(gene_info[11].lstrip('"').rstrip('";'))
genes[gene_name] = round(cov * trans_len /read_length)
return genes
......@@ -52,8 +52,9 @@ def group_replicates(sample_names):
for f in f_names:
f = f.strip("\n")
# strip redundant info out of string
fn = f[:-4].split('-')
rep_dict_key = fn[0] + fn[1][0:3]+ fn[1][4:]
rep_dict_key = f[:-5]
#fn = f[:-4].split('-')
#rep_dict_key = fn[0] + fn[1][0:3]+ fn[1][4:]
if not rep_dict_key in rep_dict:
rep_dict[rep_dict_key] = {}
rep_dict[rep_dict_key][f] = parse_gtf('samples/'+ str(f))
......@@ -64,25 +65,30 @@ def group_replicates(sample_names):
def write_csv(ref_dict, group_dict, out_f):
"""write a tab delim csv file with genes and count values
ref_dict: dict, dict with gene name as keys and counts as values
ref_dict: dict, dict with replicate as keys and dict with gene names as
keys and counts as values as values
group_dict: dict, dict with replicate as keys and dict with gene names as
keys and counts as values as values
out_f: str, output file name in csv format
"""
with open(out_f, 'w') as f:
f.write("gene_name\treference\t")
f.write("gene_name\t")
# create set of gene names to make sure all rows are present and unique
gene_names = set(list(ref_dict.keys()))
gene_names = set()
for ref in ref_dict:
gene_names.update(list(ref_dict[ref].keys()))
f.write("ref_" + str(ref)+"\t")
for replicate in group_dict:
gene_names.update(list(group_dict[replicate].keys()))
f.write(str(replicate)+"\t")
f.write("\n")
for gene in gene_names:
f.write(str(gene)+"\t")
try:
f.write(str(ref_dict[gene])+'\t')
except: # if gene is not present in gtf, write 0 counts
f.write('0\t')
for ref in ref_dict:
try:
f.write(str(ref_dict[ref][gene])+'\t')
except: # if gene is not present in gtf, write 0 counts
f.write('0\t')
for replicate in group_dict:
try:
f.write(str(group_dict[replicate][gene])+'\t')
......@@ -94,17 +100,19 @@ def write_csv(ref_dict, group_dict, out_f):
def main(ref_name, sample_names):
"""Run functions and loop trough sample files
ref_name: str, filename of reference gtf file
ref_name: str, filename of txt file containing filenames of references
sample_names: str, filename of txt file containing filenames of samples
"""
ref = parse_gtf(ref_name)
ref_dict = group_replicates(ref_name)
ref_key = list(ref_dict.keys())
ref_dict = ref_dict[ref_key[0]]
print("parsed: reference")
dict_all = group_replicates(sample_names)
print("all gtf files parsed\n")
for group in dict_all: #loop trough
# write files to output folder counts/
out_file = 'counts/' + str(group) + 'counts.csv'
write_csv(ref, dict_all[group], out_file)
write_csv(ref_dict, dict_all[group], out_file)
print("csv written: "+ str(group))
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment