Commit 8bc9c290 authored by Overduin, Sam's avatar Overduin, Sam
Browse files

Upload scripts/append_taxatree_to_reads.py

parent 35b717e8
#!/usr/bin/env python3
import argparse
import gzip
import subprocess
import datetime
def get_reads_taxa_tree_map(kraken_file, read_pos=1, tax_pos=2, only_main_ranks=True):
read_taxa_tree_dct = {}
rank_dict = {}
main_rank_set = set(['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'])
#import here so lower overhead and faster quit when called with bad options
from ete3 import NCBITaxa
ncbi = NCBITaxa()
lineage_dict = {}
with open(kraken_file, 'r') as f:
for line in f:
# n += 1
line_lst = line.strip().split('\t')
taxa = line_lst[tax_pos]
read = line_lst[read_pos]
if taxa != '0':
try:
taxa_tree = lineage_dict[taxa]
except KeyError:
lineage_int_lst = ncbi.get_lineage(int(taxa))
if only_main_ranks:
ranked_lineage = []
for t_id in lineage_int_lst:
try:
rank = rank_dict[t_id]
except KeyError:
rank = ncbi.get_rank([t_id])[t_id]
read_taxa_tree_dct[t_id] = rank
if rank in main_rank_set:
ranked_lineage.append(t_id)
lineage_int_lst = ranked_lineage[:]
lineage_dict[taxa] = '.'.join(map(str, lineage_int_lst))
taxa_tree = lineage_dict[taxa]
read_taxa_tree_dct[read] = taxa_tree
return read_taxa_tree_dct
def taxa_tree_enrich_fastq_gz(fastq_file, output_file, read_taxa_tree_dct):
if fastq_file.endswith('.gz'):
fi = gzip.open(fastq_file, 'rt')
else:
fi = open(fastq_file, 'r')
n = 0
i = 0
reads_with_taxa_tree = set(read_taxa_tree_dct.keys())
compress_file = False
if output_file.endswith('.gz'):
compress_file = True
output_file = output_file[:-3]
with open(output_file, 'w+') as fo:
for line in fi:
n += 1
if n == 1:
read = line.split()[0][1:]
if read in reads_with_taxa_tree:
fo.write(line.strip()+' TaxaTree:'+read_taxa_tree_dct[read]+'\n')
else:
fo.write(line)
elif n == 4:
fo.write(line)
n = 0
i += 1
if i % 100000 == 0:
print('Done '+str(i)+' reads\r', end="")
else:
fo.write(line)
fi.close()
print('\n', datetime.datetime.now(), 'Done writing', n, 'reads')
if compress_file:
print(datetime.datetime.now(), 'Starting compression')
cmd = 'pigz -p16 "'+output_file+'"'
subprocess.call(cmd, shell=True)
print(datetime.datetime.now(), 'Done compression')
def main():
parser = argparse.ArgumentParser()
parser.add_argument('--input', '-i', help="input fastq file",
type=str, required=True)
parser.add_argument('--kraken', '-k', help="input kraken file",
type=str, required=True)
parser.add_argument('--read_pos', '-rp', help="0-based index of read name in kraken file",
type=int, required=False, default=1)
parser.add_argument('--taxa_pos', '-tp', help="0-based index of TaxID in kraken file",
type=int, required=False, default=2)
parser.add_argument('--main_ranks', '-m', help="flag to use only major ranks in taxatree output",
required=False, dest="main_ranks", action='store_true', default=False)
parser.add_argument('--output', '-o', help="output file",
type=str, required=True)
args = parser.parse_args()
print(datetime.datetime.now(), 'started getting read taxonomy map')
read_taxa_tree_map = get_reads_taxa_tree_map(args.kraken, read_pos=args.read_pos, tax_pos=args.taxa_pos, only_main_ranks=args.main_ranks)
print(datetime.datetime.now(), 'done with read taxonomy map, adapting fastq headers')
taxa_tree_enrich_fastq_gz(args.input, args.output, read_taxa_tree_map)
if __name__ == '__main__':
main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment