Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Overduin, Sam
TaxaSPAdes
Commits
8bc9c290
Commit
8bc9c290
authored
Feb 24, 2020
by
Overduin, Sam
Browse files
Upload scripts/append_taxatree_to_reads.py
parent
35b717e8
Changes
1
Hide whitespace changes
Inline
Side-by-side
scripts/append_taxatree_to_reads.py
0 → 100644
View file @
8bc9c290
#!/usr/bin/env python3
import
argparse
import
gzip
import
subprocess
import
datetime
def
get_reads_taxa_tree_map
(
kraken_file
,
read_pos
=
1
,
tax_pos
=
2
,
only_main_ranks
=
True
):
read_taxa_tree_dct
=
{}
rank_dict
=
{}
main_rank_set
=
set
([
'superkingdom'
,
'phylum'
,
'class'
,
'order'
,
'family'
,
'genus'
,
'species'
])
#import here so lower overhead and faster quit when called with bad options
from
ete3
import
NCBITaxa
ncbi
=
NCBITaxa
()
lineage_dict
=
{}
with
open
(
kraken_file
,
'r'
)
as
f
:
for
line
in
f
:
# n += 1
line_lst
=
line
.
strip
().
split
(
'
\t
'
)
taxa
=
line_lst
[
tax_pos
]
read
=
line_lst
[
read_pos
]
if
taxa
!=
'0'
:
try
:
taxa_tree
=
lineage_dict
[
taxa
]
except
KeyError
:
lineage_int_lst
=
ncbi
.
get_lineage
(
int
(
taxa
))
if
only_main_ranks
:
ranked_lineage
=
[]
for
t_id
in
lineage_int_lst
:
try
:
rank
=
rank_dict
[
t_id
]
except
KeyError
:
rank
=
ncbi
.
get_rank
([
t_id
])[
t_id
]
read_taxa_tree_dct
[
t_id
]
=
rank
if
rank
in
main_rank_set
:
ranked_lineage
.
append
(
t_id
)
lineage_int_lst
=
ranked_lineage
[:]
lineage_dict
[
taxa
]
=
'.'
.
join
(
map
(
str
,
lineage_int_lst
))
taxa_tree
=
lineage_dict
[
taxa
]
read_taxa_tree_dct
[
read
]
=
taxa_tree
return
read_taxa_tree_dct
def
taxa_tree_enrich_fastq_gz
(
fastq_file
,
output_file
,
read_taxa_tree_dct
):
if
fastq_file
.
endswith
(
'.gz'
):
fi
=
gzip
.
open
(
fastq_file
,
'rt'
)
else
:
fi
=
open
(
fastq_file
,
'r'
)
n
=
0
i
=
0
reads_with_taxa_tree
=
set
(
read_taxa_tree_dct
.
keys
())
compress_file
=
False
if
output_file
.
endswith
(
'.gz'
):
compress_file
=
True
output_file
=
output_file
[:
-
3
]
with
open
(
output_file
,
'w+'
)
as
fo
:
for
line
in
fi
:
n
+=
1
if
n
==
1
:
read
=
line
.
split
()[
0
][
1
:]
if
read
in
reads_with_taxa_tree
:
fo
.
write
(
line
.
strip
()
+
' TaxaTree:'
+
read_taxa_tree_dct
[
read
]
+
'
\n
'
)
else
:
fo
.
write
(
line
)
elif
n
==
4
:
fo
.
write
(
line
)
n
=
0
i
+=
1
if
i
%
100000
==
0
:
print
(
'Done '
+
str
(
i
)
+
' reads
\r
'
,
end
=
""
)
else
:
fo
.
write
(
line
)
fi
.
close
()
print
(
'
\n
'
,
datetime
.
datetime
.
now
(),
'Done writing'
,
n
,
'reads'
)
if
compress_file
:
print
(
datetime
.
datetime
.
now
(),
'Starting compression'
)
cmd
=
'pigz -p16 "'
+
output_file
+
'"'
subprocess
.
call
(
cmd
,
shell
=
True
)
print
(
datetime
.
datetime
.
now
(),
'Done compression'
)
def
main
():
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
'--input'
,
'-i'
,
help
=
"input fastq file"
,
type
=
str
,
required
=
True
)
parser
.
add_argument
(
'--kraken'
,
'-k'
,
help
=
"input kraken file"
,
type
=
str
,
required
=
True
)
parser
.
add_argument
(
'--read_pos'
,
'-rp'
,
help
=
"0-based index of read name in kraken file"
,
type
=
int
,
required
=
False
,
default
=
1
)
parser
.
add_argument
(
'--taxa_pos'
,
'-tp'
,
help
=
"0-based index of TaxID in kraken file"
,
type
=
int
,
required
=
False
,
default
=
2
)
parser
.
add_argument
(
'--main_ranks'
,
'-m'
,
help
=
"flag to use only major ranks in taxatree output"
,
required
=
False
,
dest
=
"main_ranks"
,
action
=
'store_true'
,
default
=
False
)
parser
.
add_argument
(
'--output'
,
'-o'
,
help
=
"output file"
,
type
=
str
,
required
=
True
)
args
=
parser
.
parse_args
()
print
(
datetime
.
datetime
.
now
(),
'started getting read taxonomy map'
)
read_taxa_tree_map
=
get_reads_taxa_tree_map
(
args
.
kraken
,
read_pos
=
args
.
read_pos
,
tax_pos
=
args
.
taxa_pos
,
only_main_ranks
=
args
.
main_ranks
)
print
(
datetime
.
datetime
.
now
(),
'done with read taxonomy map, adapting fastq headers'
)
taxa_tree_enrich_fastq_gz
(
args
.
input
,
args
.
output
,
read_taxa_tree_map
)
if
__name__
==
'__main__'
:
main
()
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment