Commit ae290f75 authored by Overduin, Sam's avatar Overduin, Sam Committed by Overduin, Sam
Browse files

Impl. taxaSPAdes as option with --taxonomy as argument & some clean-up



config.info:
- Added taxonomy false.

barcode_index_builder.hpp:
- only extract TaxaTree if cfg::get().taxonomy

scaffold_graph_construction_pipeline.cpp:
- only add TaxaBreak constructor if cfg::get().taxonomy

config_struct.cpp & hpp:
- added taxonomy to cfg.

barcode_index_construction.cpp:
- Only decide LCA if cfg::get().taxonomy

spades_iteration_stage.py:
- Add taxonomy option to config.info of final iteration

options_parser.py:
- Add --taxonomy argument. Only usable with --meta and --gemcode options.

Some various other clean-ups in files:
- aa_todo_notes
- path_polisher.hpp
- path_scaffolder.cpp
- scaffold_vertex.cpp
Signed-off-by: Overduin, Sam's avatarOverduin, Sam <sam.overduin@wur.nl>
parent a3d9211c
##append_taxa_tree_to_reads.py
from kraken to read header -> done!!
To test:
/mnt/LTR_userdata/overd008/tools/spades_install/bin/spades-core /home/overd008/mnt/lustre/output/spades/test/K55/configs/config.info /home/overd008/mnt/lustre/output/spades/test/K55/configs/mda_mode.info /home/overd008/mnt/lustre/output/spades/test/K55/configs/meta_mode.info
## Parse taxonomy from reads --> done!
Barcode mapping parser
common\barcode_index\barcode_index_builder.hpp @105 FillMapFrom10XReads
@118 add string tax_string = GetTaxaTreeFromRead(read)
# Encode as vector of pointers to ints?
@150 copy similar parser
## Settle taxonomy of edges --> done!
New function (implement where? Just before transition taxa filter? Seperately?)
---------here-----------
## Add transition taxonomy filter here:
M:\aa_Thesis_code\SPAdes-3.13.0-dev\src\common\modules\path_extend\read_cloud_path_extend\scaffold_graph_construction\
scaffold_graph_construction_pipeline.cpp@209
......@@ -44,6 +44,9 @@ scaffold_correction_mode false
; enabled (1) or disabled (0) repeat resolution (former "paired_mode")
rr_enable true
; use of taxonomy for linked read & metagenome assembly enabled or disabled
taxonomy false
;preserve raw paired index after distance estimation
preserve_raw_paired_index false
......
......@@ -89,7 +89,7 @@ std::string EdgeIdVertex::GetTaxaTree(const debruijn_graph::Graph &g, std::share
}
else if (barcode_extractor_->EdgeIsValid(this->GetConjugateFromGraph(g)->GetId())) {
taxatree = barcode_extractor_->GetTaxaTreeFromEdge(this->GetConjugateFromGraph(g)->GetId());
DEBUG_SAM("No taxatree for vector: " << edge_int << ", conjugate was fine: " <<
INFO("No taxatree for edge: " << edge_int << ", conjugate was fine: " <<
this->GetConjugateFromGraph(g)->GetId() << " TaxaTree: " << taxatree);
}
else {
......@@ -190,41 +190,6 @@ size_t PathVertex::GetSize() const {
return path_->Size();
}
// Consider changing below to more of an LCA type deal with drilldowns etc.
// TODO: delete this function, now done LCA with more stringent parameters than reads -> contig LCA
std::string PathVertex::TaxaTreeFromCountMap(std::unordered_map<std::string, size_t> &taxatree_counts) const {
double frac_match = 0.9;
std::string taxatree;
size_t total_counts = 0;
size_t common_taxa_count = 0;
for (const auto &entry: taxatree_counts) {
total_counts += entry.second;
if (entry.second > common_taxa_count) {
common_taxa_count = entry.second;
taxatree = entry.first;
}
}
size_t max_mismatches = frac_match * total_counts;
size_t mismatches = 0;
for (const auto &entry: taxatree_counts) {
if (entry.first == "0")
continue;
if (entry.first != taxatree) {
DEBUG("Path TaxaTree mismatch! main taxatree: " << taxatree << " other_taxatree: " << entry.first);
mismatches += entry.second;
}
if (mismatches > max_mismatches) {
DEBUG_SAM("Path: " << this->GetId() << " contains fully mismatching taxatrees:");
for (const auto &entry2: taxatree_counts) {
DEBUG_SAM("Taxatree: " << entry2.first << " Count: " << entry2.second);
}
return "0";
}
}
return taxatree;
}
std::string PathVertex::GetTaxaTree(const debruijn_graph::Graph &g,
std::shared_ptr<barcode_index::FrameBarcodeIndexInfoExtractor> barcode_extractor_) const {
......@@ -238,7 +203,7 @@ std::string PathVertex::GetTaxaTree(const debruijn_graph::Graph &g,
}
else if (barcode_extractor_->EdgeIsValid(g.conjugate(edge_int))) {
taxatree = barcode_extractor_->GetTaxaTreeFromEdge(g.conjugate(edge_int));
DEBUG_SAM("No taxatree for vector: " << edge_int << ", conjugate was fine: " <<
INFO("No taxatree for edge: " << edge_int << ", conjugate was fine: " <<
g.conjugate(edge_int).int_id() << " TaxaTree: " << taxatree);
}
else {
......
......@@ -7,10 +7,14 @@
#pragma once
#include "barcode_index.hpp"
#include "modules/alignment/edge_index.hpp"
#include "modules/alignment/kmer_mapper.hpp"
#include "modules/alignment/sequence_mapper.hpp"
// Added to gain access to cfg::get(). Probably a little bit overkill
#include "common/pipeline/stage.hpp"
namespace barcode_index {
template<class Graph, class BarcodeEntryT>
......@@ -114,11 +118,13 @@ namespace barcode_index {
size_t counter = 0;
const std::vector<string> barcode_prefixes = {"BC:Z:", "BX:Z:"};
const std::vector<string> taxatree_prefixes = {"TaxaTree:"};
bool read_taxonomy_enabled = cfg::get().taxonomy;
for (auto stream: reads) {
while (!stream->eof()) {
*stream >> read;
string barcode_string = GetTenXBarcodeFromRead(read, barcode_prefixes);
string taxatree_string = GetTaxaTreeFromRead(read, taxatree_prefixes);
if (barcode_string != "") {
barcode_codes_.AddBarcode(barcode_string);
......@@ -127,19 +133,21 @@ namespace barcode_index {
const auto &path = mapper->MapRead(read);
InsertMappingPath(barcode, path);
}
if (read_taxonomy_enabled) {
string taxatree_string = GetTaxaTreeFromRead(read, taxatree_prefixes);
if (taxatree_string == "") {
// no TaxaTree added means no known taxa (0), useful during LCA determination.
taxatree_string = '0';
}
if (taxatree_string == "") {
// no TaxaTree added means no known taxa (0), useful during LCA determination.
taxatree_string = '0';
index_.taxatree_codes_.AddTaxaTree(taxatree_string);
uint64_t taxid_int = index_.taxatree_codes_.GetCode(taxatree_string);
TaxIdT taxid(taxid_int);
//INFO("taxatree: "+taxatree_string)
const auto &path = mapper->MapRead(read);
InsertTaxIdMappingPath(taxid, path);
}
index_.taxatree_codes_.AddTaxaTree(taxatree_string);
uint64_t taxid_int = index_.taxatree_codes_.GetCode(taxatree_string);
TaxIdT taxid(taxid_int);
//INFO("taxatree: "+taxatree_string)
const auto &path = mapper->MapRead(read);
InsertTaxIdMappingPath(taxid, path);
counter++;
VERBOSE_POWER_T2(counter, 100, "Processed " << counter << " reads.");
}
......
......@@ -30,7 +30,7 @@ void SimplePathScaffolder::CondenseSimplePaths(const std::vector<ScaffoldEdge> &
merge_connections.at(end_conjugate) != start_conjugate) {
WARN("Conjugate connection does not correspond to direct connection")
if (merge_connections.find(end_conjugate) == merge_connections.end()) {
DEBUG_SAM("Adding conjugates to merge connection");
INFO("Adding conjugates to merge connection");
merge_connections[end_conjugate] = start_conjugate;
} else {
merge_connections.at(end_conjugate) = start_conjugate;
......
......@@ -243,7 +243,9 @@ std::vector<std::shared_ptr<IterativeScaffoldGraphConstructorCaller>> FullScaffo
unique_storage_, search_parameter_pack_,
read_cloud_configs_.scaff_con, max_threads_,
scaffolding_mode));
iterative_constructor_callers.push_back(std::make_shared<TaxaBreakConstructorCaller>(gp_.g, barcode_extractor_, max_threads_));
if (cfg::get().taxonomy) {
iterative_constructor_callers.push_back(std::make_shared<TaxaBreakConstructorCaller>(gp_.g, barcode_extractor_, max_threads_));
}
const size_t min_pipeline_length = read_cloud_configs_.long_edge_length_lower_bound;
bool launch_full_pipeline = min_length_ > min_pipeline_length;
......@@ -319,7 +321,9 @@ std::vector<std::shared_ptr<IterativeScaffoldGraphConstructorCaller>> MergingSca
split_scaffold_index_extractor,
max_threads_));
iterative_constructor_callers.push_back(std::make_shared<TransitiveConstructorCaller>(g_, max_threads_));
iterative_constructor_callers.push_back(std::make_shared<TaxaBreakConstructorCaller>(g_, barcode_extractor_, max_threads_));
if (cfg::get().taxonomy) {
iterative_constructor_callers.push_back(std::make_shared<TaxaBreakConstructorCaller>(g_, barcode_extractor_, max_threads_));
}
return iterative_constructor_callers;
}
} //path_extend
......
......@@ -18,7 +18,6 @@
#include "assembly_graph/dijkstra/dijkstra_helper.hpp"
namespace path_extend {
static const size_t MAGIC_LOOP_CONSTANT = 1000;
class PathGapCloser {
protected:
......@@ -151,6 +150,7 @@ class SimpleExtenderFactory : public GapExtenderFactory {
const GraphCoverageMap &cover_map_;
UsedUniqueStorage& unique_;
const std::shared_ptr<GapExtensionChooserFactory> chooser_factory_;
static const size_t MAGIC_LOOP_CONSTANT = 1000;
public:
SimpleExtenderFactory(const conj_graph_pack &gp,
const GraphCoverageMap &cover_map,
......@@ -168,8 +168,7 @@ public:
DEBUG("Created chooser");
auto extender = std::make_shared<SimpleExtender>(gp_, cover_map_, unique_,
chooser,
path_extend::MAGIC_LOOP_CONSTANT,
//changed location of MAGIC_LOOP_CONSTANT because CLion compilation threw errors
MAGIC_LOOP_CONSTANT,
false,
false);
DEBUG("Returning extender");
......
......@@ -634,6 +634,8 @@ void load_launch_info(debruijn_config &cfg, boost::property_tree::ptree const &p
load(cfg.additional_contigs, pt, "additional_contigs");
INFO("Additional contigs is " << cfg.additional_contigs);
load(cfg.taxonomy, pt, "taxonomy");
load(cfg.rr_enable, pt, "rr_enable");
load(cfg.buffer_size, pt, "buffer_size");
......
......@@ -437,6 +437,8 @@ struct debruijn_config {
std::string load_from;
std::string entry_point;
bool taxonomy;
bool rr_enable;
bool two_step_rr;
bool use_intermediate_contigs;
......
......@@ -136,7 +136,7 @@ inline const char* __scope_source_name() {
# define TRACE(message) /* No trace */
#endif
#define INFO(message) LOG_MSG(logging::L_INFO , message)
#define DEBUG_SAM(message) LOG_MSG(logging::L_INFO, message)
#define START_BANNER(description) \
do { \
INFO("Starting " description ", built from " \
......
......@@ -409,12 +409,14 @@ namespace debruijn_graph {
mapper_builder.FillMap(reads, graph_pack.index, graph_pack.kmer_mapper);
INFO("Barcode index construction finished.");
FrameBarcodeIndexInfoExtractor extractor(graph_pack.barcode_mapper, graph_pack.g);
INFO("Starting taxonomic assignment to all edges.")
assign_taxonomy_to_edges(graph_pack.barcode_mapper, extractor);
INFO("Taxonomy assigned to all edges.");
//test_various_lca_settings(graph_pack.barcode_mapper, extractor, "taxonomy_edges_tests.fasta");
write_contigs_to_fasta(graph_pack.barcode_mapper, extractor, "taxonomy_of_edges.fasta");
INFO("Wrote taxonomy_of_edges.fasta in Kmer dir.")
if (cfg::get().taxonomy) {
INFO("Starting taxonomic assignment to all edges.")
assign_taxonomy_to_edges(graph_pack.barcode_mapper, extractor);
INFO("Taxonomy assigned to all edges.");
//test_various_lca_settings(graph_pack.barcode_mapper, extractor, "taxonomy_edges_tests.fasta");
write_contigs_to_fasta(graph_pack.barcode_mapper, extractor, "taxonomy_of_edges.fasta");
INFO("Wrote taxonomy_of_edges.fasta in Kmer dir.")
}
size_t length_threshold = cfg::get().pe_params.read_cloud.long_edge_length_lower_bound;
INFO("Average barcode coverage: " + std::to_string(extractor.AverageBarcodeCoverage(length_threshold)));
ClusterStatisticsExtractorHelper cluster_extractor_helper(graph_pack.g, graph_pack.barcode_mapper,
......
......@@ -473,6 +473,13 @@ def add_input_data_args(pgroup_input_data):
help=argparse.SUPPRESS,
action="store_const")
pgroup_input_data.add_argument("--taxonomy",
dest="taxonomy",
default=False,
help="Utilises added reads taxonomy during read cloud scaffolding."
if not help_hidden else argparse.SUPPRESS,
action="store_true")
pgroup_input_data.add_argument("--ss-rf",
dest="strand_specificity",
const="rf",
......@@ -976,6 +983,11 @@ def postprocessing(args, cfg, dataset_data, log, spades_home, load_processed_dat
"(optionally accompanied by a single library of "
"TSLR-contigs, or PacBio reads, or Nanopore reads) in metaSPAdes mode!")
if options_storage.args.taxonomy and len(support.get_lib_ids_by_type(dataset_data, ["clouds10x"])) < 1:
support.error("Option --taxonomy currently requires linked reads to work (--gemcode).")
if options_storage.args.taxonomy and not options_storage.args.meta:
support.error("Option --taxonomy requires metagenomics mode (--meta).")
if existing_dataset_data is None:
with open(args.dataset_yaml_filename, 'w') as f:
pyyaml.dump(dataset_data, f,
......
......@@ -56,6 +56,8 @@ def prepare_config_spades(filename, cfg, log, additional_contigs_fname, K, stage
subst_dict["load_from"] = saves_dir
if "checkpoints" in cfg.__dict__:
subst_dict["checkpoints"] = cfg.checkpoints
if last_one and options_storage.args.taxonomy:
subst_dict["taxonomy"] = bool_to_str(options_storage.args.taxonomy)
subst_dict["developer_mode"] = bool_to_str(cfg.developer_mode)
subst_dict["gap_closer_enable"] = bool_to_str(last_one or K >= options_storage.GAP_CLOSER_ENABLE_MIN_K)
subst_dict["rr_enable"] = bool_to_str(last_one and cfg.rr_enable)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment