Commit 74d6406f authored by Overduin, Sam's avatar Overduin, Sam Committed by Overduin, Sam
Browse files

Impl. PathVertex::GetTaxaTree, enabled LCA testing & moved TaxaBreak forwards



scaffold_vertex.hpp & cpp:
- Made GetTaxaTree() member of PathVertex and EdgeIdVertex (both are
  ScaffoldVertexes in the contracted DeBruijnGraph)

launcher.cpp:
- Added LCA parameter testing to start of repeat resolving step

read_cloud_connection_conditions.cpp & hpp:
- Made GetTaxaTree() part of the ScaffoldVertex itself (Edge- & Path-type)

scaffold_graph_construction_pipeline.cpp:
- Moved TaxaBreakConstructor to top of pipelines for efficiency

barcode_index_construction.cpp:
- Turned LCA settings tester back on

barcode_index_construction.cpp & hpp:
- Re-implemented taxonomy assignment with fuzzy_lca
- Some cleaner code in taxonomy assignment
- Added parameter testing module
- Bugfixes, header file updated

read_cloud_connection_conditions.cpp & hpp:
- Code cleanup (remove duplicated logic)

construction_callers & scaffold_graph_construction_pipeline:
- Nothing (just code that was commented out again)
Signed-off-by: Overduin, Sam's avatarOverduin, Sam <sam.overduin@wur.nl>
parent cb555c2d
......@@ -80,6 +80,26 @@ std::string EdgeIdVertex::GetSequence(const debruijn_graph::Graph &g) const {
return g.EdgeNucls(edge_).str();
}
std::string EdgeIdVertex::GetTaxaTree(const debruijn_graph::Graph &g, std::shared_ptr<barcode_index::FrameBarcodeIndexInfoExtractor> barcode_extractor_) const {
std::string taxatree;
EdgeId edge_int = this->GetId();
if (barcode_extractor_->EdgeIsValid(edge_int)) {
taxatree = barcode_extractor_->GetTaxaTreeFromEdge(edge_int);
}
else if (barcode_extractor_->EdgeIsValid(this->GetConjugateFromGraph(g)->GetId())) {
taxatree = barcode_extractor_->GetTaxaTreeFromEdge(this->GetConjugateFromGraph(g)->GetId());
DEBUG_SAM("No taxatree for vector: " << edge_int << ", conjugate was fine: " <<
this->GetConjugateFromGraph(g)->GetId() << " TaxaTree: " << taxatree);
}
else {
INFO("Using taxatree of 0. No taxatree entry found for vertex or conjugate! ERROR!!! Vertex:" << edge_int);
taxatree = "0";
DEBUG("Vertex_1_id: " << edge_int << " Taxonomy: " << taxatree << " Length: "
<< barcode_extractor_->GetEdgeLength(edge_int));
}
return taxatree;
}
size_t EdgeIdVertex::GetSize() const {
return 1;
}
......@@ -169,6 +189,68 @@ size_t PathVertex::GetSize() const {
return path_->Size();
}
// Consider changing below to more of an LCA type deal with drilldowns etc.
std::string PathVertex::TaxaTreeFromCountMap(std::unordered_map<std::string, size_t> &taxatree_counts) const {
double frac_match = 0.9;
std::string taxatree;
size_t total_counts = 0;
size_t common_taxa_count = 0;
for (const auto &entry: taxatree_counts) {
total_counts += entry.second;
if (entry.second > common_taxa_count) {
common_taxa_count = entry.second;
taxatree = entry.first;
}
}
size_t max_mismatches = frac_match * total_counts;
size_t mismatches = 0;
for (const auto &entry: taxatree_counts) {
if (entry.first == "0")
continue;
if (entry.first != taxatree) {
DEBUG("Path TaxaTree mismatch! main taxatree: " << taxatree << " other_taxatree: " << entry.first);
mismatches += entry.second;
}
if (mismatches > max_mismatches) {
DEBUG_SAM("Path: " << this->GetId() << " contains fully mismatching taxatrees:");
for (const auto &entry2: taxatree_counts) {
DEBUG_SAM("Taxatree: " << entry2.first << " Count: " << entry2.second);
}
return "0";
}
}
return taxatree;
}
std::string PathVertex::GetTaxaTree(const debruijn_graph::Graph &g, std::shared_ptr<barcode_index::FrameBarcodeIndexInfoExtractor> barcode_extractor_) const {
TRACE("Extracting taxatree from path");
std::unordered_map<std::string, size_t> taxatree_counts;
for (const auto &edge: *path_) {
std::string taxatree;
EdgeId edge_int = edge.int_id();
if (barcode_extractor_->EdgeIsValid(edge_int)) {
taxatree = barcode_extractor_->GetTaxaTreeFromEdge(edge_int);
}
else if (barcode_extractor_->EdgeIsValid(g.conjugate(edge_int))) {
taxatree = barcode_extractor_->GetTaxaTreeFromEdge(g.conjugate(edge_int));
DEBUG_SAM("No taxatree for vector: " << edge_int << ", conjugate was fine: " <<
g.conjugate(edge_int).int_id() << " TaxaTree: " << taxatree);
}
else {
INFO("Using taxatree of 0. No taxatree entry found for vertex or conjugate! ERROR!!! Vertex:" << edge_int);
taxatree = "0";
DEBUG("Vertex_1_id: " << edge_int << " Taxonomy: " << taxatree << " Length: "
<< barcode_extractor_->GetEdgeLength(edge_int));
}
++taxatree_counts[taxatree];
}
std::string taxatree_result = TaxaTreeFromCountMap(taxatree_counts);
return taxatree_result;
}
ScaffoldVertex::ScaffoldVertex(std::shared_ptr<InnerScaffoldVertex> vertex_ptr_) : vertex_ptr_(vertex_ptr_) {}
size_t ScaffoldVertex::int_id() const {
return vertex_ptr_->GetId();
......@@ -244,6 +326,9 @@ std::unordered_set<EdgeId> ScaffoldVertex::GetAllEdges() const {
std::string ScaffoldVertex::GetSequence(const debruijn_graph::Graph &g) const {
return vertex_ptr_->GetSequence(g);
}
std::string ScaffoldVertex::GetTaxaTree(const debruijn_graph::Graph &g, std::shared_ptr<barcode_index::FrameBarcodeIndexInfoExtractor> barcode_extractor) const {
return vertex_ptr_->GetTaxaTree(g, barcode_extractor);
}
size_t ScaffoldVertex::GetSize() const {
return vertex_ptr_->GetSize();
}
......
......@@ -8,6 +8,7 @@
#include "assembly_graph/paths/bidirectional_path.hpp"
#include "assembly_graph/core/graph.hpp"
#include "common/barcode_index/barcode_info_extractor.hpp"
#include "boost/optional/optional.hpp"
......@@ -31,6 +32,7 @@ class InnerScaffoldVertex {
virtual boost::optional<EdgeId> GetLastEdgeWithPredicate(const func::TypedPredicate<EdgeId> &pred) const = 0;
virtual boost::optional<EdgeId> GetFirstEdgeWithPredicate(const func::TypedPredicate<EdgeId> &pred) const = 0;
virtual std::string GetSequence(const debruijn_graph::Graph &g) const = 0;
virtual std::string GetTaxaTree(const debruijn_graph::Graph &g, std::shared_ptr<barcode_index::FrameBarcodeIndexInfoExtractor> barcode_extractor) const = 0;
virtual size_t GetSize() const = 0;
virtual debruijn_graph::EdgeId GetLastEdge() const = 0;
......@@ -59,6 +61,7 @@ class EdgeIdVertex : public InnerScaffoldVertex {
boost::optional<EdgeId> GetLastEdgeWithPredicate(const func::TypedPredicate<EdgeId> &pred) const override;
boost::optional<EdgeId> GetFirstEdgeWithPredicate(const func::TypedPredicate<EdgeId> &pred) const override;
std::string GetSequence(const debruijn_graph::Graph &g) const override;
std::string GetTaxaTree(const debruijn_graph::Graph &g, std::shared_ptr<barcode_index::FrameBarcodeIndexInfoExtractor> barcode_extractor) const override;
size_t GetSize() const override;
EdgeId GetLastEdge() const override;
......@@ -75,6 +78,7 @@ class EdgeIdVertex : public InnerScaffoldVertex {
class PathVertex : public InnerScaffoldVertex {
private:
path_extend::BidirectionalPath *path_;
std::string TaxaTreeFromCountMap(std::unordered_map<std::string, size_t> &taxatree_counts) const;
public:
explicit PathVertex(path_extend::BidirectionalPath *path_);
......@@ -89,6 +93,7 @@ class PathVertex : public InnerScaffoldVertex {
boost::optional<EdgeId> GetLastEdgeWithPredicate(const func::TypedPredicate<EdgeId> &pred) const override;
boost::optional<EdgeId> GetFirstEdgeWithPredicate(const func::TypedPredicate<EdgeId> &pred) const override;
std::string GetSequence(const debruijn_graph::Graph &g) const override;
std::string GetTaxaTree(const debruijn_graph::Graph &g, std::shared_ptr<barcode_index::FrameBarcodeIndexInfoExtractor> barcode_extractor) const;
size_t GetSize() const override;
EdgeId GetLastEdge() const override;
......@@ -128,6 +133,7 @@ class ScaffoldVertex {
boost::optional<EdgeId> GetLastEdgeWithPredicate(const func::TypedPredicate<EdgeId> &pred) const;
boost::optional<EdgeId> GetFirstEdgeWithPredicate(const func::TypedPredicate<EdgeId> &pred) const;
std::string GetSequence(const debruijn_graph::Graph &g) const;
std::string GetTaxaTree(const debruijn_graph::Graph &g, std::shared_ptr<barcode_index::FrameBarcodeIndexInfoExtractor> barcode_extractor) const;
size_t GetSize() const;
EdgeId GetLastEdge() const;
......
......@@ -20,6 +20,7 @@
#include "modules/path_extend/read_cloud_path_extend/fragment_statistics/secondary_stats_estimators.hpp"
#include "modules/path_extend/read_cloud_path_extend/scaffold_graph_construction/scaffold_graph_storage_constructor.hpp"
#include "modules/path_extend/read_cloud_path_extend/statistics/path_scaffolder_analyzer.hpp"
#include "projects/spades/barcode_index_construction.hpp"
namespace path_extend {
......@@ -587,6 +588,11 @@ void PathExtendLauncher::ScaffoldPaths(PathContainer &paths) const {
}
void PathExtendLauncher::Launch() {
INFO("Making LCA test parameter file");
FrameBarcodeIndexInfoExtractor extractor(gp_.barcode_mapper, gp_.g);
test_various_lca_settings(gp_.barcode_mapper, extractor, "taxonomy_edges_tests.fasta");
write_contigs_to_fasta(gp_.barcode_mapper, extractor, "taxonomy_of_edges.fasta");
INFO("ExSPAnder repeat resolving tool started");
fs::make_dir(params_.output_dir);
fs::make_dir(params_.etc_dir);
......
......@@ -116,32 +116,13 @@ TaxaBreakPredicate::TaxaBreakPredicate(
: g_(g_),
barcode_extractor_(barcode_extractor) {}
std::string TaxaBreakPredicate::GetTaxaTree(const scaffold_graph::ScaffoldVertex &scaff_vertex) const {
std::string taxatree;
EdgeId edge_int = scaff_vertex.int_id();
if (barcode_extractor_->EdgeIsValid(edge_int)) {
taxatree = barcode_extractor_->GetTaxaTreeFromEdge(edge_int);
}
else if (barcode_extractor_->EdgeIsValid(scaff_vertex.GetConjugateFromGraph(g_).int_id())) {
DEBUG("No taxatree for vector: " << edge_int << ", conjugate was fine.");
taxatree = barcode_extractor_->GetTaxaTreeFromEdge(scaff_vertex.GetConjugateFromGraph(g_).int_id());
}
else {
INFO("Using taxatree of 0. No taxatree entry found for vertex or conjugate! ERROR!!! Vertex:" << edge_int);
taxatree = "0";
DEBUG("Vertex_1_id: " << edge_int << " Taxonomy: " << taxatree << " Length: "
<< barcode_extractor_->GetEdgeLength(edge_int));
}
return taxatree;
}
bool TaxaBreakPredicate::Check(const ScaffoldEdge &scaffold_edge) const {
static const std::string null_taxa = "0";
auto first = scaffold_edge.getStart();
auto second = scaffold_edge.getEnd();
std::string taxatree_1 = GetTaxaTree(first);
std::string taxatree_2 = GetTaxaTree(second);
std::string taxatree_1 = first.GetTaxaTree(g_, barcode_extractor_);
std::string taxatree_2 = second.GetTaxaTree(g_, barcode_extractor_);
if (taxatree_1 == null_taxa || taxatree_2 == null_taxa ||
taxatree_1.find(taxatree_2) != std::string::npos || taxatree_2.find(taxatree_1) != std::string::npos) {
......
......@@ -114,7 +114,6 @@ public:
TaxaBreakPredicate(const Graph &g_,
std::shared_ptr<barcode_index::FrameBarcodeIndexInfoExtractor> barcode_extractor);
std::string GetTaxaTree(const scaffold_graph::ScaffoldVertex &scaff_vertex) const;
bool Check(const ScaffoldEdge &scaffold_edge) const override;
private:
......
......@@ -230,6 +230,7 @@ std::vector<std::shared_ptr<IterativeScaffoldGraphConstructorCaller>> FullScaffo
ConstructSimpleEdgeIndex(scaffold_vertices, barcode_extractor_, params, max_threads_);
std::vector<std::shared_ptr<IterativeScaffoldGraphConstructorCaller>> iterative_constructor_callers;
iterative_constructor_callers.push_back(std::make_shared<TaxaBreakConstructorCaller>(gp_.g, barcode_extractor_, max_threads_));
iterative_constructor_callers.push_back(std::make_shared<BarcodeScoreConstructorCaller>(gp_.g, barcode_extractor_,
scaffold_index_extractor,
max_threads_));
......@@ -242,8 +243,6 @@ std::vector<std::shared_ptr<IterativeScaffoldGraphConstructorCaller>> FullScaffo
unique_storage_, search_parameter_pack_,
read_cloud_configs_.scaff_con, max_threads_,
scaffolding_mode));
iterative_constructor_callers.push_back(
std::make_shared<TaxaBreakConstructorCaller>(gp_.g, barcode_extractor_, max_threads_));
const size_t min_pipeline_length = read_cloud_configs_.long_edge_length_lower_bound;
bool launch_full_pipeline = min_length_ > min_pipeline_length;
......@@ -309,24 +308,17 @@ std::vector<std::shared_ptr<IterativeScaffoldGraphConstructorCaller>> MergingSca
const double EDGE_LENGTH_FRACTION = 0.5;
auto fraction_tail_threshold_getter =
std::make_shared<barcode_index::FractionTailThresholdGetter>(g_, EDGE_LENGTH_FRACTION);
//DEBUG_SAM("1: barcode_extractor_ ref count: " << barcode_extractor_.use_count());
auto split_scaffold_vertex_index = helper.ConstructScaffoldVertexIndex(g_, *barcode_extractor_,
fraction_tail_threshold_getter,
count_threshold, length_threshold,
max_threads_, scaffold_vertices);
//DEBUG_SAM("2: barcode_extractor_ ref count: " << barcode_extractor_.use_count());
auto split_scaffold_index_extractor =
std::make_shared<barcode_index::SimpleScaffoldVertexIndexInfoExtractor>(split_scaffold_vertex_index);
//DEBUG_SAM("3: barcode_extractor_ ref count: " << barcode_extractor_.use_count());
iterative_constructor_callers.push_back(std::make_shared<TaxaBreakConstructorCaller>(g_, barcode_extractor_, max_threads_));
iterative_constructor_callers.push_back(std::make_shared<EdgeSplitConstructorCaller>(g_,
split_scaffold_index_extractor,
max_threads_));
//DEBUG_SAM("4: barcode_extractor_ ref count: " << barcode_extractor_.use_count());
iterative_constructor_callers.push_back(std::make_shared<TransitiveConstructorCaller>(g_, max_threads_));
//DEBUG_SAM("5: barcode_extractor_ ref count: " << barcode_extractor_.use_count());
//iterative_constructor_callers.push_back(
// std::make_shared<TaxaBreakConstructorCaller>(g_, barcode_extractor_, max_threads_));
//DEBUG_SAM("6: barcode_extractor_ ref count: " << barcode_extractor_.use_count());
return iterative_constructor_callers;
}
} //path_extend
......
......@@ -370,7 +370,7 @@ namespace debruijn_graph {
INFO("Starting taxonomic assignment to all edges.")
assign_taxonomy_to_edges(graph_pack.barcode_mapper, extractor);
INFO("Taxonomy assigned to all edges.");
//test_various_lca_settings(graph_pack.barcode_mapper, extractor, "taxonomy_edges_tests.fasta");
test_various_lca_settings(graph_pack.barcode_mapper, extractor, "taxonomy_edges_tests.fasta");
write_contigs_to_fasta(graph_pack.barcode_mapper, extractor, "taxonomy_of_edges.fasta");
INFO("Wrote taxonomy_of_edges.fasta in Kmer dir.")
size_t length_threshold = cfg::get().pe_params.read_cloud.long_edge_length_lower_bound;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment