Commit cb555c2d authored by Overduin, Sam's avatar Overduin, Sam Committed by Overduin, Sam
Browse files

Impl. final contig taxa assignment (fuzzy_lca), parameter testing module & fixes



barcode_index.hpp:
- Improved robustness of some functions (error handling)
- Added taxid_distribution serialisation

barcode_index_construction.cpp & hpp:
- Re-implemented taxonomy assignment with fuzzy_lca
- Some cleaner code in taxonomy assignment
- Added parameter testing module
- Bugfixes, header file updated

read_cloud_connection_conditions.cpp & hpp:
- Code cleanup (remove duplicated logic)

construction_callers & scaffold_graph_construction_pipeline:
- Nothing (just code that was commented out again)
Signed-off-by: Overduin, Sam's avatarOverduin, Sam <sam.overduin@wur.nl>
parent c8303759
......@@ -104,7 +104,13 @@ namespace barcode_index {
} else {
taxid_str = taxatree;
}
return std::stoi(taxid_str);
try{
return std::stoi(taxid_str);
}
catch (const std::exception & ex) {
INFO("ERROR: stoi threw exception: " << ex.what() << " With input: " << taxatree);
return 0;
}
}
std::vector<TaxId> ToTaxaTreeVector(const std::string& taxa_tree_string, const char sep='.') const {
......@@ -127,7 +133,7 @@ namespace barcode_index {
}
string GetTaxaTree (const TaxId& taxid) const {
VERIFY_MSG(codes_rev_.find(taxid) != codes_rev_.end(), "In GetTaxaTree: taxid not in codes_rev");
VERIFY_MSG(codes_rev_.find(taxid) != codes_rev_.end(), "In GetTaxaTree: taxid not in codes_rev: " << taxid);
return codes_rev_.at(taxid);
}
......@@ -691,11 +697,13 @@ namespace barcode_index {
virtual void BinRead(std::istream &str) {
taxonomy_ = io::binary::BinRead<uint64_t>(str);
barcode_distribution_ = io::binary::BinRead<barcode_distribution_t>(str);
taxid_distribution_ = io::binary::BinRead<taxid_distribution_t>(str);
}
virtual void BinWrite(std::ostream &str) const {
io::binary::BinWrite(str, taxonomy_);
io::binary::BinWrite(str, barcode_distribution_);
io::binary::BinWrite(str, taxid_distribution_);
}
protected:
......@@ -854,14 +862,22 @@ namespace barcode_index {
taxonomy_ = BinRead<uint64_t>(str); //only need to save taxonomy, not taxonomy_distribution
barcode_distribution_.clear();
size_t size;
taxid_distribution_.clear();
size_t size, taxid_size;
BinRead(str, size);
BinRead(str, taxid_size);
for (size_t i = 0; i < size; ++i) {
BarcodeId barcode;
FrameBarcodeInfo info(number_of_frames_);
BinRead(str, barcode, info);
barcode_distribution_.insert({std::move(barcode), std::move(info)});
}
for (size_t i = 0; i < taxid_size; ++i) {
TaxId taxid;
FrameBarcodeInfo info(number_of_frames_);
BinRead(str, taxid, info);
taxid_distribution_.insert({std::move(taxid), std::move(info)});
}
}
void BinWrite(std::ostream &str) const override {
......@@ -872,10 +888,15 @@ namespace barcode_index {
BinWrite(str, taxonomy_);
size_t size = barcode_distribution_.size();
size_t taxid_size = taxid_distribution_.size();
BinWrite(str, size);
BinWrite(str, taxid_size);
for (const auto &entry : barcode_distribution_) {
BinWrite(str, entry.first, entry.second);
}
for (const auto &entry : taxid_distribution_) {
BinWrite(str, entry.first, entry.second);
}
}
protected:
......
......@@ -178,6 +178,8 @@ public:
private:
const Graph &g_;
//barcode_index::FrameBarcodeIndexInfoExtractor non_point_barcode_extractor_;
//std::shared_ptr<barcode_index::FrameBarcodeIndexInfoExtractor> old_barcode_extractor_;
std::shared_ptr<barcode_index::FrameBarcodeIndexInfoExtractor> barcode_extractor_;
std::size_t max_threads_;
};
......
......@@ -116,54 +116,42 @@ TaxaBreakPredicate::TaxaBreakPredicate(
: g_(g_),
barcode_extractor_(barcode_extractor) {}
bool TaxaBreakPredicate::Check(const ScaffoldEdge &scaffold_edge) const {
//barcode_extractor_. ;
static const std::string null_taxa = "0";
auto first = scaffold_edge.getStart();
auto second = scaffold_edge.getEnd();
DEBUG_SAM("In TaxaBreakPredicate, Edge_id: " << scaffold_edge.getId() << " length: " << scaffold_edge.getLength());
auto seq1 = first.GetSequence(g_);
auto seq2 = second.GetSequence(g_);
DEBUG_SAM("Vertex_1_id: " << first.int_id() << " Taxonomy: " << barcode_extractor_->GetTaxaTreeFromEdge(first.int_id()) << " Length: " << seq1.size());
DEBUG_SAM("Vertex_2_id: " << second.int_id() << " Taxonomy: " << barcode_extractor_->GetTaxaTreeFromEdge(second.int_id()) << " Length: " << seq2.size());
std::string taxatree_1;
EdgeId edge_1 = first.int_id();
if (barcode_extractor_->EdgeIsValid(edge_1)) {
taxatree_1 = barcode_extractor_->GetTaxaTreeFromEdge(edge_1);
std::string TaxaBreakPredicate::GetTaxaTree(const scaffold_graph::ScaffoldVertex &scaff_vertex) const {
std::string taxatree;
EdgeId edge_int = scaff_vertex.int_id();
if (barcode_extractor_->EdgeIsValid(edge_int)) {
taxatree = barcode_extractor_->GetTaxaTreeFromEdge(edge_int);
}
else if (barcode_extractor_->EdgeIsValid(g_.conjugate(edge_1))) {
DEBUG_SAM("No taxatree for vector: " << first.int_id() << ", conjugate was fine.");
taxatree_1 = barcode_extractor_->GetTaxaTreeFromEdge(g_.conjugate(edge_1));
else if (barcode_extractor_->EdgeIsValid(scaff_vertex.GetConjugateFromGraph(g_).int_id())) {
DEBUG("No taxatree for vector: " << edge_int << ", conjugate was fine.");
taxatree = barcode_extractor_->GetTaxaTreeFromEdge(scaff_vertex.GetConjugateFromGraph(g_).int_id());
}
else {
INFO("Using taxatree of 0. No taxatree entry found for vertex or conjugate! ERROR!!!");
taxatree_1 = "0";
INFO("Using taxatree of 0. No taxatree entry found for vertex or conjugate! ERROR!!! Vertex:" << edge_int);
taxatree = "0";
DEBUG("Vertex_1_id: " << edge_int << " Taxonomy: " << taxatree << " Length: "
<< barcode_extractor_->GetEdgeLength(edge_int));
}
return taxatree;
}
std::string taxatree_2;
EdgeId edge_2 = second.int_id();
if (barcode_extractor_->EdgeIsValid(edge_2)) {
taxatree_2 = barcode_extractor_->GetTaxaTreeFromEdge(edge_2);
}
else if (barcode_extractor_->EdgeIsValid(g_.conjugate(edge_2))) {
DEBUG_SAM("No taxatree for vector: " << first.int_id() << ", conjugate was fine.");
taxatree_2 = barcode_extractor_->GetTaxaTreeFromEdge(g_.conjugate(edge_2));
}
else {
INFO("Using taxatree of 0. No taxatree entry found for vertex or conjugate! Should not happen! ERROR!!!");
taxatree_2 = "0";
}
bool TaxaBreakPredicate::Check(const ScaffoldEdge &scaffold_edge) const {
static const std::string null_taxa = "0";
auto first = scaffold_edge.getStart();
auto second = scaffold_edge.getEnd();
std::string taxatree_1 = GetTaxaTree(first);
std::string taxatree_2 = GetTaxaTree(second);
if (taxatree_1 == null_taxa || taxatree_2 == null_taxa ||
taxatree_1.find(taxatree_2) != std::string::npos || taxatree_2.find(taxatree_1) != std::string::npos) {
DEBUG("Taxatree match!");
DEBUG("Taxatree match! edges: " << first.int_id() << ", " << second.int_id());
return true;
}
else {
DEBUG("Taxatree mismatch!");
DEBUG("Taxatree mismatch! edges: " << first.int_id() << ", " << second.int_id());
return false;
}
return false;
}
EdgeSplitPredicate::EdgeSplitPredicate(
......
......@@ -114,6 +114,7 @@ public:
TaxaBreakPredicate(const Graph &g_,
std::shared_ptr<barcode_index::FrameBarcodeIndexInfoExtractor> barcode_extractor);
std::string GetTaxaTree(const scaffold_graph::ScaffoldVertex &scaff_vertex) const;
bool Check(const ScaffoldEdge &scaffold_edge) const override;
private:
......
......@@ -309,18 +309,24 @@ std::vector<std::shared_ptr<IterativeScaffoldGraphConstructorCaller>> MergingSca
const double EDGE_LENGTH_FRACTION = 0.5;
auto fraction_tail_threshold_getter =
std::make_shared<barcode_index::FractionTailThresholdGetter>(g_, EDGE_LENGTH_FRACTION);
//DEBUG_SAM("1: barcode_extractor_ ref count: " << barcode_extractor_.use_count());
auto split_scaffold_vertex_index = helper.ConstructScaffoldVertexIndex(g_, *barcode_extractor_,
fraction_tail_threshold_getter,
count_threshold, length_threshold,
max_threads_, scaffold_vertices);
//DEBUG_SAM("2: barcode_extractor_ ref count: " << barcode_extractor_.use_count());
auto split_scaffold_index_extractor =
std::make_shared<barcode_index::SimpleScaffoldVertexIndexInfoExtractor>(split_scaffold_vertex_index);
//DEBUG_SAM("3: barcode_extractor_ ref count: " << barcode_extractor_.use_count());
iterative_constructor_callers.push_back(std::make_shared<EdgeSplitConstructorCaller>(g_,
split_scaffold_index_extractor,
max_threads_));
//DEBUG_SAM("4: barcode_extractor_ ref count: " << barcode_extractor_.use_count());
iterative_constructor_callers.push_back(std::make_shared<TransitiveConstructorCaller>(g_, max_threads_));
iterative_constructor_callers.push_back(
std::make_shared<TaxaBreakConstructorCaller>(g_, barcode_extractor_, max_threads_));
//DEBUG_SAM("5: barcode_extractor_ ref count: " << barcode_extractor_.use_count());
//iterative_constructor_callers.push_back(
// std::make_shared<TaxaBreakConstructorCaller>(g_, barcode_extractor_, max_threads_));
//DEBUG_SAM("6: barcode_extractor_ ref count: " << barcode_extractor_.use_count());
return iterative_constructor_callers;
}
} //path_extend
......
......@@ -19,4 +19,34 @@ namespace debruijn_graph {
DECL_LOGGER("BarcodeMapConstrusctionStage")
};
std::vector<std::vector<TaxId>> lca_preprocessing(const std::vector<string>& taxatree_str_vec,
std::vector<size_t>& count_vec, size_t& null_counts, size_t& total_counts, size_t& longest_lineage,
const FrameBarcodeIndexInfoExtractor& extractor);
TaxId fuzzy_lca(const std::vector<string>& taxatree_str_vec, std::vector<size_t> count_vec,
const FrameBarcodeIndexInfoExtractor& extractor, const double& max_drilldown_ratio=0.85,
const double& max_mismatch_ratio=0.05);
TaxId majority_vote_lca(std::vector<string>& taxatree_str_vec, std::vector<size_t>& count_vec,
const FrameBarcodeIndexInfoExtractor& extractor, double maj_frac=0.5);
TaxId last_common_ancestor(std::vector<string> &taxatree_vec, std::vector<size_t> &count_vec,
const FrameBarcodeIndexInfoExtractor& extractor);
void fill_taxatree_and_count_vectors_from_edge(const FrameBarcodeIndexInfoExtractor& extractor,
std::vector<string>& taxatree_vector,
std::vector<size_t>& count_vector, EdgeId edge);
void test_various_lca_settings(barcode_index::FrameBarcodeIndex<debruijn_graph::DeBruijnGraph>& barcodeindex,
const FrameBarcodeIndexInfoExtractor& extractor, const string& file_name);
void write_contigs_to_fasta(const barcode_index::FrameBarcodeIndex<debruijn_graph::DeBruijnGraph>& barcodeindex,
const FrameBarcodeIndexInfoExtractor& extractor, const string& file_name);
void assign_taxonomy_to_edges(barcode_index::FrameBarcodeIndex<debruijn_graph::DeBruijnGraph>& barcodeindex,
const FrameBarcodeIndexInfoExtractor& extractor);
void assure_lca_taxatree_in_taxatree_codes(const std::vector<string> &taxatree_vector, const TaxId &lca,
barcode_index::FrameBarcodeIndex<debruijn_graph::DeBruijnGraph> &barcodeindex);
} //debruijn_graph
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment