Commit a3d9211c authored by Overduin, Sam's avatar Overduin, Sam Committed by Overduin, Sam
Browse files

Impl. LCA for PathVertex::GetTaxaTree, TaxaBreak during scaffolding.



scaffold_vertex.hpp & cpp:
- Used Fuzzy LCA for PathVertex::GetTaxaTree with max_drilldown_ratio=0.4,
  max_mismatch_ratio=0.03 and min_assignment_ratio=0.3. This code is in
  barcode_index_construction.cpp

barcode_index.hpp & barcode_index_builder.hpp:
- added code to deal with "_SUBSTR(0,106)" at end of read names (influenced
 taxatrees).

launcher.cpp:
- Removed LCA parameter testing to start of repeat resolving step

read_cloud_connection_conditions.cpp & hpp:
- Output which edge connections (or path connections) are being broken in
  logfile.

scaffold_graph_construction_pipeline.cpp:
- Moved TaxaBreakConstructor to bottom of pipelines for more informative
  breaks.

path_scaffolder.cpp:
- map.at() replaced with map[] if item not within map. Added debuggin msg.

barcode_index_construction.cpp:
- Added function to get LCA for PathScaffoldVertex (more conservative than
  reads-> contig LCA).
- Started prototype code for taxonomy assignment per edges (eg. using CAT)
Signed-off-by: Overduin, Sam's avatarOverduin, Sam <sam.overduin@wur.nl>
parent 74d6406f
......@@ -7,6 +7,7 @@
#include "scaffold_vertex.hpp"
#include "assembly_graph/paths/bidirectional_path_io/io_support.hpp"
#include "projects/spades/barcode_index_construction.hpp"
namespace scaffold_graph {
......@@ -190,6 +191,7 @@ size_t PathVertex::GetSize() const {
}
// Consider changing below to more of an LCA type deal with drilldowns etc.
// TODO: delete this function, now done LCA with more stringent parameters than reads -> contig LCA
std::string PathVertex::TaxaTreeFromCountMap(std::unordered_map<std::string, size_t> &taxatree_counts) const {
double frac_match = 0.9;
std::string taxatree;
......@@ -224,7 +226,8 @@ std::string PathVertex::TaxaTreeFromCountMap(std::unordered_map<std::string, siz
return taxatree;
}
std::string PathVertex::GetTaxaTree(const debruijn_graph::Graph &g, std::shared_ptr<barcode_index::FrameBarcodeIndexInfoExtractor> barcode_extractor_) const {
std::string PathVertex::GetTaxaTree(const debruijn_graph::Graph &g,
std::shared_ptr<barcode_index::FrameBarcodeIndexInfoExtractor> barcode_extractor_) const {
TRACE("Extracting taxatree from path");
std::unordered_map<std::string, size_t> taxatree_counts;
for (const auto &edge: *path_) {
......@@ -247,7 +250,8 @@ std::string PathVertex::GetTaxaTree(const debruijn_graph::Graph &g, std::shared_
++taxatree_counts[taxatree];
}
std::string taxatree_result = TaxaTreeFromCountMap(taxatree_counts);
//std::string taxatree_result = TaxaTreeFromCountMap(taxatree_counts);
std::string taxatree_result = debruijn_graph::lca_taxatree_from_map(taxatree_counts, *barcode_extractor_);
return taxatree_result;
}
......
......@@ -90,6 +90,9 @@ namespace barcode_index {
void AddTaxaTree(string &taxatree) {
auto it = codes_.find(taxatree);
if (it == codes_.end()) {
if (taxatree.find("SUBSTR") != std::string::npos) {
INFO("SUBSTR found in AddTaxaTree: " << taxatree);
}
TaxId taxid = ParseTaxId(taxatree);
codes_[taxatree] = taxid;
codes_rev_[taxid] = taxatree;
......
......@@ -202,13 +202,18 @@ namespace barcode_index {
return taxatree;
}
}
return "";
return "0";
}
string GetTaxaTreeFromStartPos(const size_t start_pos, const string& read_id) {
static char underscore = '_';
// Can't be underscore, after error correction sometimes reads like:
// K00374:82:hkjyvbbxx:5:2228:2595:37185 BH:ok TaxaTree:1.131567.2.1783272.1239.91061.1385.186817.1386_SUBSTR(0,106)
// no idea why
string result = "";
DEBUG("Start_param: " << read_id << "|" << start_pos);
for (auto it = read_id.begin() + start_pos; it != read_id.end(); ++it) {
if (std::isspace(*it)) {
if (std::isspace(*it) || (*it) == underscore) {
return result;
}
result.push_back(*it);
......
......@@ -29,7 +29,12 @@ void SimplePathScaffolder::CondenseSimplePaths(const std::vector<ScaffoldEdge> &
if (merge_connections.find(end_conjugate) == merge_connections.end() or
merge_connections.at(end_conjugate) != start_conjugate) {
WARN("Conjugate connection does not correspond to direct connection")
merge_connections.at(end_conjugate) = start_conjugate;
if (merge_connections.find(end_conjugate) == merge_connections.end()) {
DEBUG_SAM("Adding conjugates to merge connection");
merge_connections[end_conjugate] = start_conjugate;
} else {
merge_connections.at(end_conjugate) = start_conjugate;
}
} else {
merge_connections.insert({end_conjugate, start_conjugate});
}
......
......@@ -588,9 +588,9 @@ void PathExtendLauncher::ScaffoldPaths(PathContainer &paths) const {
}
void PathExtendLauncher::Launch() {
INFO("Making LCA test parameter file");
INFO("Making LCA edges file");
FrameBarcodeIndexInfoExtractor extractor(gp_.barcode_mapper, gp_.g);
test_various_lca_settings(gp_.barcode_mapper, extractor, "taxonomy_edges_tests.fasta");
// test_various_lca_settings(gp_.barcode_mapper, extractor, "taxonomy_edges_tests.fasta");
write_contigs_to_fasta(gp_.barcode_mapper, extractor, "taxonomy_of_edges.fasta");
INFO("ExSPAnder repeat resolving tool started");
......
......@@ -9,6 +9,7 @@
#include "read_cloud_dijkstras.hpp"
#include "modules/path_extend/pipeline/launcher.hpp"
namespace path_extend {
namespace read_cloud {
......@@ -114,7 +115,28 @@ TaxaBreakPredicate::TaxaBreakPredicate(
const Graph &g_,
std::shared_ptr<barcode_index::FrameBarcodeIndexInfoExtractor> barcode_extractor)
: g_(g_),
barcode_extractor_(barcode_extractor) {}
barcode_extractor_(barcode_extractor) {
// implement thread safe output file for writing broken edges
// https://stackoverflow.com/questions/33596910/c-how-to-have-multiple-threads-write-to-a-file
// string out_location = fs::append_path(cfg::get().output_dir, "broken_edge_connections.txt");
// out_lca_file_.open(out_location);
// out_lca_file_ptr_ = &out_lca_file_;
// // std::make_shared<std::ofstream>(out_lca_file_);
// out_lca_file_ << "Starting TaxaBreakPredicate." << std::endl;
}
string TaxaBreakPredicate::GetEdgesString(const ScaffoldGraph::ScaffoldGraphVertex &scaff_vertex) const {
string edges_str = "";
std::unordered_set<EdgeId> edges_lst = scaff_vertex.GetAllEdges();
auto it = edges_lst.begin();
edges_str += std::to_string(it->int_id());
++it;
while (it != edges_lst.end()){
edges_str += ";" + std::to_string(it->int_id());
++it;
}
return edges_str;
}
bool TaxaBreakPredicate::Check(const ScaffoldEdge &scaffold_edge) const {
static const std::string null_taxa = "0";
......@@ -126,11 +148,16 @@ bool TaxaBreakPredicate::Check(const ScaffoldEdge &scaffold_edge) const {
if (taxatree_1 == null_taxa || taxatree_2 == null_taxa ||
taxatree_1.find(taxatree_2) != std::string::npos || taxatree_2.find(taxatree_1) != std::string::npos) {
DEBUG("Taxatree match! edges: " << first.int_id() << ", " << second.int_id());
DEBUG("Taxatree match! edges: " << GetEdgesString(first) << " and " << GetEdgesString(second));
return true;
}
else {
DEBUG("Taxatree mismatch! edges: " << first.int_id() << ", " << second.int_id());
DEBUG("Taxatree mismatch! edges: " << GetEdgesString(first) << " and " << GetEdgesString(second));
string out_string = "Broke: " + GetEdgesString(first) + " and " + GetEdgesString(second)+ "\n";
INFO(out_string);
//(*const_cast<std::ofstream*>(out_lca_file_ptr_)) << out_string;
//https://stackoverflow.com/questions/33596910/c-how-to-have-multiple-threads-write-to-a-file
//(*std::const_pointer_cast<const std::ofstream>(out_lca_file_ptr_)) << "Broke: " << GetEdgesString(first) << " and " << GetEdgesString(second) << std::endl;
return false;
}
}
......
......@@ -113,12 +113,15 @@ public:
typedef barcode_index::BarcodeId BarcodeId;
TaxaBreakPredicate(const Graph &g_,
std::shared_ptr<barcode_index::FrameBarcodeIndexInfoExtractor> barcode_extractor);
string GetEdgesString(const ScaffoldGraph::ScaffoldGraphVertex &scaff_vertex) const;
bool Check(const ScaffoldEdge &scaffold_edge) const override;
private:
const Graph &g_;
std::shared_ptr<barcode_index::FrameBarcodeIndexInfoExtractor> barcode_extractor_;
std::ofstream out_lca_file_;
//std::ofstream *out_lca_file_ptr_helper_ = &out_lca_file_;
std::ofstream *out_lca_file_ptr_;
DECL_LOGGER("TaxaBreakPredicate");
};
......
......@@ -230,7 +230,7 @@ std::vector<std::shared_ptr<IterativeScaffoldGraphConstructorCaller>> FullScaffo
ConstructSimpleEdgeIndex(scaffold_vertices, barcode_extractor_, params, max_threads_);
std::vector<std::shared_ptr<IterativeScaffoldGraphConstructorCaller>> iterative_constructor_callers;
iterative_constructor_callers.push_back(std::make_shared<TaxaBreakConstructorCaller>(gp_.g, barcode_extractor_, max_threads_));
iterative_constructor_callers.push_back(std::make_shared<BarcodeScoreConstructorCaller>(gp_.g, barcode_extractor_,
scaffold_index_extractor,
max_threads_));
......@@ -243,7 +243,7 @@ std::vector<std::shared_ptr<IterativeScaffoldGraphConstructorCaller>> FullScaffo
unique_storage_, search_parameter_pack_,
read_cloud_configs_.scaff_con, max_threads_,
scaffolding_mode));
iterative_constructor_callers.push_back(std::make_shared<TaxaBreakConstructorCaller>(gp_.g, barcode_extractor_, max_threads_));
const size_t min_pipeline_length = read_cloud_configs_.long_edge_length_lower_bound;
bool launch_full_pipeline = min_length_ > min_pipeline_length;
......@@ -314,11 +314,12 @@ std::vector<std::shared_ptr<IterativeScaffoldGraphConstructorCaller>> MergingSca
max_threads_, scaffold_vertices);
auto split_scaffold_index_extractor =
std::make_shared<barcode_index::SimpleScaffoldVertexIndexInfoExtractor>(split_scaffold_vertex_index);
iterative_constructor_callers.push_back(std::make_shared<TaxaBreakConstructorCaller>(g_, barcode_extractor_, max_threads_));
iterative_constructor_callers.push_back(std::make_shared<EdgeSplitConstructorCaller>(g_,
split_scaffold_index_extractor,
max_threads_));
iterative_constructor_callers.push_back(std::make_shared<TransitiveConstructorCaller>(g_, max_threads_));
iterative_constructor_callers.push_back(std::make_shared<TaxaBreakConstructorCaller>(g_, barcode_extractor_, max_threads_));
return iterative_constructor_callers;
}
} //path_extend
......
......@@ -19,6 +19,16 @@ namespace debruijn_graph {
return has_read_clouds;
}
//TODO: remove, not used anywhere. Maybe someday in the future
void taxatree_to_edges_from_file(std::string filename,
barcode_index::FrameBarcodeIndex<debruijn_graph::DeBruijnGraph> &barcodeindex) {
std::ifstream in_lca_file(filename);
std::string line;
while(std::getline(in_lca_file, line)){
//edge_1, edge_2, taxatree = parsed_info
}
}
std::vector<std::vector<TaxId>>
lca_preprocessing(const std::vector<string> &taxatree_str_vec, std::vector<size_t> &count_vec, size_t &null_counts,
size_t &total_counts, size_t &longest_lineage, const FrameBarcodeIndexInfoExtractor &extractor) {
......@@ -45,9 +55,7 @@ namespace debruijn_graph {
TaxId fuzzy_lca(const std::vector<string> &taxatree_str_vec, std::vector<size_t> count_vec,
const FrameBarcodeIndexInfoExtractor &extractor, const double &max_drilldown_ratio,
const double &max_mismatch_ratio) {
double minimum_assign_ratio = 0.2;
const double &max_mismatch_ratio, double minimum_assign_ratio) {
//max_drilldown_ratio=ratio of non-0 taxa to allow at a deeper taxid in same lineage
//max_mismatch_ratio=ratio of non-0 taxa to allow as a mismatch
......@@ -188,6 +196,21 @@ namespace debruijn_graph {
return lca;
}
std::string lca_taxatree_from_map(std::unordered_map<std::string, size_t> &taxatree_map,
const FrameBarcodeIndexInfoExtractor &extractor) {
// wrapper to change underlying lca_algorithm.
std::vector<string> taxatree_vec;
std::vector<size_t> count_vec;
for (auto it = taxatree_map.begin(); it != taxatree_map.end(); it++) {
taxatree_vec.push_back(it->first);
count_vec.push_back(it->second);
}
TaxId lca = fuzzy_lca(taxatree_vec, count_vec, extractor, 0.4, 0.03, 0.3);
std::string taxatree = taxatree_from_lca_and_vec(taxatree_vec, lca);
return taxatree;
}
void fill_taxatree_and_count_vectors_from_edge(const FrameBarcodeIndexInfoExtractor &extractor,
std::vector<string> &taxatree_vector,
std::vector<size_t> &count_vector, EdgeId edge) {
......@@ -199,6 +222,9 @@ namespace debruijn_graph {
taxid_it++ ) {
taxid = taxid_it->first;
taxatree = extractor.TaxaTreeFromTaxId(taxid);
if (taxatree.find("SUBSTR") != std::string::npos) {
INFO("SUBSTR found in fill: " << taxatree);
}
auto taxatree_match = find(taxatree_vector.begin(), taxatree_vector.end(), taxatree);
if (taxatree_match == taxatree_vector.end()) {
taxatree_vector.push_back(taxatree);
......@@ -212,18 +238,34 @@ namespace debruijn_graph {
}
}
void assure_lca_taxatree_in_taxatree_codes(const std::vector<string> &taxatree_vector, const TaxId &lca,
barcode_index::FrameBarcodeIndex<debruijn_graph::DeBruijnGraph> &barcodeindex) {
std::string taxatree_from_lca_and_vec(const std::vector<string> &taxatree_vector, const TaxId &lca) {
//Get taxatree for LCA
string lca_taxatree;
string lca_str = std::to_string(lca);
size_t lca_pos;
if (lca_str == "0") return lca_str;
for (const string& taxatree : taxatree_vector) {
size_t lca_pos = taxatree.find(lca_str);
lca_pos = taxatree.find(lca_str);
if (lca_pos != string::npos){
lca_taxatree = taxatree.substr(0, lca_pos + lca_str.size());
break;
}
}
return lca_taxatree;
}
void assure_lca_taxatree_in_taxatree_codes(const std::vector<string> &taxatree_vector, const TaxId &lca,
barcode_index::FrameBarcodeIndex<debruijn_graph::DeBruijnGraph> &barcodeindex) {
//Get taxatree for LCA
string lca_taxatree = taxatree_from_lca_and_vec(taxatree_vector, lca);
if (lca_taxatree == "0") return;
if (lca_taxatree.find("SUBSTR") != std::string::npos) {
INFO("SUBSTR found in assure: " << lca_taxatree << " lca_taxatree: " << lca_taxatree);
}
//INFO("Inserting taxatree: " << lca_taxatree << " lca_pos: " << lca_pos << " size: " << lca_str.size());
//Insert lca_taxatree into taxatree_codes_ otherwise
barcodeindex.taxatree_codes_.AddTaxaTree(lca_taxatree);
}
......@@ -370,7 +412,7 @@ namespace debruijn_graph {
INFO("Starting taxonomic assignment to all edges.")
assign_taxonomy_to_edges(graph_pack.barcode_mapper, extractor);
INFO("Taxonomy assigned to all edges.");
test_various_lca_settings(graph_pack.barcode_mapper, extractor, "taxonomy_edges_tests.fasta");
//test_various_lca_settings(graph_pack.barcode_mapper, extractor, "taxonomy_edges_tests.fasta");
write_contigs_to_fasta(graph_pack.barcode_mapper, extractor, "taxonomy_of_edges.fasta");
INFO("Wrote taxonomy_of_edges.fasta in Kmer dir.")
size_t length_threshold = cfg::get().pe_params.read_cloud.long_edge_length_lower_bound;
......
......@@ -19,13 +19,16 @@ namespace debruijn_graph {
DECL_LOGGER("BarcodeMapConstrusctionStage")
};
void taxatree_to_edges_from_file(string filename,
barcode_index::FrameBarcodeIndex<debruijn_graph::DeBruijnGraph> &barcodeindex);
std::vector<std::vector<TaxId>> lca_preprocessing(const std::vector<string>& taxatree_str_vec,
std::vector<size_t>& count_vec, size_t& null_counts, size_t& total_counts, size_t& longest_lineage,
const FrameBarcodeIndexInfoExtractor& extractor);
TaxId fuzzy_lca(const std::vector<string>& taxatree_str_vec, std::vector<size_t> count_vec,
const FrameBarcodeIndexInfoExtractor& extractor, const double& max_drilldown_ratio=0.85,
const double& max_mismatch_ratio=0.05);
const double& max_mismatch_ratio=0.05, double minimum_assign_ratio=0.2);
TaxId majority_vote_lca(std::vector<string>& taxatree_str_vec, std::vector<size_t>& count_vec,
const FrameBarcodeIndexInfoExtractor& extractor, double maj_frac=0.5);
......@@ -33,6 +36,9 @@ namespace debruijn_graph {
TaxId last_common_ancestor(std::vector<string> &taxatree_vec, std::vector<size_t> &count_vec,
const FrameBarcodeIndexInfoExtractor& extractor);
std::string lca_taxatree_from_map(std::unordered_map<std::string, size_t> &taxatree_map,
const FrameBarcodeIndexInfoExtractor &extractor);
void fill_taxatree_and_count_vectors_from_edge(const FrameBarcodeIndexInfoExtractor& extractor,
std::vector<string>& taxatree_vector,
std::vector<size_t>& count_vector, EdgeId edge);
......@@ -46,6 +52,8 @@ namespace debruijn_graph {
void assign_taxonomy_to_edges(barcode_index::FrameBarcodeIndex<debruijn_graph::DeBruijnGraph>& barcodeindex,
const FrameBarcodeIndexInfoExtractor& extractor);
std::string taxatree_from_lca_and_vec(const std::vector<string> &taxatree_vector, const TaxId &lca);
void assure_lca_taxatree_in_taxatree_codes(const std::vector<string> &taxatree_vector, const TaxId &lca,
barcode_index::FrameBarcodeIndex<debruijn_graph::DeBruijnGraph> &barcodeindex);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment