Commit c8303759 authored by Overduin, Sam's avatar Overduin, Sam Committed by Overduin, Sam
Browse files

Impl. extra BarcodeIndex serialisation, use conjugate edges & edges to fasta output

barcode_index.hpp:
- Implemented taxa_codes + taxonomy_ serialisation
- Made taxa_codes::map's public

barcode_info_extractor.hpp:
- Added funcs: EdgeIsValid, GetConjugateEdge, GetSequence, GetEdgeLength

barcode_index_construction.cpp:
- Added write_contigs_to_fasta for testing purposes.
- Re-implemented taxonomy assignment to use edge & conjugate.

read_cloud_connection_conditions.cpp:
- In TaxaBreakPred.: if taxonomy of edge not found, try taxonomy of
  conjugate edge. Else taxa 0.

logger.hpp:
- Created special DEBUG_SAM for personal logging. Will change at end.
parent 5b1631c0
...@@ -78,9 +78,9 @@ namespace barcode_index { ...@@ -78,9 +78,9 @@ namespace barcode_index {
}; };
class TaxIdEncoder { class TaxIdEncoder {
public:
std::unordered_map <string, TaxId> codes_; std::unordered_map <string, TaxId> codes_;
std::unordered_map <TaxId, string> codes_rev_; std::unordered_map <TaxId, string> codes_rev_;
public:
TaxIdEncoder(): TaxIdEncoder():
codes_(), codes_(),
codes_rev_() codes_rev_()
...@@ -127,7 +127,7 @@ namespace barcode_index { ...@@ -127,7 +127,7 @@ namespace barcode_index {
} }
string GetTaxaTree (const TaxId& taxid) const { string GetTaxaTree (const TaxId& taxid) const {
VERIFY(codes_rev_.find(taxid) != codes_rev_.end()); VERIFY_MSG(codes_rev_.find(taxid) != codes_rev_.end(), "In GetTaxaTree: taxid not in codes_rev");
return codes_rev_.at(taxid); return codes_rev_.at(taxid);
} }
...@@ -277,21 +277,43 @@ namespace barcode_index { ...@@ -277,21 +277,43 @@ namespace barcode_index {
using io::binary::BinRead; using io::binary::BinRead;
edge_to_entry_.clear(); edge_to_entry_.clear();
size_t size; size_t edge_to_entry_size;
BinRead(str, size); BinRead(str, edge_to_entry_size);
for (size_t i = 0; i < size; ++i) { size_t taxa_codes_size;
BinRead(str, taxa_codes_size);
size_t taxa_rev_codes_size;
BinRead(str, taxa_rev_codes_size);
for (size_t i = 0; i < edge_to_entry_size; ++i) {
EdgeId edge_id = BinRead<uint64_t>(str); EdgeId edge_id = BinRead<uint64_t>(str);
auto entry = BinRead<EdgeEntryT>(str); auto entry = BinRead<EdgeEntryT>(str);
edge_to_entry_.insert({std::move(edge_id), std::move(entry)}); edge_to_entry_.insert({std::move(edge_id), std::move(entry)});
} }
for (size_t i = 0; i < taxa_codes_size; ++i) {
string taxatree = BinRead<string>(str);
TaxId taxid = BinRead<uint64_t>(str);
taxatree_codes_.codes_.insert({std::move(taxatree), std::move(taxid)});
}
for (size_t i = 0; i < taxa_rev_codes_size; ++i) {
TaxId taxid = BinRead<uint64_t>(str);
string taxatree = BinRead<string>(str);
taxatree_codes_.codes_rev_.insert({std::move(taxid), std::move(taxatree)});
}
} }
virtual void BinWrite(std::ostream &str) const { virtual void BinWrite(std::ostream &str) const {
using io::binary::BinWrite; using io::binary::BinWrite;
BinWrite(str, edge_to_entry_.size()); BinWrite(str, edge_to_entry_.size());
BinWrite(str, taxatree_codes_.codes_.size());
BinWrite(str, taxatree_codes_.codes_rev_.size());
for (const auto &edge_and_entry: edge_to_entry_) { for (const auto &edge_and_entry: edge_to_entry_) {
BinWrite(str, edge_and_entry.first.int_id(), edge_and_entry.second); BinWrite(str, edge_and_entry.first.int_id(), edge_and_entry.second);
} }
for (const auto &taxatree_codes_entry: taxatree_codes_.codes_) {
BinWrite(str, taxatree_codes_entry.first, taxatree_codes_entry.second);
}
for (const auto &taxatree_codes_rev_entry: taxatree_codes_.codes_rev_) {
BinWrite(str, taxatree_codes_rev_entry.first, taxatree_codes_rev_entry.second);
}
} }
typename barcode_map_t::const_iterator GetEntryTailsIterator(const EdgeId& edge) const { typename barcode_map_t::const_iterator GetEntryTailsIterator(const EdgeId& edge) const {
...@@ -667,10 +689,12 @@ namespace barcode_index { ...@@ -667,10 +689,12 @@ namespace barcode_index {
} }
virtual void BinRead(std::istream &str) { virtual void BinRead(std::istream &str) {
taxonomy_ = io::binary::BinRead<uint64_t>(str);
barcode_distribution_ = io::binary::BinRead<barcode_distribution_t>(str); barcode_distribution_ = io::binary::BinRead<barcode_distribution_t>(str);
} }
virtual void BinWrite(std::ostream &str) const { virtual void BinWrite(std::ostream &str) const {
io::binary::BinWrite(str, taxonomy_);
io::binary::BinWrite(str, barcode_distribution_); io::binary::BinWrite(str, barcode_distribution_);
} }
...@@ -783,6 +807,7 @@ namespace barcode_index { ...@@ -783,6 +807,7 @@ namespace barcode_index {
using EdgeEntry<Graph, FrameBarcodeInfo>::taxid_distribution_t; using EdgeEntry<Graph, FrameBarcodeInfo>::taxid_distribution_t;
using EdgeEntry<Graph, FrameBarcodeInfo>::taxid_distribution_; using EdgeEntry<Graph, FrameBarcodeInfo>::taxid_distribution_;
using EdgeEntry<Graph, FrameBarcodeInfo>::edge_; using EdgeEntry<Graph, FrameBarcodeInfo>::edge_;
using EdgeEntry<Graph, FrameBarcodeInfo>::taxonomy_;
size_t edge_length_; size_t edge_length_;
size_t frame_size_; size_t frame_size_;
size_t number_of_frames_; size_t number_of_frames_;
...@@ -826,6 +851,7 @@ namespace barcode_index { ...@@ -826,6 +851,7 @@ namespace barcode_index {
edge_length_ = BinRead<size_t>(str); edge_length_ = BinRead<size_t>(str);
frame_size_ = BinRead<size_t>(str); frame_size_ = BinRead<size_t>(str);
number_of_frames_ = BinRead<size_t>(str); number_of_frames_ = BinRead<size_t>(str);
taxonomy_ = BinRead<uint64_t>(str); //only need to save taxonomy, not taxonomy_distribution
barcode_distribution_.clear(); barcode_distribution_.clear();
size_t size; size_t size;
...@@ -843,6 +869,7 @@ namespace barcode_index { ...@@ -843,6 +869,7 @@ namespace barcode_index {
BinWrite(str, edge_length_); BinWrite(str, edge_length_);
BinWrite(str, frame_size_); BinWrite(str, frame_size_);
BinWrite(str, number_of_frames_); BinWrite(str, number_of_frames_);
BinWrite(str, taxonomy_);
size_t size = barcode_distribution_.size(); size_t size = barcode_distribution_.size();
BinWrite(str, size); BinWrite(str, size);
......
...@@ -148,8 +148,24 @@ namespace barcode_index { ...@@ -148,8 +148,24 @@ namespace barcode_index {
return result; return result;
} }
bool EdgeIsValid(const EdgeId& edge) const {
return (index_.edge_to_entry_.find(edge) != index_.edge_to_entry_.end());
}
EdgeId GetConjugateEdge(const EdgeId& edge) const {
return g_.conjugate(edge);
}
string GetSequence(const EdgeId& edge) const {
return g_.EdgeNucls(edge).str();
}
size_t GetEdgeLength(const EdgeId& edge) const {
return g_.EdgeNucls(edge).str().length();
}
size_t GetTaxidCount(const EdgeId& edge_id_, const TaxId& taxid) const { size_t GetTaxidCount(const EdgeId& edge_id_, const TaxId& taxid) const {
DEBUG("Get Taxid from EdgeId: " << edge_id_);
return index_.edge_to_entry_.at(edge_id_).taxid_distribution_.at(taxid).GetCount(); return index_.edge_to_entry_.at(edge_id_).taxid_distribution_.at(taxid).GetCount();
} }
...@@ -162,9 +178,11 @@ namespace barcode_index { ...@@ -162,9 +178,11 @@ namespace barcode_index {
} }
string GetTaxaTreeFromEdge(const EdgeId& edge_id_) const { string GetTaxaTreeFromEdge(const EdgeId& edge_id_) const {
//INFO("EdgeId: " << edge_id_); DEBUG("GetTaxaTreeFromEdge EdgeId: " << edge_id_);
VERIFY_MSG(index_.edge_to_entry_.find(edge_id_) != index_.edge_to_entry_.end(),
"GetTaxaTreeFromEdge: not in edge_to_entry EgdeID = "<<edge_id_);
TaxId taxid = index_.edge_to_entry_.at(edge_id_).GetTaxonomy(); TaxId taxid = index_.edge_to_entry_.at(edge_id_).GetTaxonomy();
//INFO("GetTaxonomy: "<< taxid); DEBUG("GetTaxonomy: "<< taxid);
string taxatree = TaxaTreeFromTaxId(taxid); string taxatree = TaxaTreeFromTaxId(taxid);
return taxatree; return taxatree;
} }
......
...@@ -121,13 +121,39 @@ bool TaxaBreakPredicate::Check(const ScaffoldEdge &scaffold_edge) const { ...@@ -121,13 +121,39 @@ bool TaxaBreakPredicate::Check(const ScaffoldEdge &scaffold_edge) const {
static const std::string null_taxa = "0"; static const std::string null_taxa = "0";
auto first = scaffold_edge.getStart(); auto first = scaffold_edge.getStart();
auto second = scaffold_edge.getEnd(); auto second = scaffold_edge.getEnd();
DEBUG("In TaxaBreakPredicate, Edge_id: " << scaffold_edge.getId() << " length: " << scaffold_edge.getLength()); DEBUG_SAM("In TaxaBreakPredicate, Edge_id: " << scaffold_edge.getId() << " length: " << scaffold_edge.getLength());
auto seq1 = first.GetSequence(g_); auto seq1 = first.GetSequence(g_);
auto seq2 = second.GetSequence(g_); auto seq2 = second.GetSequence(g_);
DEBUG("Vertex_1_id: " << first.int_id() << " Taxonomy: " << barcode_extractor_->GetTaxaTreeFromEdge(first.int_id()) << " Length: " << seq1.size()); DEBUG_SAM("Vertex_1_id: " << first.int_id() << " Taxonomy: " << barcode_extractor_->GetTaxaTreeFromEdge(first.int_id()) << " Length: " << seq1.size());
DEBUG("Vertex_2_id: " << second.int_id() << " Taxonomy: " << barcode_extractor_->GetTaxaTreeFromEdge(second.int_id()) << " Length: " << seq2.size()); DEBUG_SAM("Vertex_2_id: " << second.int_id() << " Taxonomy: " << barcode_extractor_->GetTaxaTreeFromEdge(second.int_id()) << " Length: " << seq2.size());
std::string taxatree_1 = barcode_extractor_->GetTaxaTreeFromEdge(first.int_id());
std::string taxatree_2 = barcode_extractor_->GetTaxaTreeFromEdge(second.int_id()); std::string taxatree_1;
EdgeId edge_1 = first.int_id();
if (barcode_extractor_->EdgeIsValid(edge_1)) {
taxatree_1 = barcode_extractor_->GetTaxaTreeFromEdge(edge_1);
}
else if (barcode_extractor_->EdgeIsValid(g_.conjugate(edge_1))) {
DEBUG_SAM("No taxatree for vector: " << first.int_id() << ", conjugate was fine.");
taxatree_1 = barcode_extractor_->GetTaxaTreeFromEdge(g_.conjugate(edge_1));
}
else {
INFO("Using taxatree of 0. No taxatree entry found for vertex or conjugate! ERROR!!!");
taxatree_1 = "0";
}
std::string taxatree_2;
EdgeId edge_2 = second.int_id();
if (barcode_extractor_->EdgeIsValid(edge_2)) {
taxatree_2 = barcode_extractor_->GetTaxaTreeFromEdge(edge_2);
}
else if (barcode_extractor_->EdgeIsValid(g_.conjugate(edge_2))) {
DEBUG_SAM("No taxatree for vector: " << first.int_id() << ", conjugate was fine.");
taxatree_2 = barcode_extractor_->GetTaxaTreeFromEdge(g_.conjugate(edge_2));
}
else {
INFO("Using taxatree of 0. No taxatree entry found for vertex or conjugate! Should not happen! ERROR!!!");
taxatree_2 = "0";
}
if (taxatree_1 == null_taxa || taxatree_2 == null_taxa || if (taxatree_1 == null_taxa || taxatree_2 == null_taxa ||
taxatree_1.find(taxatree_2) != std::string::npos || taxatree_2.find(taxatree_1) != std::string::npos) { taxatree_1.find(taxatree_2) != std::string::npos || taxatree_2.find(taxatree_1) != std::string::npos) {
...@@ -138,14 +164,6 @@ bool TaxaBreakPredicate::Check(const ScaffoldEdge &scaffold_edge) const { ...@@ -138,14 +164,6 @@ bool TaxaBreakPredicate::Check(const ScaffoldEdge &scaffold_edge) const {
DEBUG("Taxatree mismatch!"); DEBUG("Taxatree mismatch!");
} }
return false; return false;
//TODO: implement taxonomy in barcode_extractor and vertexes!!
//barcode_extractor_->
//get taxonomy for first and second
//first.
//bool result = true;
//return result;
} }
EdgeSplitPredicate::EdgeSplitPredicate( EdgeSplitPredicate::EdgeSplitPredicate(
......
...@@ -136,6 +136,7 @@ inline const char* __scope_source_name() { ...@@ -136,6 +136,7 @@ inline const char* __scope_source_name() {
# define TRACE(message) /* No trace */ # define TRACE(message) /* No trace */
#endif #endif
#define INFO(message) LOG_MSG(logging::L_INFO , message) #define INFO(message) LOG_MSG(logging::L_INFO , message)
#define DEBUG_SAM(message) LOG_MSG(logging::L_INFO, message)
#define START_BANNER(description) \ #define START_BANNER(description) \
do { \ do { \
INFO("Starting " description ", built from " \ INFO("Starting " description ", built from " \
......
...@@ -100,33 +100,81 @@ namespace debruijn_graph { ...@@ -100,33 +100,81 @@ namespace debruijn_graph {
return lca; return lca;
} }
void assign_taxonomy_to_edges(barcode_index::FrameBarcodeIndex<debruijn_graph::DeBruijnGraph>& barcodeindex, void write_contigs_to_fasta(const barcode_index::FrameBarcodeIndex<debruijn_graph::DeBruijnGraph>& barcodeindex,
const FrameBarcodeIndexInfoExtractor& extractor){ const FrameBarcodeIndexInfoExtractor& extractor, const string& file_name) {
std::ofstream fasta_taxa_file;
string fasta_location = fs::append_path(cfg::get().output_dir, file_name);
fasta_taxa_file.open (fasta_location);
std::unordered_set<EdgeId> completed_edges;
for ( auto &p : barcodeindex.edge_to_entry_ ) { for ( auto &p : barcodeindex.edge_to_entry_ ) {
std::vector<string> taxatree_vector; EdgeId edge = p.first; // edgeID p.first; edgeEntry p.second
std::vector<size_t> count_vector; EdgeId conjugate_edge = extractor.GetConjugateEdge(edge);
EdgeId edge; if (completed_edges.find(edge) != completed_edges.end()) {
TaxId taxid; VERIFY_MSG(completed_edges.find(conjugate_edge) != completed_edges.end(),
std::string taxatree; "Conjugate edge should be in completed_edges along with original edge.");
size_t count; continue;
edge = p.first; // edgeID p.first; edgeEntry p.second }
typename FrameBarcodeIndexInfoExtractor::taxid_distribution_t::const_iterator taxid_it; string seq = extractor.GetSequence(edge);
for (taxid_it = extractor.taxid_iterator_begin(edge); taxid_it != extractor.taxid_iterator_end(edge); string taxatree = extractor.GetTaxaTreeFromEdge(edge);
taxid_it++ ) { TaxId taxid = barcodeindex.taxatree_codes_.ParseTaxId(taxatree);
taxid = taxid_it->first; fasta_taxa_file << ">EdgeID=" << edge << " Length=" << seq.length() << " Taxa=" << taxid << " Taxatree=" << taxatree << std::endl;
taxatree = barcodeindex.taxatree_codes_.GetTaxaTree(taxid); fasta_taxa_file << seq << std::endl;
}
}
void fill_taxatree_and_count_vectors_from_edge(const FrameBarcodeIndexInfoExtractor& extractor,
std::vector<string> taxatree_vector,
std::vector<size_t> count_vector, EdgeId edge) {
TaxId taxid;
std::string taxatree;
size_t count;
typename FrameBarcodeIndexInfoExtractor::taxid_distribution_t::const_iterator taxid_it;
for (taxid_it = extractor.taxid_iterator_begin(edge); taxid_it != extractor.taxid_iterator_end(edge);
taxid_it++ ) {
taxid = taxid_it->first;
taxatree = extractor.TaxaTreeFromTaxId(taxid);
auto taxatree_match = find(taxatree_vector.begin(), taxatree_vector.end(), taxatree);
if (taxatree_match == taxatree_vector.end()) {
taxatree_vector.push_back(taxatree); taxatree_vector.push_back(taxatree);
count = extractor.GetTaxidCount(edge, taxid); count = extractor.GetTaxidCount(edge, taxid);
count_vector.push_back(count); count_vector.push_back(count);
} }
DEBUG("EdgeId: " << edge << " Frame_size: " << p.second.GetFrameSize() << " # of frames: " << p.second.GetNumberOfFrames()); else {
size_t match_index = std::distance(taxatree_vector.begin(), taxatree_match);
count_vector[match_index] += extractor.GetTaxidCount(edge, taxid);
}
}
}
void assign_taxonomy_to_edges(barcode_index::FrameBarcodeIndex<debruijn_graph::DeBruijnGraph>& barcodeindex,
const FrameBarcodeIndexInfoExtractor& extractor){
std::unordered_set<EdgeId> completed_edges;
for ( auto &p : barcodeindex.edge_to_entry_ ) {
EdgeId edge = p.first; // edgeID p.first; edgeEntry p.second
EdgeId conjugate_edge = extractor.GetConjugateEdge(edge);
if (completed_edges.find(edge) != completed_edges.end()) {
VERIFY_MSG(completed_edges.find(conjugate_edge) != completed_edges.end(),
"Conjugate edge should be in completed_edges along with original edge.");
continue;
}
std::vector<string> taxatree_vector;
std::vector<size_t> count_vector;
fill_taxatree_and_count_vectors_from_edge(extractor, taxatree_vector, count_vector, edge);
fill_taxatree_and_count_vectors_from_edge(extractor, taxatree_vector, count_vector, conjugate_edge);
DEBUG_SAM("EdgeId: " << edge << " Approximate_size: " << (p.second.GetFrameSize() * p.second.GetNumberOfFrames()) );
for (size_t i = 0; i != taxatree_vector.size(); i++ ){ for (size_t i = 0; i != taxatree_vector.size(); i++ ){
DEBUG("TaxaTree: " << taxatree_vector[i] << ", Count: " << count_vector[i]); DEBUG_SAM("TaxaTree: " << taxatree_vector[i] << ", Count: " << count_vector[i]);
} }
// Beware that lca function can mess with count_vector length so just use once. // Beware that lca function can mess with count_vector length so just use once.
TaxId lca = last_common_ancestor(taxatree_vector, count_vector, extractor); TaxId lca = last_common_ancestor(taxatree_vector, count_vector, extractor);
p.second.SetTaxonomy(lca); p.second.SetTaxonomy(lca);
DEBUG("Taxonomy: " << barcodeindex.edge_to_entry_.at(edge).GetTaxonomy()) barcodeindex.edge_to_entry_.at(conjugate_edge).SetTaxonomy(lca);
completed_edges.insert(edge);
completed_edges.insert(conjugate_edge);
DEBUG_SAM("Taxonomy: " << barcodeindex.edge_to_entry_.at(edge).GetTaxonomy());
} }
} }
...@@ -156,8 +204,11 @@ namespace debruijn_graph { ...@@ -156,8 +204,11 @@ namespace debruijn_graph {
mapper_builder.FillMap(reads, graph_pack.index, graph_pack.kmer_mapper); mapper_builder.FillMap(reads, graph_pack.index, graph_pack.kmer_mapper);
INFO("Barcode index construction finished."); INFO("Barcode index construction finished.");
FrameBarcodeIndexInfoExtractor extractor(graph_pack.barcode_mapper, graph_pack.g); FrameBarcodeIndexInfoExtractor extractor(graph_pack.barcode_mapper, graph_pack.g);
INFO("Starting taxonomic assignment to all edges.")
assign_taxonomy_to_edges(graph_pack.barcode_mapper, extractor); assign_taxonomy_to_edges(graph_pack.barcode_mapper, extractor);
INFO("Taxonomy assigned to all edges"); INFO("Taxonomy assigned to all edges.");
write_contigs_to_fasta(graph_pack.barcode_mapper, extractor, "taxonomy_edges.fasta");
INFO("Wrote taxonomy_edges.fasta in main dir.")
size_t length_threshold = cfg::get().pe_params.read_cloud.long_edge_length_lower_bound; size_t length_threshold = cfg::get().pe_params.read_cloud.long_edge_length_lower_bound;
INFO("Average barcode coverage: " + std::to_string(extractor.AverageBarcodeCoverage(length_threshold))); INFO("Average barcode coverage: " + std::to_string(extractor.AverageBarcodeCoverage(length_threshold)));
ClusterStatisticsExtractorHelper cluster_extractor_helper(graph_pack.g, graph_pack.barcode_mapper, ClusterStatisticsExtractorHelper cluster_extractor_helper(graph_pack.g, graph_pack.barcode_mapper,
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment