Commit c8303759 authored by Overduin, Sam's avatar Overduin, Sam Committed by Overduin, Sam
Browse files

Impl. extra BarcodeIndex serialisation, use conjugate edges & edges to fasta output

barcode_index.hpp:
- Implemented taxa_codes + taxonomy_ serialisation
- Made taxa_codes::map's public

barcode_info_extractor.hpp:
- Added funcs: EdgeIsValid, GetConjugateEdge, GetSequence, GetEdgeLength

barcode_index_construction.cpp:
- Added write_contigs_to_fasta for testing purposes.
- Re-implemented taxonomy assignment to use edge & conjugate.

read_cloud_connection_conditions.cpp:
- In TaxaBreakPred.: if taxonomy of edge not found, try taxonomy of
  conjugate edge. Else taxa 0.

logger.hpp:
- Created special DEBUG_SAM for personal logging. Will change at end.
parent 5b1631c0
......@@ -78,9 +78,9 @@ namespace barcode_index {
};
class TaxIdEncoder {
public:
std::unordered_map <string, TaxId> codes_;
std::unordered_map <TaxId, string> codes_rev_;
public:
TaxIdEncoder():
codes_(),
codes_rev_()
......@@ -127,7 +127,7 @@ namespace barcode_index {
}
string GetTaxaTree (const TaxId& taxid) const {
VERIFY(codes_rev_.find(taxid) != codes_rev_.end());
VERIFY_MSG(codes_rev_.find(taxid) != codes_rev_.end(), "In GetTaxaTree: taxid not in codes_rev");
return codes_rev_.at(taxid);
}
......@@ -277,21 +277,43 @@ namespace barcode_index {
using io::binary::BinRead;
edge_to_entry_.clear();
size_t size;
BinRead(str, size);
for (size_t i = 0; i < size; ++i) {
size_t edge_to_entry_size;
BinRead(str, edge_to_entry_size);
size_t taxa_codes_size;
BinRead(str, taxa_codes_size);
size_t taxa_rev_codes_size;
BinRead(str, taxa_rev_codes_size);
for (size_t i = 0; i < edge_to_entry_size; ++i) {
EdgeId edge_id = BinRead<uint64_t>(str);
auto entry = BinRead<EdgeEntryT>(str);
edge_to_entry_.insert({std::move(edge_id), std::move(entry)});
}
for (size_t i = 0; i < taxa_codes_size; ++i) {
string taxatree = BinRead<string>(str);
TaxId taxid = BinRead<uint64_t>(str);
taxatree_codes_.codes_.insert({std::move(taxatree), std::move(taxid)});
}
for (size_t i = 0; i < taxa_rev_codes_size; ++i) {
TaxId taxid = BinRead<uint64_t>(str);
string taxatree = BinRead<string>(str);
taxatree_codes_.codes_rev_.insert({std::move(taxid), std::move(taxatree)});
}
}
virtual void BinWrite(std::ostream &str) const {
using io::binary::BinWrite;
BinWrite(str, edge_to_entry_.size());
BinWrite(str, taxatree_codes_.codes_.size());
BinWrite(str, taxatree_codes_.codes_rev_.size());
for (const auto &edge_and_entry: edge_to_entry_) {
BinWrite(str, edge_and_entry.first.int_id(), edge_and_entry.second);
}
for (const auto &taxatree_codes_entry: taxatree_codes_.codes_) {
BinWrite(str, taxatree_codes_entry.first, taxatree_codes_entry.second);
}
for (const auto &taxatree_codes_rev_entry: taxatree_codes_.codes_rev_) {
BinWrite(str, taxatree_codes_rev_entry.first, taxatree_codes_rev_entry.second);
}
}
typename barcode_map_t::const_iterator GetEntryTailsIterator(const EdgeId& edge) const {
......@@ -667,10 +689,12 @@ namespace barcode_index {
}
virtual void BinRead(std::istream &str) {
taxonomy_ = io::binary::BinRead<uint64_t>(str);
barcode_distribution_ = io::binary::BinRead<barcode_distribution_t>(str);
}
virtual void BinWrite(std::ostream &str) const {
io::binary::BinWrite(str, taxonomy_);
io::binary::BinWrite(str, barcode_distribution_);
}
......@@ -783,6 +807,7 @@ namespace barcode_index {
using EdgeEntry<Graph, FrameBarcodeInfo>::taxid_distribution_t;
using EdgeEntry<Graph, FrameBarcodeInfo>::taxid_distribution_;
using EdgeEntry<Graph, FrameBarcodeInfo>::edge_;
using EdgeEntry<Graph, FrameBarcodeInfo>::taxonomy_;
size_t edge_length_;
size_t frame_size_;
size_t number_of_frames_;
......@@ -826,6 +851,7 @@ namespace barcode_index {
edge_length_ = BinRead<size_t>(str);
frame_size_ = BinRead<size_t>(str);
number_of_frames_ = BinRead<size_t>(str);
taxonomy_ = BinRead<uint64_t>(str); //only need to save taxonomy, not taxonomy_distribution
barcode_distribution_.clear();
size_t size;
......@@ -843,6 +869,7 @@ namespace barcode_index {
BinWrite(str, edge_length_);
BinWrite(str, frame_size_);
BinWrite(str, number_of_frames_);
BinWrite(str, taxonomy_);
size_t size = barcode_distribution_.size();
BinWrite(str, size);
......
......@@ -148,8 +148,24 @@ namespace barcode_index {
return result;
}
bool EdgeIsValid(const EdgeId& edge) const {
return (index_.edge_to_entry_.find(edge) != index_.edge_to_entry_.end());
}
EdgeId GetConjugateEdge(const EdgeId& edge) const {
return g_.conjugate(edge);
}
string GetSequence(const EdgeId& edge) const {
return g_.EdgeNucls(edge).str();
}
size_t GetEdgeLength(const EdgeId& edge) const {
return g_.EdgeNucls(edge).str().length();
}
size_t GetTaxidCount(const EdgeId& edge_id_, const TaxId& taxid) const {
DEBUG("Get Taxid from EdgeId: " << edge_id_);
return index_.edge_to_entry_.at(edge_id_).taxid_distribution_.at(taxid).GetCount();
}
......@@ -162,9 +178,11 @@ namespace barcode_index {
}
string GetTaxaTreeFromEdge(const EdgeId& edge_id_) const {
//INFO("EdgeId: " << edge_id_);
DEBUG("GetTaxaTreeFromEdge EdgeId: " << edge_id_);
VERIFY_MSG(index_.edge_to_entry_.find(edge_id_) != index_.edge_to_entry_.end(),
"GetTaxaTreeFromEdge: not in edge_to_entry EgdeID = "<<edge_id_);
TaxId taxid = index_.edge_to_entry_.at(edge_id_).GetTaxonomy();
//INFO("GetTaxonomy: "<< taxid);
DEBUG("GetTaxonomy: "<< taxid);
string taxatree = TaxaTreeFromTaxId(taxid);
return taxatree;
}
......
......@@ -121,13 +121,39 @@ bool TaxaBreakPredicate::Check(const ScaffoldEdge &scaffold_edge) const {
static const std::string null_taxa = "0";
auto first = scaffold_edge.getStart();
auto second = scaffold_edge.getEnd();
DEBUG("In TaxaBreakPredicate, Edge_id: " << scaffold_edge.getId() << " length: " << scaffold_edge.getLength());
DEBUG_SAM("In TaxaBreakPredicate, Edge_id: " << scaffold_edge.getId() << " length: " << scaffold_edge.getLength());
auto seq1 = first.GetSequence(g_);
auto seq2 = second.GetSequence(g_);
DEBUG("Vertex_1_id: " << first.int_id() << " Taxonomy: " << barcode_extractor_->GetTaxaTreeFromEdge(first.int_id()) << " Length: " << seq1.size());
DEBUG("Vertex_2_id: " << second.int_id() << " Taxonomy: " << barcode_extractor_->GetTaxaTreeFromEdge(second.int_id()) << " Length: " << seq2.size());
std::string taxatree_1 = barcode_extractor_->GetTaxaTreeFromEdge(first.int_id());
std::string taxatree_2 = barcode_extractor_->GetTaxaTreeFromEdge(second.int_id());
DEBUG_SAM("Vertex_1_id: " << first.int_id() << " Taxonomy: " << barcode_extractor_->GetTaxaTreeFromEdge(first.int_id()) << " Length: " << seq1.size());
DEBUG_SAM("Vertex_2_id: " << second.int_id() << " Taxonomy: " << barcode_extractor_->GetTaxaTreeFromEdge(second.int_id()) << " Length: " << seq2.size());
std::string taxatree_1;
EdgeId edge_1 = first.int_id();
if (barcode_extractor_->EdgeIsValid(edge_1)) {
taxatree_1 = barcode_extractor_->GetTaxaTreeFromEdge(edge_1);
}
else if (barcode_extractor_->EdgeIsValid(g_.conjugate(edge_1))) {
DEBUG_SAM("No taxatree for vector: " << first.int_id() << ", conjugate was fine.");
taxatree_1 = barcode_extractor_->GetTaxaTreeFromEdge(g_.conjugate(edge_1));
}
else {
INFO("Using taxatree of 0. No taxatree entry found for vertex or conjugate! ERROR!!!");
taxatree_1 = "0";
}
std::string taxatree_2;
EdgeId edge_2 = second.int_id();
if (barcode_extractor_->EdgeIsValid(edge_2)) {
taxatree_2 = barcode_extractor_->GetTaxaTreeFromEdge(edge_2);
}
else if (barcode_extractor_->EdgeIsValid(g_.conjugate(edge_2))) {
DEBUG_SAM("No taxatree for vector: " << first.int_id() << ", conjugate was fine.");
taxatree_2 = barcode_extractor_->GetTaxaTreeFromEdge(g_.conjugate(edge_2));
}
else {
INFO("Using taxatree of 0. No taxatree entry found for vertex or conjugate! Should not happen! ERROR!!!");
taxatree_2 = "0";
}
if (taxatree_1 == null_taxa || taxatree_2 == null_taxa ||
taxatree_1.find(taxatree_2) != std::string::npos || taxatree_2.find(taxatree_1) != std::string::npos) {
......@@ -138,14 +164,6 @@ bool TaxaBreakPredicate::Check(const ScaffoldEdge &scaffold_edge) const {
DEBUG("Taxatree mismatch!");
}
return false;
//TODO: implement taxonomy in barcode_extractor and vertexes!!
//barcode_extractor_->
//get taxonomy for first and second
//first.
//bool result = true;
//return result;
}
EdgeSplitPredicate::EdgeSplitPredicate(
......
......@@ -136,6 +136,7 @@ inline const char* __scope_source_name() {
# define TRACE(message) /* No trace */
#endif
#define INFO(message) LOG_MSG(logging::L_INFO , message)
#define DEBUG_SAM(message) LOG_MSG(logging::L_INFO, message)
#define START_BANNER(description) \
do { \
INFO("Starting " description ", built from " \
......
......@@ -100,33 +100,81 @@ namespace debruijn_graph {
return lca;
}
void assign_taxonomy_to_edges(barcode_index::FrameBarcodeIndex<debruijn_graph::DeBruijnGraph>& barcodeindex,
const FrameBarcodeIndexInfoExtractor& extractor){
void write_contigs_to_fasta(const barcode_index::FrameBarcodeIndex<debruijn_graph::DeBruijnGraph>& barcodeindex,
const FrameBarcodeIndexInfoExtractor& extractor, const string& file_name) {
std::ofstream fasta_taxa_file;
string fasta_location = fs::append_path(cfg::get().output_dir, file_name);
fasta_taxa_file.open (fasta_location);
std::unordered_set<EdgeId> completed_edges;
for ( auto &p : barcodeindex.edge_to_entry_ ) {
std::vector<string> taxatree_vector;
std::vector<size_t> count_vector;
EdgeId edge;
TaxId taxid;
std::string taxatree;
size_t count;
edge = p.first; // edgeID p.first; edgeEntry p.second
typename FrameBarcodeIndexInfoExtractor::taxid_distribution_t::const_iterator taxid_it;
for (taxid_it = extractor.taxid_iterator_begin(edge); taxid_it != extractor.taxid_iterator_end(edge);
taxid_it++ ) {
taxid = taxid_it->first;
taxatree = barcodeindex.taxatree_codes_.GetTaxaTree(taxid);
EdgeId edge = p.first; // edgeID p.first; edgeEntry p.second
EdgeId conjugate_edge = extractor.GetConjugateEdge(edge);
if (completed_edges.find(edge) != completed_edges.end()) {
VERIFY_MSG(completed_edges.find(conjugate_edge) != completed_edges.end(),
"Conjugate edge should be in completed_edges along with original edge.");
continue;
}
string seq = extractor.GetSequence(edge);
string taxatree = extractor.GetTaxaTreeFromEdge(edge);
TaxId taxid = barcodeindex.taxatree_codes_.ParseTaxId(taxatree);
fasta_taxa_file << ">EdgeID=" << edge << " Length=" << seq.length() << " Taxa=" << taxid << " Taxatree=" << taxatree << std::endl;
fasta_taxa_file << seq << std::endl;
}
}
void fill_taxatree_and_count_vectors_from_edge(const FrameBarcodeIndexInfoExtractor& extractor,
std::vector<string> taxatree_vector,
std::vector<size_t> count_vector, EdgeId edge) {
TaxId taxid;
std::string taxatree;
size_t count;
typename FrameBarcodeIndexInfoExtractor::taxid_distribution_t::const_iterator taxid_it;
for (taxid_it = extractor.taxid_iterator_begin(edge); taxid_it != extractor.taxid_iterator_end(edge);
taxid_it++ ) {
taxid = taxid_it->first;
taxatree = extractor.TaxaTreeFromTaxId(taxid);
auto taxatree_match = find(taxatree_vector.begin(), taxatree_vector.end(), taxatree);
if (taxatree_match == taxatree_vector.end()) {
taxatree_vector.push_back(taxatree);
count = extractor.GetTaxidCount(edge, taxid);
count_vector.push_back(count);
}
DEBUG("EdgeId: " << edge << " Frame_size: " << p.second.GetFrameSize() << " # of frames: " << p.second.GetNumberOfFrames());
else {
size_t match_index = std::distance(taxatree_vector.begin(), taxatree_match);
count_vector[match_index] += extractor.GetTaxidCount(edge, taxid);
}
}
}
void assign_taxonomy_to_edges(barcode_index::FrameBarcodeIndex<debruijn_graph::DeBruijnGraph>& barcodeindex,
const FrameBarcodeIndexInfoExtractor& extractor){
std::unordered_set<EdgeId> completed_edges;
for ( auto &p : barcodeindex.edge_to_entry_ ) {
EdgeId edge = p.first; // edgeID p.first; edgeEntry p.second
EdgeId conjugate_edge = extractor.GetConjugateEdge(edge);
if (completed_edges.find(edge) != completed_edges.end()) {
VERIFY_MSG(completed_edges.find(conjugate_edge) != completed_edges.end(),
"Conjugate edge should be in completed_edges along with original edge.");
continue;
}
std::vector<string> taxatree_vector;
std::vector<size_t> count_vector;
fill_taxatree_and_count_vectors_from_edge(extractor, taxatree_vector, count_vector, edge);
fill_taxatree_and_count_vectors_from_edge(extractor, taxatree_vector, count_vector, conjugate_edge);
DEBUG_SAM("EdgeId: " << edge << " Approximate_size: " << (p.second.GetFrameSize() * p.second.GetNumberOfFrames()) );
for (size_t i = 0; i != taxatree_vector.size(); i++ ){
DEBUG("TaxaTree: " << taxatree_vector[i] << ", Count: " << count_vector[i]);
DEBUG_SAM("TaxaTree: " << taxatree_vector[i] << ", Count: " << count_vector[i]);
}
// Beware that lca function can mess with count_vector length so just use once.
TaxId lca = last_common_ancestor(taxatree_vector, count_vector, extractor);
p.second.SetTaxonomy(lca);
DEBUG("Taxonomy: " << barcodeindex.edge_to_entry_.at(edge).GetTaxonomy())
barcodeindex.edge_to_entry_.at(conjugate_edge).SetTaxonomy(lca);
completed_edges.insert(edge);
completed_edges.insert(conjugate_edge);
DEBUG_SAM("Taxonomy: " << barcodeindex.edge_to_entry_.at(edge).GetTaxonomy());
}
}
......@@ -156,8 +204,11 @@ namespace debruijn_graph {
mapper_builder.FillMap(reads, graph_pack.index, graph_pack.kmer_mapper);
INFO("Barcode index construction finished.");
FrameBarcodeIndexInfoExtractor extractor(graph_pack.barcode_mapper, graph_pack.g);
INFO("Starting taxonomic assignment to all edges.")
assign_taxonomy_to_edges(graph_pack.barcode_mapper, extractor);
INFO("Taxonomy assigned to all edges");
INFO("Taxonomy assigned to all edges.");
write_contigs_to_fasta(graph_pack.barcode_mapper, extractor, "taxonomy_edges.fasta");
INFO("Wrote taxonomy_edges.fasta in main dir.")
size_t length_threshold = cfg::get().pe_params.read_cloud.long_edge_length_lower_bound;
INFO("Average barcode coverage: " + std::to_string(extractor.AverageBarcodeCoverage(length_threshold)));
ClusterStatisticsExtractorHelper cluster_extractor_helper(graph_pack.g, graph_pack.barcode_mapper,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment