Commit d471de51 authored by Overduin, Sam's avatar Overduin, Sam Committed by Overduin, Sam
Browse files

barcode_index_builder.hpp:

- Added TaxaTree (like 1.23.456.7890) parser from reads & store in index

barcode_index.hpp:
- Added data structure in BarcodeIndex to store TaxaTree mapping
  to taxids (taxatree_codes_).
- Added data structure in EdgeEntry to store taxid_distribution_.
- Added attribute to EdgeEntry for taxonomy_.

barcode_info_extractor.hpp:
- Added method to get taxids for a EdgeID
parent fd6ec33d
......@@ -75,6 +75,50 @@ namespace barcode_index {
}
};
class TaxIdEncoder {
std::unordered_map <string, uint64_t> codes_;
std::unordered_map <uint64_t, string> codes_rev_;
public:
TaxIdEncoder():
codes_(),
codes_rev_()
{ }
void AddTaxaTree(string &taxatree) {
auto it = codes_.find(taxatree);
if (it == codes_.end()) {
uint64_t taxid = ParseTaxId(taxatree);
codes_[taxatree] = taxid;
codes_rev_[taxid] = taxatree;
}
}
uint64_t ParseTaxId (const string& taxatree) const {
std::size_t found = taxatree.find_last_of('.');
string taxid_str;
if (found != std::string::npos) {
taxid_str = taxatree.substr(found + 1);
} else {
taxid_str = taxatree;
}
return std::stoi(taxid_str);
}
uint64_t GetCode (const string& taxatree) const {
VERIFY(codes_.find(taxatree) != codes_.end());
return codes_.at(taxatree);
}
string GetTaxaTree (const uint64_t& taxid) const {
VERIFY(codes_rev_.find(taxid) != codes_rev_.end());
return codes_rev_.at(taxid);
}
size_t GetSize() const {
return codes_.size();
}
};
class KmerMultiset {
boost::unordered_map<Kmer, size_t, Kmer::hash> storage_;
......@@ -106,6 +150,7 @@ namespace barcode_index {
};
typedef uint64_t BarcodeId;
typedef uint64_t TaxId;
/**
This class provides partial interface to BarcodeIndex.
......@@ -157,12 +202,13 @@ namespace barcode_index {
typedef typename Graph::EdgeId EdgeId;
typedef typename Graph::VertexId VertexId;
typedef typename omnigraph::IterationHelper <Graph, EdgeId> edge_it_helper;
typedef std::unordered_map <EdgeId, EdgeEntryT> barcode_map_t;
typedef std::unordered_map <EdgeId, EdgeEntryT> barcode_map_t; // EdgeEntryT now contains barcode & taxids
BarcodeIndex (const Graph &g) :
AbstractBarcodeIndex<Graph>(g),
edge_to_entry_(),
number_of_barcodes_(0)
number_of_barcodes_(0),
taxatree_codes_()
{}
BarcodeIndex (const BarcodeIndex& other) = default;
......@@ -264,6 +310,7 @@ namespace barcode_index {
protected:
using AbstractBarcodeIndex<Graph>::g_;
barcode_map_t edge_to_entry_;
TaxIdEncoder taxatree_codes_;
size_t number_of_barcodes_;
DECL_LOGGER("BarcodeIndex");
......@@ -516,17 +563,22 @@ namespace barcode_index {
public:
typedef typename Graph::EdgeId EdgeId;
typedef std::map <BarcodeId, EntryInfoT> barcode_distribution_t;
typedef std::map <TaxId, EntryInfoT> taxid_distribution_t;
typedef EntryInfoT barcode_info_t;
protected:
EdgeId edge_;
barcode_distribution_t barcode_distribution_;
taxid_distribution_t taxid_distribution_;
TaxId taxonomy_;
public:
EdgeEntry():
edge_(), barcode_distribution_() {};
edge_(), barcode_distribution_(), taxid_distribution_(), taxonomy_(0) {};
EdgeEntry(const EdgeId& edge) :
edge_(edge), barcode_distribution_() {}
edge_(edge), barcode_distribution_(), taxid_distribution_(), taxonomy_(0) {}
virtual ~EdgeEntry() {}
......@@ -534,6 +586,14 @@ namespace barcode_index {
return barcode_distribution_;
}
void SetTaxonomy(TaxId taxid) {
taxonomy_ = taxid;
}
TaxId GetTaxonomy() {
return taxonomy_;
}
EdgeId GetEdge() const {
return edge_;
}
......@@ -566,6 +626,23 @@ namespace barcode_index {
return barcode_distribution_.cend();
}
// necessary in barcode_info_extractor.hpp::GetTaxids
typename taxid_distribution_t::const_iterator taxid_begin() const {
return barcode_distribution_.begin();
}
typename taxid_distribution_t::const_iterator taxid_end() const {
return barcode_distribution_.end();
}
typename taxid_distribution_t::const_iterator taxid_cbegin() const {
return barcode_distribution_.cbegin();
}
typename taxid_distribution_t::const_iterator taxid_cend() const {
return barcode_distribution_.cend();
}
bool has_barcode(const BarcodeId& barcode) const {
return barcode_distribution_.find(barcode) != barcode_distribution_.end();
}
......@@ -604,6 +681,7 @@ namespace barcode_index {
virtual void InsertInfo(const BarcodeId& code, const barcode_info_t& info) = 0;
virtual void InsertBarcode(const BarcodeId& code, const size_t count, const Range& range) = 0;
virtual void InsertTaxId(const TaxId& code, const size_t count, const Range& range) = 0;
};
template<class Graph>
......@@ -615,6 +693,8 @@ namespace barcode_index {
typedef typename Graph::EdgeId EdgeId;
using EdgeEntry<Graph, SimpleBarcodeInfo>::barcode_distribution_t;
using EdgeEntry<Graph, SimpleBarcodeInfo>::barcode_distribution_;
using EdgeEntry<Graph, SimpleBarcodeInfo>::taxid_distribution_t;
using EdgeEntry<Graph, SimpleBarcodeInfo>::taxid_distribution_;
using EdgeEntry<Graph, SimpleBarcodeInfo>::edge_;
public:
......@@ -657,6 +737,15 @@ namespace barcode_index {
}
}
void InsertTaxId(const TaxId& taxid, const size_t count, const Range& range) {
if (taxid_distribution_.find(taxid) == taxid_distribution_.end()) {
SimpleBarcodeInfo info(count, range);
taxid_distribution_.insert({taxid, info});
}
else {
taxid_distribution_.at(taxid).Update(count, range);
}
}
bool IsFarFromEdgeHead(size_t gap_threshold, const SimpleBarcodeInfo& info) {
return info.GetRange().start_pos > gap_threshold;
......@@ -676,6 +765,8 @@ namespace barcode_index {
typedef typename Graph::EdgeId EdgeId;
using EdgeEntry<Graph, FrameBarcodeInfo>::barcode_distribution_t;
using EdgeEntry<Graph, FrameBarcodeInfo>::barcode_distribution_;
using EdgeEntry<Graph, FrameBarcodeInfo>::taxid_distribution_t;
using EdgeEntry<Graph, FrameBarcodeInfo>::taxid_distribution_;
using EdgeEntry<Graph, FrameBarcodeInfo>::edge_;
size_t edge_length_;
size_t frame_size_;
......@@ -770,6 +861,21 @@ namespace barcode_index {
barcode_distribution_.at(barcode).Update(count, left_frame, right_frame);
}
void InsertTaxId(const TaxId& taxid, const size_t count, const Range& range) override {
DEBUG("Inserting taxid");
if (taxid_distribution_.find(taxid) == taxid_distribution_.end()) {
FrameBarcodeInfo info(number_of_frames_);
taxid_distribution_.insert({taxid, info});
}
size_t left_frame = GetFrameFromPos(range.start_pos);
size_t right_frame = GetFrameFromPos(range.end_pos);
DEBUG("Range: " << range);
DEBUG("Frames: " << left_frame << " " << right_frame);
DEBUG("Count: " << count);
VERIFY_DEV(taxid_distribution_.find(taxid) != taxid_distribution_.end());
taxid_distribution_.at(taxid).Update(count, left_frame, right_frame);
}
bool IsFarFromEdgeHead(size_t gap_threshold, const FrameBarcodeInfo& info) {
return info.GetLeftMost() > gap_threshold / frame_size_;
......
......@@ -23,6 +23,7 @@ namespace barcode_index {
typedef typename debruijn_graph::EdgeIndexHelper<InnerIndex>::CoverageAndGraphPositionFillingIndexBuilderT IndexBuilder;
typedef typename barcode_index::BarcodeIndex<Graph, BarcodeEntryT> BarcodeIndexT;
typedef typename Graph::EdgeId EdgeId;
typedef uint64_t TaxIdT;
BarcodeIndexBuilder(BarcodeIndexT &index, size_t tail_threshold) :
g_(index.GetGraph()),
......@@ -112,10 +113,12 @@ namespace barcode_index {
io::SingleRead read;
size_t counter = 0;
const std::vector<string> barcode_prefixes = {"BC:Z:", "BX:Z:"};
const std::vector<string> taxatree_prefixes = {"TaxaTree"};
for (auto stream: reads) {
while (!stream->eof()) {
*stream >> read;
string barcode_string = GetTenXBarcodeFromRead(read, barcode_prefixes);
string taxatree_string = GetTaxaTreeFromRead(read, taxatree_prefixes);
if (barcode_string != "") {
barcode_codes_.AddBarcode(barcode_string);
......@@ -124,6 +127,15 @@ namespace barcode_index {
const auto &path = mapper->MapRead(read);
InsertMappingPath(barcode, path);
}
if (taxatree_string != "") {
index_.taxatree_codes_.AddTaxaTree(taxatree_string);
uint64_t taxid_int = index_.taxatree_codes_.GetCode(taxatree_string);
TaxIdT taxid(taxid_int);
//INFO("taxatree: "+taxatree_string)
const auto &path = mapper->MapRead(read);
InsertTaxIdMappingPath(taxid, path);
}
counter++;
VERBOSE_POWER_T2(counter, 100, "Processed " << counter << " reads.");
}
......@@ -171,6 +183,51 @@ namespace barcode_index {
return result;
}
// GetTaxaTree three functions below
string GetTaxaTreeFromRead(const io::SingleRead &read, const std::vector<string>& taxatree_prefixes) {
for (const auto& prefix: taxatree_prefixes) {
size_t prefix_len = prefix.size();
size_t start_pos = read.name().find(prefix);
if (start_pos != string::npos) {
string taxatree = GetTaxaTreeFromStartPos(start_pos + prefix_len, read.name());
// if taxatree not present then set as 0
if (taxatree == "") {
taxatree == "0";
}
TRACE(taxatree);
return taxatree;
}
}
return "";
}
string GetTaxaTreeFromStartPos(const size_t start_pos, const string& read_id) {
string result = "";
for (auto it = read_id.begin() + start_pos; it != read_id.end(); ++it) {
if (std::isspace(*it)) {
return result;
}
result.push_back(*it);
}
return result;
}
// not used here anymore but possibly later so keep here.
std::vector<uint64_t> ToTaxaTreeVector(std::string& taxa_tree_string) {
std::vector<uint64_t> taxa_tree_vect;
std::string taxa;
uint64_t taxid;
std::stringstream tree_stream(taxa_tree_string); // Insert the string into a stream
while(getline(tree_stream, taxa, '.')) {
// string to uint64_t
std::stringstream taxa_stream(taxa);
taxa_stream >> taxid;
taxa_tree_vect.push_back(taxid);
}
return taxa_tree_vect;
}
KmerMultiset BuildKmerMultisetFromStream(ReadStreamPtr read_stream) {
KmerMultiset kmer_multiset;
size_t read_counter = 0;
......@@ -203,6 +260,11 @@ namespace barcode_index {
index_.edge_to_entry_.at(edge).InsertBarcode(barcode, count, range);
}
void InsertTaxId(const TaxIdT &taxid, const EdgeId &edge, size_t count, const Range &range) {
VERIFY_DEV(index_.edge_to_entry_.find(edge) != index_.edge_to_entry_.end());
index_.edge_to_entry_.at(edge).InsertTaxId(taxid, count, range);
}
bool IsAtEdgeTail(const EdgeId &edge, const Range &range) {
return range.start_pos + tail_threshold_ > g_.length(edge);
}
......@@ -225,6 +287,17 @@ namespace barcode_index {
InsertBarcode(barcode, g_.conjugate(edge), count, range.Invert(g_.length(edge)));
}
void InsertTaxIdMappingPath(const TaxIdT &taxid, const MappingPath <EdgeId> &path) {
for (size_t j = 0; j < path.size(); j++) {
InsertTaxIdWithRange(taxid, path[j].first, path[j].second.mapped_range, 1);
}
}
void InsertTaxIdWithRange(const TaxIdT &taxid, const EdgeId &edge,
const Range &range, size_t count) {
InsertTaxId(taxid, edge, count, range); // no need to check head or tail. Just map taxid to edge.
}
std::vector <tslr_barcode_library> GetLibrary(const string &tslr_dataset_name) {
std::vector <tslr_barcode_library> lib_vec;
......@@ -275,6 +348,7 @@ namespace barcode_index {
BarcodeIndexT &index_;
size_t tail_threshold_;
BarcodeEncoder barcode_codes_;
//TaxIdEncoder taxatree_codes_;
};
template<class Graph>
......
......@@ -17,10 +17,14 @@ namespace barcode_index {
class BarcodeIndexInfoExtractor {
public:
typedef typename BarcodeEntryT::barcode_distribution_t distribution_t;
typedef typename BarcodeEntryT::taxid_distribution_t taxid_distribution_t;
typedef typename BarcodeEntryT::barcode_info_t barcode_info_t;
typedef typename distribution_t::key_type barcode_info_key_t;
typedef typename distribution_t::mapped_type barcode_info_value_t;
typedef typename distribution_t::value_type barcode_info_pair_t;
typedef typename taxid_distribution_t::key_type taxid_info_key_t;
typedef typename taxid_distribution_t::mapped_type taxid_info_value_t;
typedef typename taxid_distribution_t::value_type taxid_info_pair_t;
typedef typename barcode_index::BarcodeIndex<Graph, BarcodeEntryT> BarcodeIndexT;
typedef typename Graph::EdgeId EdgeId;
typedef typename omnigraph::IterationHelper <Graph, EdgeId> edge_it_helper;
......@@ -121,7 +125,7 @@ namespace barcode_index {
std::vector<BarcodeId> GetBarcodes(const EdgeId& edge) const {
std::vector <BarcodeId> result;
auto copy_barcode_id = [&result](const barcode_info_pair_t& entry)
{result.push_back(entry.first); };
{result.push_back(entry.first); }; //lambda function, .first is key
std::for_each(barcode_iterator_begin(edge), barcode_iterator_end(edge), copy_barcode_id);
return result;
}
......@@ -136,6 +140,24 @@ namespace barcode_index {
return entry_it->second.end();
}
std::vector<TaxId> GetTaxids(const EdgeId& edge) const {
std::vector <TaxId> result;
auto copy_taxid = [&result](const taxid_info_pair_t& entry)
{result.push_back(entry.first); }; //lambda function, .first is key
std::for_each(taxid_iterator_begin(edge), taxid_iterator_end(edge), copy_taxid);
return result;
}
typename taxid_distribution_t::const_iterator taxid_iterator_begin(const EdgeId &edge) const {
auto entry_it = index_.GetEntryHeadsIterator(edge);
return entry_it->second.taxid_begin(); // second is EdgeEntry
}
typename taxid_distribution_t::const_iterator taxid_iterator_end(const EdgeId &edge) const {
auto entry_it = index_.GetEntryHeadsIterator(edge);
return entry_it->second.taxid_end(); // second is EdgeEntry
}
/**
* Proxy class representing a pair of references to BarcodeInfo
*/
......
......@@ -44,6 +44,7 @@ namespace debruijn_graph {
mapper_builder.FillMap(reads, graph_pack.index, graph_pack.kmer_mapper);
INFO("Barcode index construction finished.");
FrameBarcodeIndexInfoExtractor extractor(graph_pack.barcode_mapper, graph_pack.g);
size_t length_threshold = cfg::get().pe_params.read_cloud.long_edge_length_lower_bound;
INFO("Average barcode coverage: " + std::to_string(extractor.AverageBarcodeCoverage(length_threshold)));
ClusterStatisticsExtractorHelper cluster_extractor_helper(graph_pack.g, graph_pack.barcode_mapper,
......@@ -55,4 +56,11 @@ namespace debruijn_graph {
}
// graph_pack.read_cloud_distribution_pack = distribution_pack;
}
void test_barcode_insertions(){
auto copy_taxid = [&result](const taxid_info_pair_t& edge)
{result.push_back(entry.first); }; //lambda function, .first is key
std::for_each(taxid_iterator_begin(edge), taxid_iterator_end(edge), copy_taxid);
return result;
}
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment