Commit ba6cfe86 authored by Overduin, Sam's avatar Overduin, Sam Committed by Overduin, Sam
Browse files

barcode_index_builder.hpp:

- Added no TaxaTree in read means taxa '0'
- Bugfix taxatree_prefixes
- Cut out taxatree parser to vector<int>

barcode_index.hpp:
- Changed TaxIdEncoder to fully use TaxId vs int64
- small bugfixes eg: moved declarations around, corrected taxid_begin()
- made edge_to_entry in FrameBarcodeIndex public

barcode_info_extractor.hpp:
- Added GetTaxidCount

barcode_index_construction.cpp:
- Added chunk of code to determine and set LCA per EdgeEntry in barcode_index
parent d471de51
......@@ -4,9 +4,7 @@ from kraken to read header -> done!!
To test:
/mnt/LTR_userdata/overd008/tools/spades_install/bin/spades-core /home/overd008/mnt/lustre/output/spades/test/K55/configs/config.info /home/overd008/mnt/lustre/output/spades/test/K55/configs/mda_mode.info /home/overd008/mnt/lustre/output/spades/test/K55/configs/meta_mode.info
---------here-----------
## Parse taxonomy from reads
## Parse taxonomy from reads --> done!
Barcode mapping parser
common\barcode_index\barcode_index_builder.hpp @105 FillMapFrom10XReads
@118 add string tax_string = GetTaxaTreeFromRead(read)
......@@ -14,9 +12,11 @@ common\barcode_index\barcode_index_builder.hpp @105 FillMapFrom10XReads
@150 copy similar parser
## Settle taxonomy of edges
## Settle taxonomy of edges --> done!
New function (implement where? Just before transition taxa filter? Seperately?)
---------here-----------
## Add transition taxonomy filter here:
M:\aa_Thesis_code\SPAdes-3.13.0-dev\src\common\modules\path_extend\read_cloud_path_extend\scaffold_graph_construction\
scaffold_graph_construction_pipeline.cpp@209
......@@ -49,6 +49,8 @@ namespace barcode_index {
string barcode_;
};
typedef uint64_t BarcodeId;
typedef uint64_t TaxId;
class BarcodeEncoder {
std::unordered_map <string, uint64_t> codes_;
......@@ -76,8 +78,8 @@ namespace barcode_index {
};
class TaxIdEncoder {
std::unordered_map <string, uint64_t> codes_;
std::unordered_map <uint64_t, string> codes_rev_;
std::unordered_map <string, TaxId> codes_;
std::unordered_map <TaxId, string> codes_rev_;
public:
TaxIdEncoder():
codes_(),
......@@ -87,13 +89,13 @@ namespace barcode_index {
void AddTaxaTree(string &taxatree) {
auto it = codes_.find(taxatree);
if (it == codes_.end()) {
uint64_t taxid = ParseTaxId(taxatree);
TaxId taxid = ParseTaxId(taxatree);
codes_[taxatree] = taxid;
codes_rev_[taxid] = taxatree;
}
}
uint64_t ParseTaxId (const string& taxatree) const {
TaxId ParseTaxId (const string& taxatree) const {
std::size_t found = taxatree.find_last_of('.');
string taxid_str;
if (found != std::string::npos) {
......@@ -104,12 +106,12 @@ namespace barcode_index {
return std::stoi(taxid_str);
}
uint64_t GetCode (const string& taxatree) const {
TaxId GetCode (const string& taxatree) const {
VERIFY(codes_.find(taxatree) != codes_.end());
return codes_.at(taxatree);
}
string GetTaxaTree (const uint64_t& taxid) const {
string GetTaxaTree (const TaxId& taxid) const {
VERIFY(codes_rev_.find(taxid) != codes_rev_.end());
return codes_rev_.at(taxid);
}
......@@ -149,8 +151,6 @@ namespace barcode_index {
}
};
typedef uint64_t BarcodeId;
typedef uint64_t TaxId;
/**
This class provides partial interface to BarcodeIndex.
......@@ -310,8 +310,8 @@ namespace barcode_index {
protected:
using AbstractBarcodeIndex<Graph>::g_;
barcode_map_t edge_to_entry_;
TaxIdEncoder taxatree_codes_;
size_t number_of_barcodes_;
TaxIdEncoder taxatree_codes_;
DECL_LOGGER("BarcodeIndex");
};
......@@ -586,7 +586,7 @@ namespace barcode_index {
return barcode_distribution_;
}
void SetTaxonomy(TaxId taxid) {
void SetTaxonomy(const TaxId taxid) {
taxonomy_ = taxid;
}
......@@ -628,19 +628,19 @@ namespace barcode_index {
// necessary in barcode_info_extractor.hpp::GetTaxids
typename taxid_distribution_t::const_iterator taxid_begin() const {
return barcode_distribution_.begin();
return taxid_distribution_.begin();
}
typename taxid_distribution_t::const_iterator taxid_end() const {
return barcode_distribution_.end();
return taxid_distribution_.end();
}
typename taxid_distribution_t::const_iterator taxid_cbegin() const {
return barcode_distribution_.cbegin();
return taxid_distribution_.cbegin();
}
typename taxid_distribution_t::const_iterator taxid_cend() const {
return barcode_distribution_.cend();
return taxid_distribution_.cend();
}
bool has_barcode(const BarcodeId& barcode) const {
......@@ -904,6 +904,8 @@ namespace barcode_index {
friend class BarcodeIndexInfoExtractor<Graph, FrameEdgeEntry<Graph>>;
public:
using BarcodeIndex<Graph, FrameEdgeEntry<Graph>>::barcode_map_t;
using BarcodeIndex<Graph, FrameEdgeEntry<Graph>>::edge_to_entry_;
using BarcodeIndex<Graph, FrameEdgeEntry<Graph>>::taxatree_codes_;
typedef typename Graph::EdgeId EdgeId;
typedef typename omnigraph::IterationHelper <Graph, EdgeId> edge_it_helper;
......@@ -932,7 +934,6 @@ namespace barcode_index {
private:
using BarcodeIndex<Graph, FrameEdgeEntry<Graph>>::g_;
using BarcodeIndex<Graph, FrameEdgeEntry<Graph>>::edge_to_entry_;
size_t frame_size_;
};
} //barcode_index
......@@ -113,7 +113,7 @@ namespace barcode_index {
io::SingleRead read;
size_t counter = 0;
const std::vector<string> barcode_prefixes = {"BC:Z:", "BX:Z:"};
const std::vector<string> taxatree_prefixes = {"TaxaTree"};
const std::vector<string> taxatree_prefixes = {"TaxaTree:"};
for (auto stream: reads) {
while (!stream->eof()) {
*stream >> read;
......@@ -128,14 +128,18 @@ namespace barcode_index {
InsertMappingPath(barcode, path);
}
if (taxatree_string != "") {
index_.taxatree_codes_.AddTaxaTree(taxatree_string);
uint64_t taxid_int = index_.taxatree_codes_.GetCode(taxatree_string);
TaxIdT taxid(taxid_int);
//INFO("taxatree: "+taxatree_string)
const auto &path = mapper->MapRead(read);
InsertTaxIdMappingPath(taxid, path);
if (taxatree_string == "") {
// no TaxaTree added means no known taxa (0), useful during LCA determination.
taxatree_string = '0';
}
index_.taxatree_codes_.AddTaxaTree(taxatree_string);
uint64_t taxid_int = index_.taxatree_codes_.GetCode(taxatree_string);
TaxIdT taxid(taxid_int);
//INFO("taxatree: "+taxatree_string)
const auto &path = mapper->MapRead(read);
InsertTaxIdMappingPath(taxid, path);
counter++;
VERBOSE_POWER_T2(counter, 100, "Processed " << counter << " reads.");
}
......@@ -212,22 +216,6 @@ namespace barcode_index {
return result;
}
// not used here anymore but possibly later so keep here.
std::vector<uint64_t> ToTaxaTreeVector(std::string& taxa_tree_string) {
std::vector<uint64_t> taxa_tree_vect;
std::string taxa;
uint64_t taxid;
std::stringstream tree_stream(taxa_tree_string); // Insert the string into a stream
while(getline(tree_stream, taxa, '.')) {
// string to uint64_t
std::stringstream taxa_stream(taxa);
taxa_stream >> taxid;
taxa_tree_vect.push_back(taxid);
}
return taxa_tree_vect;
}
KmerMultiset BuildKmerMultisetFromStream(ReadStreamPtr read_stream) {
KmerMultiset kmer_multiset;
size_t read_counter = 0;
......
......@@ -141,13 +141,18 @@ namespace barcode_index {
}
std::vector<TaxId> GetTaxids(const EdgeId& edge) const {
std::vector <TaxId> result;
std::vector<TaxId> result;
auto copy_taxid = [&result](const taxid_info_pair_t& entry)
{result.push_back(entry.first); }; //lambda function, .first is key
{result.push_back(entry.first); }; //lambda function, .first is key of taxid_distribution
std::for_each(taxid_iterator_begin(edge), taxid_iterator_end(edge), copy_taxid);
return result;
}
size_t GetTaxidCount(const EdgeId& edge_id_, const TaxId& taxid) const {
return index_.edge_to_entry_.at(edge_id_).taxid_distribution_.at(taxid).GetCount();
}
typename taxid_distribution_t::const_iterator taxid_iterator_begin(const EdgeId &edge) const {
auto entry_it = index_.GetEntryHeadsIterator(edge);
return entry_it->second.taxid_begin(); // second is EdgeEntry
......
......@@ -18,6 +18,99 @@ namespace debruijn_graph {
return has_read_clouds;
}
std::vector<TaxId> ToTaxaTreeVector(const std::string& taxa_tree_string, const char sep='.') {
std::vector<TaxId> taxa_tree_vect;
std::string taxa;
TaxId taxid;
std::stringstream tree_stream(taxa_tree_string); // Insert the string into a stream
while(getline(tree_stream, taxa, sep)) {
// string to uint64_t
std::stringstream taxa_stream(taxa);
taxa_stream >> taxid;
taxa_tree_vect.push_back(taxid);
}
return taxa_tree_vect;
}
TaxId majority_vote_lca(std::vector<string> &taxatree_str_vec, std::vector<size_t> &count_vec) {
std::vector<std::vector<TaxId>> taxatree_vec;
size_t longest_lineage = 0;
size_t total_counts = 0;
size_t null_counts = 0;
// transfer taxatree_strings to taxid_vectors.
for ( const std::string& taxatree_str : taxatree_str_vec ) {
if (taxatree_str != "0") {
std::vector<TaxId> taxid_vec = ToTaxaTreeVector(taxatree_str, '.');
taxatree_vec.push_back(taxid_vec);
longest_lineage = std::max(taxid_vec.size(), longest_lineage);
}
else {
// remove taxa 0 but add counts to total_counts
// ToDo: make count_vec into lst for removal optimization
auto it = std::find(taxatree_str_vec.begin(), taxatree_str_vec.end(), taxatree_str);
auto no_taxa_index = std::distance(taxatree_str_vec.begin(), it);
null_counts += count_vec[no_taxa_index];
count_vec.erase(count_vec.begin() + no_taxa_index);
}
}
for (const size_t& count : count_vec) {
total_counts += count;
}
size_t min_majority = total_counts/2.0;
//min_majority = 3; //temporary for testing purposes ToDo: remove this one.
// find index of taxid with highest count
auto most_common_index = std::distance(count_vec.begin(),
std::max_element(count_vec.begin(), count_vec.end()));
if (count_vec[most_common_index] > min_majority and total_counts > (0.2*(total_counts + null_counts))){
return taxatree_vec[most_common_index].back();
}
return 0;
}
TaxId last_common_ancestor(std::vector<string> &taxatree_vec, std::vector<size_t> &count_vec) {
VERIFY_MSG(taxatree_vec.size() == count_vec.size(), "ERROR: taxatree_vec.size is not count_vec.size during lca");
// wrapper to change underlying lca_algorithm.
TaxId lca = 0;
lca = majority_vote_lca(taxatree_vec, count_vec);
return lca;
}
void assign_taxonomy_to_edges(barcode_index::FrameBarcodeIndex<debruijn_graph::DeBruijnGraph>& barcodeindex,
const FrameBarcodeIndexInfoExtractor& extractor){
//for edge in edge_iterator
// get taxid_lst + count_lst
// taxid_lst to taxatree_lst
// LCA = lca_method(taxatree_lst, count_lst)
// edge.SetTaxonomy(LCA)
for ( auto &p : barcodeindex.edge_to_entry_ ) {
std::vector<string> taxatree_vector;
std::vector<size_t> count_vector;
EdgeId edge;
TaxId taxid;
std::string taxatree;
size_t count;
edge = p.first; // edgeID p.first; edgeEntry p.second
typename FrameBarcodeIndexInfoExtractor::taxid_distribution_t::const_iterator taxid_it;
for (taxid_it = extractor.taxid_iterator_begin(edge); taxid_it != extractor.taxid_iterator_end(edge);
taxid_it++ ) {
taxid = taxid_it->first;
taxatree = barcodeindex.taxatree_codes_.GetTaxaTree(taxid);
taxatree_vector.push_back(taxatree);
count = extractor.GetTaxidCount(edge, taxid);
count_vector.push_back(count);
}
TaxId lca = last_common_ancestor(taxatree_vector, count_vector);
p.second.SetTaxonomy(lca);
INFO("EdgeId: " << edge);
for (size_t i = 0; i != taxatree_vector.size(); i++ ){
INFO("TaxaTree: " << taxatree_vector[i] << ", Count: " << count_vector[i]);
}
INFO("Taxonomy: " << barcodeindex.edge_to_entry_.at(edge).GetTaxonomy())
}
}
void BarcodeMapConstructionStage::run(debruijn_graph::conj_graph_pack &graph_pack, const char *) {
using path_extend::read_cloud::fragment_statistics::ClusterStatisticsExtractorHelper;
......@@ -44,7 +137,8 @@ namespace debruijn_graph {
mapper_builder.FillMap(reads, graph_pack.index, graph_pack.kmer_mapper);
INFO("Barcode index construction finished.");
FrameBarcodeIndexInfoExtractor extractor(graph_pack.barcode_mapper, graph_pack.g);
assign_taxonomy_to_edges(graph_pack.barcode_mapper, extractor); // function remade as method of barcode_mapper
INFO("Taxonomy assigned to all edges");
size_t length_threshold = cfg::get().pe_params.read_cloud.long_edge_length_lower_bound;
INFO("Average barcode coverage: " + std::to_string(extractor.AverageBarcodeCoverage(length_threshold)));
ClusterStatisticsExtractorHelper cluster_extractor_helper(graph_pack.g, graph_pack.barcode_mapper,
......@@ -57,10 +151,4 @@ namespace debruijn_graph {
// graph_pack.read_cloud_distribution_pack = distribution_pack;
}
void test_barcode_insertions(){
auto copy_taxid = [&result](const taxid_info_pair_t& edge)
{result.push_back(entry.first); }; //lambda function, .first is key
std::for_each(taxid_iterator_begin(edge), taxid_iterator_end(edge), copy_taxid);
return result;
}
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment