Skip to content

Commit e7fb7e7

Browse files
authored
Reimplement careful with optimized framework (#15)
* Add optional for readset in kmer index construction
* Revive careful mode
* Remove unnecessary call in tests
1 parent 9f7f859 commit e7fb7e7

File tree

4 files changed

+215
-193
lines changed

4 files changed

+215
-193
lines changed

veritymap/src/projects/veritymap/kmer_index/approx_kmer_indexer.hpp

Lines changed: 98 additions & 109 deletions
Original file line numberDiff line numberDiff line change
@@ -127,126 +127,115 @@ class ApproxKmerIndexer {
127127
[[nodiscard]] KmerIndexes GetKmerIndexes(const std::vector<Contig> &contigs,
128128
const kmer_filter::KmerFilter &kmer_filter,
129129
logging::Logger &logger) const {
130-
KmerIndexes kmer_indexes;
131-
for (auto it = contigs.cbegin(); it != contigs.cend(); ++it) {
132-
const Contig &contig{*it};
133-
logger.info() << "Creating index for contig " << contig.id << "\n";
134-
kmer_indexes.emplace_back(GetKmerIndex(contig, kmer_filter, it - contigs.cbegin(), logger));
135-
}
136-
return kmer_indexes;
130+
KmerIndexes kmer_indexes;
131+
for (auto it = contigs.cbegin(); it!=contigs.cend(); ++it) {
132+
const Contig &contig{*it};
133+
logger.info() << "Creating index for contig " << contig.id << "\n";
134+
kmer_indexes.emplace_back(GetKmerIndex(contig,
135+
kmer_filter,
136+
it - contigs.cbegin(),
137+
logger));
138+
}
139+
return kmer_indexes;
137140
}
138141

142+
void BanHighFreqUniqueKmers(const std::vector<Contig> &contigs,
143+
const std::vector<Contig> &readset,
144+
KmerIndexes &kmer_indexes,
145+
logging::Logger &logger) const {
146+
147+
// ban unique k-mers in assembly that have unusually high coverage
148+
const double coverage
149+
{tools::common::coverage_utils::get_coverage(contigs, readset)};
150+
const uint max_read_freq = std::max(1.,
151+
ceil(kmer_indexer_params
152+
.careful_upper_bnd_cov_mult
153+
*coverage));
154+
155+
Counter kmer_cnt;
156+
for (auto it = readset.begin(); it!=readset.end(); ++it) {
157+
logger.trace() << it - readset.begin() << " " << readset.size()
158+
<< "\n";
159+
const Contig &contig = *it;
160+
if (contig.size() < hasher.k) {
161+
continue;
162+
}
163+
KWH<htype> kwh(hasher, contig.seq, 0);
164+
while (true) {
165+
if (!kwh.hasNext()) {
166+
break;
167+
}
168+
kwh = kwh.next();
169+
const htype fhash = kwh.get_fhash();
170+
const htype rhash = kwh.get_rhash();
171+
for (const htype hash : std::vector<htype>{fhash, rhash}) {
172+
bool is_unique = false;
173+
for (const KmerIndex &index : kmer_indexes) {
174+
auto it = index.find(hash);
175+
if (it!=index.end() and it->second.size()==1) {
176+
is_unique = true;
177+
break;
178+
}
179+
}
180+
if (is_unique) {
181+
kmer_cnt[hash] += 1;
182+
}
183+
}
184+
}
185+
}
186+
187+
uint64_t n{0};
188+
for (auto &[hash, cnt] : kmer_cnt) {
189+
if (cnt > max_read_freq) {
190+
for (KmerIndex &index : kmer_indexes) {
191+
auto it = index.find(hash);
192+
if (it!=index.end()) {
193+
index.erase(it);
194+
break;
195+
}
196+
}
197+
++n;
198+
}
199+
}
200+
logger.info() << "Filtered " << n << " high multiplicity k-mers\n";
201+
}
202+
139203
public:
140-
ApproxKmerIndexer(const size_t nthreads,
141-
const RollingHash<htype> &hasher,
142-
const Config::CommonParams &common_params,
143-
const Config::KmerIndexerParams &kmer_indexer_params) : nthreads{nthreads},
144-
hasher{hasher},
145-
common_params{common_params},
146-
kmer_indexer_params{
147-
kmer_indexer_params} {}
204+
ApproxKmerIndexer(const size_t nthreads,
205+
const RollingHash<htype> &hasher,
206+
const Config::CommonParams &common_params,
207+
const Config::KmerIndexerParams &kmer_indexer_params)
208+
: nthreads{nthreads},
209+
hasher{hasher},
210+
common_params{common_params},
211+
kmer_indexer_params{
212+
kmer_indexer_params} {}
148213

149214
ApproxKmerIndexer(const ApproxKmerIndexer &) = delete;
150215
ApproxKmerIndexer(ApproxKmerIndexer &&) = delete;
151216
ApproxKmerIndexer &operator=(const ApproxKmerIndexer &) = delete;
152217
ApproxKmerIndexer &operator=(ApproxKmerIndexer &&) = delete;
153218

154-
// TODO add careful mode
155-
// TODO change readset to optional
156219
[[nodiscard]] KmerIndexes extract(const std::vector<Contig> &contigs,
157-
const std::vector<Contig> &readset,
220+
const std::optional<std::vector<Contig>> &readset_optional,
158221
logging::Logger &logger) const {
159-
const kmer_filter::KmerFilterBuilder kmer_filter_builder{nthreads, hasher, common_params, kmer_indexer_params};
160-
logger.info() << "Creating kmer filter\n";
161-
const kmer_filter::KmerFilter kmer_filter = kmer_filter_builder.GetKmerFilter(contigs, logger);
162-
logger.info() << "Finished creating kmer filter. Using it to build kmer indexes\n";
163-
KmerIndexes kmer_indexes = GetKmerIndexes(contigs, kmer_filter, logger);
164-
return kmer_indexes;
222+
const kmer_filter::KmerFilterBuilder kmer_filter_builder
223+
{nthreads, hasher, common_params, kmer_indexer_params};
224+
logger.info() << "Creating kmer filter\n";
225+
const kmer_filter::KmerFilter
226+
kmer_filter = kmer_filter_builder.GetKmerFilter(contigs, logger);
227+
logger.info()
228+
<< "Finished creating kmer filter. Using it to build kmer indexes\n";
229+
KmerIndexes kmer_indexes = GetKmerIndexes(contigs, kmer_filter, logger);
230+
if (readset_optional.has_value()) {
231+
// Careful mode
232+
logger.info()
233+
<< "Careful mode requested. Filtering high multiplicity unique k-mers\n";
234+
const std::vector<Contig> &readset = readset_optional.value();
235+
BanHighFreqUniqueKmers(contigs, readset, kmer_indexes, logger);
236+
}
237+
return kmer_indexes;
165238
}
166239
};
167240

168-
}// End namespace veritymap::kmer_index::approx_kmer_indexer
169-
170-
// uint64_t get_n_unique_kmers() {
171-
// using namespace veritymap::kmer_index::kmer_filter;
172-
// uint64_t n_unique_kmers{0};
173-
// for (auto [itcontig, itsc] = std::pair{contigs.cbegin(), approx_kmer_indexer.cbegin()};
174-
// itcontig != contigs.cend();
175-
// ++itcontig, ++itsc) {
176-
// const Contig &contig = *itcontig;
177-
// const SketchContig<htype> &sketch_contig = *itsc;
178-
// if (contig.size() < hasher.k) {
179-
// continue;
180-
// }
181-
// KWH<htype> kwh(hasher, contig.seq, 0);
182-
// while (true) {
183-
// const htype fhash = kwh.get_fhash();
184-
// const htype rhash = kwh.get_rhash();
185-
//
186-
// const KmerType kmer_type = get_kmer_type(fhash, rhash, sketch_contig, ban_filter, max_cnt);
187-
// if (kmer_type == KmerType::unique) {
188-
// ++n_unique_kmers;
189-
// }
190-
//
191-
// if (!kwh.hasNext()) {
192-
// break;
193-
// }
194-
// kwh = kwh.next();
195-
// }
196-
// }
197-
// return n_unique_kmers;
198-
// }
199-
// void ban_high_freq_unique_kmers(const std::vector<Contig> & contigs_,
200-
// const std::vector<Contig> & readset,
201-
// const double exp_base,
202-
// const int nhash,
203-
// const uint32_t nthreads) {
204-
// // If read-set is not empty, we additionally ban unique k-mers in assembly that have unusually high coverage
205-
// if (readset.empty())
206-
// return;
207-
208-
// uint64_t n_unique_kmers { get_n_unique_kmers() };
209-
210-
// const double coverage { tools::common::coverage_utils::get_coverage(contigs_, readset) };
211-
212-
// const uint max_read_freq = std::max(1., ceil(careful_upper_bnd_cov_mult * coverage));
213-
// const int nbits = std::max(1., ceil(log2(max_read_freq)));
214-
// const int l2sz = ceil(log2(
215-
// std::exp(exp_base) * ((double) n_unique_kmers)
216-
// ));
217-
218-
// sketch::cm::ccm_t cms {nbits, l2sz, nhash};
219-
220-
// for (const Contig & contig : readset) {
221-
// if (contig.size() < hasher.k) {
222-
// continue;
223-
// }
224-
// KWH<htype> kwh(hasher, contig.seq, 0);
225-
// while(true) {
226-
// const htype fhash = kwh.get_fhash();
227-
// const htype rhash = kwh.get_rhash();
228-
// std::vector<std::pair<htype, htype>> hashes { { fhash, rhash }, { rhash, fhash } };
229-
// for (const auto [x, y] : hashes) {
230-
// kmer_type::KmerType kmer_type =
231-
// veritymap::kmer_index::kmer_type::get_kmer_type(x, y,
232-
// approx_kmer_indexer,
233-
// ban_filter,
234-
// max_cnt);
235-
// if (kmer_type == kmer_type::KmerType::unique) {
236-
// if (ban_filter.contains((x))) {
237-
// continue;
238-
// } else {
239-
// cms.add(x);
240-
// if (cms.est_count(x) == max_read_freq) {
241-
// ban_filter.insert(x);
242-
// }
243-
// }
244-
// }
245-
// }
246-
// if (!kwh.hasNext()) {
247-
// break;
248-
// }
249-
// kwh = kwh.next();
250-
// }
251-
// }
252-
// }
241+
}// End namespace veritymap::kmer_index::approx_kmer_indexer

veritymap/src/projects/veritymap/kmer_index/target_indexer.hpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,15 @@ namespace veritymap::kmer_index {
1818
using Counter = std::unordered_map<Config::HashParams::htype, size_t>;
1919

2020
kmer_index::IndexedContigs
21-
get_indexed_targets(const std::vector<Contig>& queries,
22-
const std::vector<Contig>& targets,
23-
const std::filesystem::path& outdir,
24-
const RollingHash<Config::HashParams::htype>& hasher,
21+
get_indexed_targets(const std::optional<std::vector<Contig>> &queries,
22+
const std::vector<Contig> &targets,
23+
const std::filesystem::path &outdir,
24+
const RollingHash<Config::HashParams::htype> &hasher,
2525
const size_t nthreads,
26-
logging::Logger& logger,
27-
const std::filesystem::path& index_path,
28-
const Config::CommonParams& common_params,
29-
const Config::KmerIndexerParams& kmer_indexer_params) {
26+
logging::Logger &logger,
27+
const std::filesystem::path &index_path,
28+
const Config::CommonParams &common_params,
29+
const Config::KmerIndexerParams &kmer_indexer_params) {
3030
using htype = Config::HashParams::htype;
3131
const auto kmer_indexes_fn = outdir / "kmer_indexes.tsv";
3232

veritymap/src/projects/veritymap/veritymap.cpp

Lines changed: 41 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,9 @@
1313
#include "version/version.hpp"
1414

1515
int main(int argc, char** argv) {
16-
CLParser parser{{"output-dir=", "target=", "queries=", "threads=40",
17-
"compress", "only-index", "careful", "index=none", "config=hifi"},
16+
CLParser parser{{"output-dir=", "target=", "queries=none", "threads=40",
17+
"compress", "only-index", "careful", "index=none",
18+
"config=hifi"},
1819
{},
1920
{"o=output-dir", "t=threads"}};
2021
parser.parseCL(argc, argv);
@@ -46,41 +47,46 @@ int main(int argc, char** argv) {
4647
std::time_t now = std::chrono::system_clock::to_time_t(time_point);
4748
logger << "Launch time: " << std::put_time(std::localtime(&now), "%c %Z") << std::endl;
4849

49-
std::stringstream cmd_ss;
50-
for (size_t i = 0; i < argc; i++) {
51-
cmd_ss << argv[i] << " ";
52-
}
53-
const std::string cmd = cmd_ss.str();
54-
logger << "CMD: " << cmd << std::endl;
55-
56-
const std::filesystem::path target_path =
57-
std::filesystem::canonical(parser.getValue("target"));
58-
const std::filesystem::path queries_path =
59-
std::filesystem::canonical(parser.getValue("queries"));
60-
61-
bool to_compress = parser.getCheck("compress");
62-
bool only_index = parser.getCheck("only-index");
63-
bool careful_mode = parser.getCheck("careful");
64-
65-
const std::filesystem::path index_path = [&parser] {
66-
std::filesystem::path index_path = parser.getValue("index");
67-
if (index_path != "none") {
68-
index_path = std::filesystem::canonical(index_path);
69-
} else {
70-
index_path = "";
50+
std::stringstream cmd_ss;
51+
for (size_t i = 0; i < argc; i++) {
52+
cmd_ss << argv[i] << " ";
7153
}
72-
return index_path;
73-
}();
74-
75-
const std::filesystem::path binary_path = argv[0];
76-
const std::filesystem::path config_fn = [&parser, &logger, &binary_path] {
77-
std::string config = parser.getValue("config");
78-
std::filesystem::path dirpath = binary_path.parent_path();
79-
if (config == "hifi") {
80-
return dirpath / "config/config_tm2_hifi.tsv";
81-
} else if (config == "ont") {
82-
return dirpath / "config/config_tm2_ont.tsv";
54+
const std::string cmd = cmd_ss.str();
55+
logger << "CMD: " << cmd << std::endl;
56+
57+
const std::filesystem::path target_path =
58+
std::filesystem::canonical(parser.getValue("target"));
59+
60+
auto get_path_w_def = [&parser](const std::string &parameter) {
61+
std::filesystem::path path = parser.getValue(parameter);
62+
if (path!="none") {
63+
path = std::filesystem::canonical(path);
64+
} else {
65+
path = "";
66+
}
67+
return path;
68+
};
69+
const std::filesystem::path queries_path = get_path_w_def("queries");
70+
71+
bool to_compress = parser.getCheck("compress");
72+
bool only_index = parser.getCheck("only-index");
73+
bool careful_mode = parser.getCheck("careful");
74+
if (careful_mode and queries_path=="") {
75+
std::cerr << "Cannot use careful mode if no queries are provided\n";
76+
return 1;
8377
}
78+
79+
const std::filesystem::path index_path = get_path_w_def("index");
80+
81+
const std::filesystem::path binary_path = argv[0];
82+
const std::filesystem::path config_fn = [&parser, &logger, &binary_path] {
83+
std::string config = parser.getValue("config");
84+
std::filesystem::path dirpath = binary_path.parent_path();
85+
if (config=="hifi") {
86+
return dirpath/"config/config_tm2_hifi.tsv";
87+
} else if (config=="ont") {
88+
return dirpath/"config/config_tm2_ont.tsv";
89+
}
8490
return static_cast<std::filesystem::path>(config);
8591
}();
8692
veritymap::Config config = veritymap::Config::load_config_file(config_fn);

0 commit comments

Comments
 (0)