@@ -127,126 +127,115 @@ class ApproxKmerIndexer {
127127 [[nodiscard]] KmerIndexes GetKmerIndexes (const std::vector<Contig> &contigs,
128128 const kmer_filter::KmerFilter &kmer_filter,
129129 logging::Logger &logger) const {
130- KmerIndexes kmer_indexes;
131- for (auto it = contigs.cbegin (); it != contigs.cend (); ++it) {
132- const Contig &contig{*it};
133- logger.info () << " Creating index for contig " << contig.id << " \n " ;
134- kmer_indexes.emplace_back (GetKmerIndex (contig, kmer_filter, it - contigs.cbegin (), logger));
135- }
136- return kmer_indexes;
130+ KmerIndexes kmer_indexes;
131+ for (auto it = contigs.cbegin (); it!=contigs.cend (); ++it) {
132+ const Contig &contig{*it};
133+ logger.info () << " Creating index for contig " << contig.id << " \n " ;
134+ kmer_indexes.emplace_back (GetKmerIndex (contig,
135+ kmer_filter,
136+ it - contigs.cbegin (),
137+ logger));
138+ }
139+ return kmer_indexes;
137140 }
138141
142+ void BanHighFreqUniqueKmers (const std::vector<Contig> &contigs,
143+ const std::vector<Contig> &readset,
144+ KmerIndexes &kmer_indexes,
145+ logging::Logger &logger) const {
146+
147+ // ban unique k-mers in assembly that have unusually high coverage
148+ const double coverage
149+ {tools::common::coverage_utils::get_coverage (contigs, readset)};
150+ const uint max_read_freq = std::max (1 .,
151+ ceil (kmer_indexer_params
152+ .careful_upper_bnd_cov_mult
153+ *coverage));
154+
155+ Counter kmer_cnt;
156+ for (auto it = readset.begin (); it!=readset.end (); ++it) {
157+ logger.trace () << it - readset.begin () << " " << readset.size ()
158+ << " \n " ;
159+ const Contig &contig = *it;
160+ if (contig.size () < hasher.k ) {
161+ continue ;
162+ }
163+ KWH<htype> kwh (hasher, contig.seq , 0 );
164+ while (true ) {
165+ if (!kwh.hasNext ()) {
166+ break ;
167+ }
168+ kwh = kwh.next ();
169+ const htype fhash = kwh.get_fhash ();
170+ const htype rhash = kwh.get_rhash ();
171+ for (const htype hash : std::vector<htype>{fhash, rhash}) {
172+ bool is_unique = false ;
173+ for (const KmerIndex &index : kmer_indexes) {
174+ auto it = index.find (hash);
175+ if (it!=index.end () and it->second .size ()==1 ) {
176+ is_unique = true ;
177+ break ;
178+ }
179+ }
180+ if (is_unique) {
181+ kmer_cnt[hash] += 1 ;
182+ }
183+ }
184+ }
185+ }
186+
187+ uint64_t n{0 };
188+ for (auto &[hash, cnt] : kmer_cnt) {
189+ if (cnt > max_read_freq) {
190+ for (KmerIndex &index : kmer_indexes) {
191+ auto it = index.find (hash);
192+ if (it!=index.end ()) {
193+ index.erase (it);
194+ break ;
195+ }
196+ }
197+ ++n;
198+ }
199+ }
200+ logger.info () << " Filtered " << n << " high multiplicity k-mers\n " ;
201+ }
202+
139203 public:
140- ApproxKmerIndexer (const size_t nthreads,
141- const RollingHash<htype> &hasher,
142- const Config::CommonParams &common_params,
143- const Config::KmerIndexerParams &kmer_indexer_params) : nthreads{nthreads},
144- hasher{hasher},
145- common_params{common_params},
146- kmer_indexer_params{
147- kmer_indexer_params} {}
204+ ApproxKmerIndexer (const size_t nthreads,
205+ const RollingHash<htype> &hasher,
206+ const Config::CommonParams &common_params,
207+ const Config::KmerIndexerParams &kmer_indexer_params)
208+ : nthreads{nthreads},
209+ hasher{hasher},
210+ common_params{common_params},
211+ kmer_indexer_params{
212+ kmer_indexer_params} {}
148213
149214 ApproxKmerIndexer (const ApproxKmerIndexer &) = delete ;
150215 ApproxKmerIndexer (ApproxKmerIndexer &&) = delete ;
151216 ApproxKmerIndexer &operator =(const ApproxKmerIndexer &) = delete ;
152217 ApproxKmerIndexer &operator =(ApproxKmerIndexer &&) = delete ;
153218
154- // TODO add careful mode
155- // TODO change readset to optional
156219 [[nodiscard]] KmerIndexes extract (const std::vector<Contig> &contigs,
157- const std::vector<Contig> &readset ,
220+ const std::optional<std:: vector<Contig>> &readset_optional ,
158221 logging::Logger &logger) const {
159- const kmer_filter::KmerFilterBuilder kmer_filter_builder{nthreads, hasher, common_params, kmer_indexer_params};
160- logger.info () << " Creating kmer filter\n " ;
161- const kmer_filter::KmerFilter kmer_filter = kmer_filter_builder.GetKmerFilter (contigs, logger);
162- logger.info () << " Finished creating kmer filter. Using it to build kmer indexes\n " ;
163- KmerIndexes kmer_indexes = GetKmerIndexes (contigs, kmer_filter, logger);
164- return kmer_indexes;
222+ const kmer_filter::KmerFilterBuilder kmer_filter_builder
223+ {nthreads, hasher, common_params, kmer_indexer_params};
224+ logger.info () << " Creating kmer filter\n " ;
225+ const kmer_filter::KmerFilter
226+ kmer_filter = kmer_filter_builder.GetKmerFilter (contigs, logger);
227+ logger.info ()
228+ << " Finished creating kmer filter. Using it to build kmer indexes\n " ;
229+ KmerIndexes kmer_indexes = GetKmerIndexes (contigs, kmer_filter, logger);
230+ if (readset_optional.has_value ()) {
231+ // Careful mode
232+ logger.info ()
233+ << " Careful mode requested. Filtering high multiplicity unique k-mers\n " ;
234+ const std::vector<Contig> &readset = readset_optional.value ();
235+ BanHighFreqUniqueKmers (contigs, readset, kmer_indexes, logger);
236+ }
237+ return kmer_indexes;
165238 }
166239};
167240
168- }// End namespace veritymap::kmer_index::approx_kmer_indexer
169-
170- // uint64_t get_n_unique_kmers() {
171- // using namespace veritymap::kmer_index::kmer_filter;
172- // uint64_t n_unique_kmers{0};
173- // for (auto [itcontig, itsc] = std::pair{contigs.cbegin(), approx_kmer_indexer.cbegin()};
174- // itcontig != contigs.cend();
175- // ++itcontig, ++itsc) {
176- // const Contig &contig = *itcontig;
177- // const SketchContig<htype> &sketch_contig = *itsc;
178- // if (contig.size() < hasher.k) {
179- // continue;
180- // }
181- // KWH<htype> kwh(hasher, contig.seq, 0);
182- // while (true) {
183- // const htype fhash = kwh.get_fhash();
184- // const htype rhash = kwh.get_rhash();
185- //
186- // const KmerType kmer_type = get_kmer_type(fhash, rhash, sketch_contig, ban_filter, max_cnt);
187- // if (kmer_type == KmerType::unique) {
188- // ++n_unique_kmers;
189- // }
190- //
191- // if (!kwh.hasNext()) {
192- // break;
193- // }
194- // kwh = kwh.next();
195- // }
196- // }
197- // return n_unique_kmers;
198- // }
199- // void ban_high_freq_unique_kmers(const std::vector<Contig> & contigs_,
200- // const std::vector<Contig> & readset,
201- // const double exp_base,
202- // const int nhash,
203- // const uint32_t nthreads) {
204- // // If read-set is not empty, we additionally ban unique k-mers in assembly that have unusually high coverage
205- // if (readset.empty())
206- // return;
207-
208- // uint64_t n_unique_kmers { get_n_unique_kmers() };
209-
210- // const double coverage { tools::common::coverage_utils::get_coverage(contigs_, readset) };
211-
212- // const uint max_read_freq = std::max(1., ceil(careful_upper_bnd_cov_mult * coverage));
213- // const int nbits = std::max(1., ceil(log2(max_read_freq)));
214- // const int l2sz = ceil(log2(
215- // std::exp(exp_base) * ((double) n_unique_kmers)
216- // ));
217-
218- // sketch::cm::ccm_t cms {nbits, l2sz, nhash};
219-
220- // for (const Contig & contig : readset) {
221- // if (contig.size() < hasher.k) {
222- // continue;
223- // }
224- // KWH<htype> kwh(hasher, contig.seq, 0);
225- // while(true) {
226- // const htype fhash = kwh.get_fhash();
227- // const htype rhash = kwh.get_rhash();
228- // std::vector<std::pair<htype, htype>> hashes { { fhash, rhash }, { rhash, fhash } };
229- // for (const auto [x, y] : hashes) {
230- // kmer_type::KmerType kmer_type =
231- // veritymap::kmer_index::kmer_type::get_kmer_type(x, y,
232- // approx_kmer_indexer,
233- // ban_filter,
234- // max_cnt);
235- // if (kmer_type == kmer_type::KmerType::unique) {
236- // if (ban_filter.contains((x))) {
237- // continue;
238- // } else {
239- // cms.add(x);
240- // if (cms.est_count(x) == max_read_freq) {
241- // ban_filter.insert(x);
242- // }
243- // }
244- // }
245- // }
246- // if (!kwh.hasNext()) {
247- // break;
248- // }
249- // kwh = kwh.next();
250- // }
251- // }
252- // }
241+ }// End namespace veritymap::kmer_index::approx_kmer_indexer
0 commit comments