@@ -127,89 +127,88 @@ class ApproxKmerIndexer {
127127 [[nodiscard]] KmerIndexes GetKmerIndexes (const std::vector<Contig> &contigs,
128128 const kmer_filter::KmerFilter &kmer_filter,
129129 logging::Logger &logger) const {
130- KmerIndexes kmer_indexes;
131- for (auto it = contigs.cbegin (); it!= contigs.cend (); ++it) {
132- const Contig &contig{*it};
133- logger.info () << " Creating index for contig " << contig.id << " \n " ;
134- kmer_indexes.emplace_back (GetKmerIndex (contig,
135- kmer_filter,
136- it - contigs.cbegin (),
137- logger));
138- }
139- return kmer_indexes;
130+ KmerIndexes kmer_indexes;
131+ for (auto it = contigs.cbegin (); it != contigs.cend (); ++it) {
132+ const Contig &contig{*it};
133+ logger.info () << " Creating index for contig " << contig.id << " \n " ;
134+ kmer_indexes.emplace_back (GetKmerIndex (contig,
135+ kmer_filter,
136+ it - contigs.cbegin (),
137+ logger));
138+ }
139+ return kmer_indexes;
140140 }
141141
142- void BanHighFreqUniqueKmers (const std::vector<Contig> &contigs,
143- const std::vector<Contig> &readset,
144- KmerIndexes &kmer_indexes,
145- logging::Logger &logger) const {
146-
147- // ban unique k-mers in assembly that have unusually high coverage
148- const double coverage
149- {tools::common::coverage_utils::get_coverage (contigs, readset)};
150- const uint max_read_freq = std::max (1 .,
151- ceil (kmer_indexer_params
152- .careful_upper_bnd_cov_mult
153- *coverage));
154-
155- Counter kmer_cnt;
156- for (auto it = readset.begin (); it!=readset.end (); ++it) {
157- logger.trace () << it - readset.begin () << " " << readset.size ()
158- << " \n " ;
159- const Contig &contig = *it;
160- if (contig.size () < hasher.k ) {
161- continue ;
162- }
163- KWH<htype> kwh (hasher, contig.seq , 0 );
164- while (true ) {
165- if (!kwh.hasNext ()) {
166- break ;
167- }
168- kwh = kwh.next ();
169- const htype fhash = kwh.get_fhash ();
170- const htype rhash = kwh.get_rhash ();
171- for (const htype hash : std::vector<htype>{fhash, rhash}) {
172- bool is_unique = false ;
173- for (const KmerIndex &index : kmer_indexes) {
174- auto it = index.find (hash);
175- if (it!=index.end () and it->second .size ()==1 ) {
176- is_unique = true ;
177- break ;
178- }
179- }
180- if (is_unique) {
181- kmer_cnt[hash] += 1 ;
182- }
183- }
142+ void BanHighFreqUniqueKmers (const std::vector<Contig> &contigs,
143+ const std::vector<Contig> &readset,
144+ KmerIndexes &kmer_indexes,
145+ logging::Logger &logger) const {
146+
147+ // ban unique k-mers in assembly that have unusually high coverage
148+ const double coverage{tools::common::coverage_utils::get_coverage (contigs, readset)};
149+ const uint max_read_freq = std::max (1 .,
150+ ceil (kmer_indexer_params
151+ .careful_upper_bnd_cov_mult
152+ * coverage));
153+
154+ Counter kmer_cnt;
155+ for (auto it = readset.begin (); it != readset.end (); ++it) {
156+ logger.trace () << it - readset.begin () << " " << readset.size ()
157+ << " \n " ;
158+ const Contig &contig = *it;
159+ if (contig.size () < hasher.k ) {
160+ continue ;
161+ }
162+ KWH<htype> kwh (hasher, contig.seq , 0 );
163+ while (true ) {
164+ if (!kwh.hasNext ()) {
165+ break ;
166+ }
167+ kwh = kwh.next ();
168+ const htype fhash = kwh.get_fhash ();
169+ const htype rhash = kwh.get_rhash ();
170+ for (const htype hash : std::vector<htype>{fhash, rhash}) {
171+ bool is_unique = false ;
172+ for (const KmerIndex &index : kmer_indexes) {
173+ auto it = index.find (hash);
174+ if (it != index.end () and it->second .size () == 1 ) {
175+ is_unique = true ;
176+ break ;
184177 }
178+ }
179+ if (is_unique) {
180+ kmer_cnt[hash] += 1 ;
181+ }
185182 }
183+ }
184+ }
186185
187- uint64_t n{0 };
188- for (auto &[hash, cnt] : kmer_cnt) {
189- if (cnt > max_read_freq) {
190- for (KmerIndex &index : kmer_indexes) {
191- auto it = index.find (hash);
192- if (it!=index.end ()) {
193- index.erase (it);
194- break ;
195- }
196- }
197- ++n;
198- }
186+ uint64_t n{0 };
187+ for (auto &[hash, cnt] : kmer_cnt) {
188+ if (cnt > max_read_freq) {
189+ for (KmerIndex &index : kmer_indexes) {
190+ auto it = index.find (hash);
191+ if (it != index.end ()) {
192+ index.erase (it);
193+ break ;
194+ }
199195 }
200- logger.info () << " Filtered " << n << " high multiplicity k-mers\n " ;
196+ ++n;
197+ }
201198 }
199+ logger.info () << " Filtered " << n << " high multiplicity k-mers\n " ;
200+ }
202201
203202 public:
204- ApproxKmerIndexer (const size_t nthreads,
205- const RollingHash<htype> &hasher,
206- const Config::CommonParams &common_params,
207- const Config::KmerIndexerParams &kmer_indexer_params)
208- : nthreads{nthreads},
209- hasher{hasher},
210- common_params{common_params},
211- kmer_indexer_params{
212- kmer_indexer_params} {}
203+ ApproxKmerIndexer (const size_t nthreads,
204+ const RollingHash<htype> &hasher,
205+ const Config::CommonParams &common_params,
206+ const Config::KmerIndexerParams &kmer_indexer_params)
207+ : nthreads{nthreads},
208+ hasher{hasher},
209+ common_params{common_params},
210+ kmer_indexer_params{
211+ kmer_indexer_params} {}
213212
214213 ApproxKmerIndexer (const ApproxKmerIndexer &) = delete ;
215214 ApproxKmerIndexer (ApproxKmerIndexer &&) = delete ;
@@ -219,22 +218,21 @@ class ApproxKmerIndexer {
219218 [[nodiscard]] KmerIndexes extract (const std::vector<Contig> &contigs,
220219 const std::optional<std::vector<Contig>> &readset_optional,
221220 logging::Logger &logger) const {
222- const kmer_filter::KmerFilterBuilder kmer_filter_builder
223- {nthreads, hasher, common_params, kmer_indexer_params};
224- logger.info () << " Creating kmer filter\n " ;
225- const kmer_filter::KmerFilter
226- kmer_filter = kmer_filter_builder.GetKmerFilter (contigs, logger);
221+ const kmer_filter::KmerFilterBuilder kmer_filter_builder{nthreads, hasher, common_params, kmer_indexer_params};
222+ logger.info () << " Creating kmer filter\n " ;
223+ const kmer_filter::KmerFilter
224+ kmer_filter = kmer_filter_builder.GetKmerFilter (contigs, logger);
225+ logger.info ()
226+ << " Finished creating kmer filter. Using it to build kmer indexes\n " ;
227+ KmerIndexes kmer_indexes = GetKmerIndexes (contigs, kmer_filter, logger);
228+ if (readset_optional.has_value ()) {
229+ // Careful mode
227230 logger.info ()
228- << " Finished creating kmer filter. Using it to build kmer indexes\n " ;
229- KmerIndexes kmer_indexes = GetKmerIndexes (contigs, kmer_filter, logger);
230- if (readset_optional.has_value ()) {
231- // Careful mode
232- logger.info ()
233- << " Careful mode requested. Filtering high multiplicity unique k-mers\n " ;
234- const std::vector<Contig> &readset = readset_optional.value ();
235- BanHighFreqUniqueKmers (contigs, readset, kmer_indexes, logger);
236- }
237- return kmer_indexes;
231+ << " Careful mode requested. Filtering high multiplicity unique k-mers\n " ;
232+ const std::vector<Contig> &readset = readset_optional.value ();
233+ BanHighFreqUniqueKmers (contigs, readset, kmer_indexes, logger);
234+ }
235+ return kmer_indexes;
238236 }
239237};
240238
0 commit comments