Skip to content

Commit f25555b

Browse files
authored
Merge pull request #1746 from dib-lab/feature/consume_with_mask
[MRG] New consume function with mask
2 parents c19d0f1 + 2bae865 commit f25555b

File tree

8 files changed

+380
-4
lines changed

8 files changed

+380
-4
lines changed

CHANGELOG.md

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,15 @@ under semantic versioning, but will be in future versions of khmer.
1212
- Cython wrapper for liboxli.
1313
- Cython containers for parsing, assembly, and hashing.
1414
- Header install for liboxli.
15-
- New function `consume_fasta_banding` for bulk loading of sequences into
16-
hashtables. Only k-mers whose hashed values fall within a specified range are
17-
counted.
1815
- New storage class using a Counting Quotient Filter with improved cache
1916
locality over bloom filters.
17+
- New variants of the sequence bulk loading method with a "banding" mode and a
18+
"mask" mode. In "banding" mode, only k-mers whose hashed values fall within a
19+
specified range are counted. In "mask" mode, only k-mers not already pressent
20+
in the specified mask are counted.
21+
- `consume_seqfile_banding`
22+
- `consume_seqfile_with_mask`
23+
- `consume_seqfile_banding_with_mask`
2024

2125
### Changed
2226
- Non-ACTG handling significantly changed so that only bulk-loading functions

include/khmer/_cpy_hashtable.hh

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,19 @@ hashtable_count(khmer_KHashtable_Object * me, PyObject * args);
4747
PyObject *
4848
hashtable_consume_seqfile(khmer_KHashtable_Object * me, PyObject * args);
4949

50+
5051
PyObject *
5152
hashtable_consume_seqfile_banding(khmer_KHashtable_Object * me, PyObject * args);
5253

5354

55+
PyObject *
56+
hashtable_consume_seqfile_with_mask(khmer_KHashtable_Object * me, PyObject * args);
57+
58+
59+
PyObject *
60+
hashtable_consume_seqfile_banding_with_mask(khmer_KHashtable_Object * me, PyObject * args);
61+
62+
5463
PyObject *
5564
hashtable_consume_seqfile_with_reads_parser(khmer_KHashtable_Object * me,
5665
PyObject * args);

include/oxli/hashtable.hh

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,24 @@ public:
277277
unsigned long long &n_consumed
278278
);
279279

280+
template<typename SeqIO>
281+
void consume_seqfile_with_mask(
282+
std::string const &filename,
283+
Hashtable* mask,
284+
unsigned int threshold,
285+
unsigned int &total_reads,
286+
unsigned long long &n_consumed
287+
);
288+
289+
template<typename SeqIO>
290+
void consume_seqfile_with_mask(
291+
read_parsers::ReadParserPtr<SeqIO>& parser,
292+
Hashtable* mask,
293+
unsigned int threshold,
294+
unsigned int &total_reads,
295+
unsigned long long &n_consumed
296+
);
297+
280298
// Consume sequences in k-mer banding mode.
281299
template<typename SeqIO>
282300
void consume_seqfile_banding(
@@ -297,6 +315,28 @@ public:
297315
unsigned long long &n_consumed
298316
);
299317

318+
template<typename SeqIO>
319+
void consume_seqfile_banding_with_mask(
320+
std::string const &filename,
321+
unsigned int num_bands,
322+
unsigned int band,
323+
Hashtable* mask,
324+
unsigned int threshold,
325+
unsigned int &total_reads,
326+
unsigned long long &n_consumed
327+
);
328+
329+
template<typename SeqIO>
330+
void consume_seqfile_banding_with_mask(
331+
read_parsers::ReadParserPtr<SeqIO>& parser,
332+
unsigned int num_bands,
333+
unsigned int band,
334+
Hashtable* mask,
335+
unsigned int threshold,
336+
unsigned int &total_reads,
337+
unsigned long long &n_consumed
338+
);
339+
300340
void set_use_bigcount(bool b)
301341
{
302342
store->set_use_bigcount(b);

src/khmer/_cpy_hashtable.cc

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,16 @@ PyMethodDef khmer_hashtable_methods[] = {
114114
(PyCFunction)hashtable_consume_seqfile_banding, METH_VARARGS,
115115
"Consume sequences in k-mer banding mode"
116116
},
117+
{
118+
"consume_seqfile_with_mask",
119+
(PyCFunction)hashtable_consume_seqfile_with_mask, METH_VARARGS,
120+
"Consume any k-mers not present in the provided mask"
121+
},
122+
{
123+
"consume_seqfile_banding_with_mask",
124+
(PyCFunction)hashtable_consume_seqfile_banding_with_mask, METH_VARARGS,
125+
"Consume sequences in k-mer banding mode, with a mask"
126+
},
117127
{
118128
"consume_seqfile_with_reads_parser",
119129
(PyCFunction)hashtable_consume_seqfile_with_reads_parser, METH_VARARGS,
@@ -388,6 +398,71 @@ hashtable_consume_seqfile_banding(khmer_KHashtable_Object * me, PyObject * args)
388398
return Py_BuildValue("IK", total_reads, n_consumed);
389399
}
390400

401+
PyObject *
402+
hashtable_consume_seqfile_with_mask(khmer_KHashtable_Object * me, PyObject * args)
403+
{
404+
Hashtable * hashtable = me->hashtable;
405+
406+
const char * filename;
407+
khmer_KHashtable_Object *mask = NULL;
408+
unsigned int threshold = 0;
409+
410+
if (!PyArg_ParseTuple(args, "sO|I", &filename, &mask, &threshold)) {
411+
return NULL;
412+
}
413+
414+
// call the C++ function, and trap signals => Python
415+
unsigned long long n_consumed = 0;
416+
unsigned int total_reads = 0;
417+
try {
418+
hashtable->consume_seqfile_with_mask<FastxReader>(
419+
filename, mask->hashtable, threshold, total_reads, n_consumed
420+
);
421+
} catch (oxli_file_exception &exc) {
422+
PyErr_SetString(PyExc_OSError, exc.what());
423+
return NULL;
424+
} catch (oxli_value_exception &exc) {
425+
PyErr_SetString(PyExc_ValueError, exc.what());
426+
return NULL;
427+
}
428+
429+
return Py_BuildValue("IK", total_reads, n_consumed);
430+
}
431+
432+
PyObject *
433+
hashtable_consume_seqfile_banding_with_mask(khmer_KHashtable_Object * me, PyObject * args)
434+
{
435+
Hashtable * hashtable = me->hashtable;
436+
437+
const char * filename;
438+
unsigned int num_bands;
439+
unsigned int band;
440+
khmer_KHashtable_Object *mask = NULL;
441+
unsigned int threshold = 0;
442+
443+
if (!PyArg_ParseTuple(args, "sIIO|I", &filename, &num_bands, &band, &mask, &threshold)) {
444+
return NULL;
445+
}
446+
447+
// call the C++ function, and trap signals => Python
448+
unsigned long long n_consumed = 0;
449+
unsigned int total_reads = 0;
450+
try {
451+
hashtable->consume_seqfile_banding_with_mask<FastxReader>(
452+
filename, num_bands, band, mask->hashtable, threshold, total_reads,
453+
n_consumed
454+
);
455+
} catch (oxli_file_exception &exc) {
456+
PyErr_SetString(PyExc_OSError, exc.what());
457+
return NULL;
458+
} catch (oxli_value_exception &exc) {
459+
PyErr_SetString(PyExc_ValueError, exc.what());
460+
return NULL;
461+
}
462+
463+
return Py_BuildValue("IK", total_reads, n_consumed);
464+
}
465+
391466
PyObject *
392467
hashtable_consume_seqfile_with_reads_parser(khmer_KHashtable_Object * me,
393468
PyObject * args)

src/oxli/hashtable.cc

Lines changed: 151 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,14 +87,46 @@ void Hashtable::consume_seqfile_banding(
8787
n_consumed);
8888
}
8989

90+
template<typename SeqIO>
91+
void Hashtable::consume_seqfile_with_mask(
92+
std::string const &filename,
93+
Hashtable* mask,
94+
unsigned int threshold,
95+
unsigned int &total_reads,
96+
unsigned long long &n_consumed
97+
)
98+
{
99+
ReadParserPtr<SeqIO> parser = get_parser<SeqIO>(filename);
100+
consume_seqfile_with_mask<SeqIO>(
101+
parser, mask, threshold, total_reads, n_consumed
102+
);
103+
}
104+
105+
template<typename SeqIO>
106+
void Hashtable::consume_seqfile_banding_with_mask(
107+
std::string const &filename,
108+
unsigned int num_bands,
109+
unsigned int band,
110+
Hashtable* mask,
111+
unsigned int threshold,
112+
unsigned int &total_reads,
113+
unsigned long long &n_consumed
114+
)
115+
{
116+
ReadParserPtr<SeqIO> parser = get_parser<SeqIO>(filename);
117+
consume_seqfile_banding_with_mask<SeqIO>(
118+
parser, num_bands, band, mask, threshold, total_reads, n_consumed
119+
);
120+
}
121+
90122
template<typename SeqIO>
91123
void Hashtable::consume_seqfile(
92124
ReadParserPtr<SeqIO>& parser,
93125
unsigned int &total_reads,
94126
unsigned long long &n_consumed
95127
)
96128
{
97-
Read read;
129+
Read read;
98130

99131
// Iterate through the reads and consume their k-mers.
100132
while (!parser->is_complete( )) {
@@ -114,6 +146,43 @@ void Hashtable::consume_seqfile(
114146

115147
} // consume_seqfile
116148

149+
template<typename SeqIO>
150+
void Hashtable::consume_seqfile_with_mask(
151+
ReadParserPtr<SeqIO>& parser,
152+
Hashtable* mask,
153+
unsigned int threshold,
154+
unsigned int &total_reads,
155+
unsigned long long &n_consumed
156+
)
157+
{
158+
Read read;
159+
160+
// Iterate through the reads and consume their k-mers.
161+
while (!parser->is_complete( )) {
162+
try {
163+
read = parser->get_next_read( );
164+
} catch (NoMoreReadsAvailable) {
165+
break;
166+
}
167+
168+
read.set_clean_seq();
169+
unsigned int this_n_consumed = 0;
170+
KmerHashIteratorPtr kmers = new_kmer_iterator(read.cleaned_seq);
171+
while(!kmers->done()) {
172+
HashIntoType kmer = kmers->next();
173+
if (mask->get_count(kmer) <= threshold) {
174+
count(kmer);
175+
this_n_consumed++;
176+
}
177+
}
178+
179+
__sync_add_and_fetch( &n_consumed, this_n_consumed );
180+
__sync_add_and_fetch( &total_reads, 1 );
181+
182+
} // while reads left for parser
183+
184+
} // consume_seqfile_with_mask
185+
117186
template<typename SeqIO>
118187
void Hashtable::consume_seqfile_banding(
119188
ReadParserPtr<SeqIO>& parser,
@@ -152,6 +221,49 @@ void Hashtable::consume_seqfile_banding(
152221

153222
} // consume_seqfile_banding
154223

224+
template<typename SeqIO>
225+
void Hashtable::consume_seqfile_banding_with_mask(
226+
ReadParserPtr<SeqIO>& parser,
227+
unsigned int num_bands,
228+
unsigned int band,
229+
Hashtable* mask,
230+
unsigned int threshold,
231+
unsigned int &total_reads,
232+
unsigned long long &n_consumed
233+
)
234+
{
235+
Read read;
236+
std::pair<uint64_t, uint64_t> interval = compute_band_interval(num_bands,
237+
band);
238+
std::cerr << "DEBUGGGG threshold=" << threshold << '\n';
239+
240+
while (!parser->is_complete()) {
241+
try {
242+
read = parser->get_next_read( );
243+
} catch (NoMoreReadsAvailable) {
244+
break;
245+
}
246+
247+
read.set_clean_seq();
248+
unsigned int this_n_consumed = 0;
249+
KmerHashIteratorPtr kmers = new_kmer_iterator(read.cleaned_seq);
250+
while(!kmers->done()) {
251+
HashIntoType kmer = kmers->next();
252+
if (kmer >= interval.first && kmer < interval.second) {
253+
if (mask->get_count(kmer) <= threshold) {
254+
count(kmer);
255+
this_n_consumed++;
256+
}
257+
}
258+
}
259+
260+
__sync_add_and_fetch( &n_consumed, this_n_consumed );
261+
__sync_add_and_fetch( &total_reads, 1 );
262+
263+
} // while reads left for parser
264+
265+
} // consume_seqfile_banding_with_mask
266+
155267
//
156268
// consume_string: run through every k-mer in the given string, & hash it.
157269
//
@@ -528,6 +640,44 @@ template void Hashtable::consume_seqfile_banding<FastxReader>(
528640
unsigned long long &n_consumed
529641
);
530642

643+
template void Hashtable::consume_seqfile_with_mask<FastxReader>(
644+
std::string const &filename,
645+
Hashtable* mask,
646+
unsigned int threshold,
647+
unsigned int &total_reads,
648+
unsigned long long &n_consumed
649+
);
650+
651+
652+
template void Hashtable::consume_seqfile_with_mask<FastxReader>(
653+
ReadParserPtr<FastxReader>& parser,
654+
Hashtable* mask,
655+
unsigned int threshold,
656+
unsigned int &total_reads,
657+
unsigned long long &n_consumed
658+
);
659+
660+
template void Hashtable::consume_seqfile_banding_with_mask<FastxReader>(
661+
std::string const &filename,
662+
unsigned int num_bands,
663+
unsigned int bands,
664+
Hashtable* mask,
665+
unsigned int threshold,
666+
unsigned int &total_reads,
667+
unsigned long long &n_consumed
668+
);
669+
670+
671+
template void Hashtable::consume_seqfile_banding_with_mask<FastxReader>(
672+
ReadParserPtr<FastxReader>& parser,
673+
unsigned int num_bands,
674+
unsigned int bands,
675+
Hashtable* mask,
676+
unsigned int threshold,
677+
unsigned int &total_reads,
678+
unsigned long long &n_consumed
679+
);
680+
531681

532682
template uint64_t * Hashtable::abundance_distribution<FastxReader>(
533683
ReadParserPtr<FastxReader>& parser,

tests/test-data/seq-a.fa

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
>seq-a
2+
TAGATCTGCTTGAAACAAGTGGATTTGAGAAAAA

tests/test-data/seq-b.fa

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
>seq-b
2+
ATCTGCTTGAAACAAGTGGATTTGAGAAAAAAGT

0 commit comments

Comments
 (0)