Skip to content

Commit d5e7618

Browse files
authored
Merge pull request #1756 from dib-lab/fix/document-cqf
[MRG] Document QFCounttable
2 parents ccda093 + da8cf6f commit d5e7618

File tree

4 files changed

+40
-17
lines changed

4 files changed

+40
-17
lines changed

khmer/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,8 @@
8686

8787

8888
_buckets_per_byte = {
89+
# calculated by hand from settings in third-party/cqf/gqf.h
90+
'qfcounttable': 1/1.26,
8991
'countgraph': 1,
9092
'smallcountgraph': 2,
9193
'nodegraph': 8,

khmer/_oxli/graphs.pxd

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,9 @@ cdef extern from "khmer/_cpy_khmer.hh":
3737
cdef extern from "oxli/storage.hh":
3838
cdef cppclass CpStorage "oxli::Storage":
3939
CpStorage()
40-
40+
4141
vector[uint64_t] get_tablesizes()
42-
const size_t n_tables()
42+
const size_t n_tables()
4343
void save(string, WordLength)
4444
void load(string, WordLength&)
4545
const uint64_t n_occupied()
@@ -85,11 +85,11 @@ cdef extern from "oxli/hashtable.hh" namespace "oxli":
8585
void consume_seqfile_banding[SeqIO](shared_ptr[CpReadParser[SeqIO]]&,
8686
uint32_t, uint32_t, uint32_t &, uint64_t &) except +oxli_raise_py_error
8787

88-
void consume_seqfile_banding_with_mask[SeqIO](const string &, uint32_t, uint32_t,
89-
CpHashtable *, uint32_t, uint32_t &,
88+
void consume_seqfile_banding_with_mask[SeqIO](const string &, uint32_t, uint32_t,
89+
CpHashtable *, uint32_t, uint32_t &,
9090
uint64_t &) except +oxli_raise_py_error
9191
void consume_seqfile_banding_with_mask[SeqIO](shared_ptr[CpReadParser[SeqIO]]&,
92-
uint32_t, uint32_t,
92+
uint32_t, uint32_t,
9393
CpHashtable *, uint32_t,
9494
uint32_t &, uint64_t &) except +oxli_raise_py_error
9595

@@ -132,7 +132,7 @@ cdef extern from "oxli/hashtable.hh" namespace "oxli":
132132
CpNodetable(WordLength, vector[uint64_t])
133133

134134
cdef cppclass CpQFCounttable "oxli::QFCounttable" (CpHashtable):
135-
CpQFCounttable(WordLength, int) except +oxli_raise_py_error
135+
CpQFCounttable(WordLength, uint64_t) except +oxli_raise_py_error
136136

137137

138138
cdef extern from "oxli/hashgraph.hh" namespace "oxli":

khmer/_oxli/graphs.pyx

Lines changed: 31 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ cdef CpHashtable * hashtable_arg_shim(object table,
4545
SmallCounttable, QFCounttable)):
4646
cdef CPyHashtable_Object* cpyhashtable
4747
cdef CpHashtable * hashtable
48-
48+
4949
if isinstance(table, allowed):
5050
if isinstance(table, CYTHON_TABLES):
5151
hashtable = (<Hashtable>table).c_table.get()
@@ -226,7 +226,7 @@ cdef class Hashtable:
226226
total_reads,
227227
n_consumed)
228228
return total_reads, n_consumed
229-
229+
230230
def consume_seqfile_banding(self, file_name, num_bands, band):
231231
"""Count all k-mers from file_name."""
232232
cdef unsigned long long n_consumed = 0
@@ -270,7 +270,7 @@ cdef class Hashtable:
270270
"""Calculate the k-mer abundance distribution over reads."""
271271
cdef CpHashtable * cptracking = hashtable_arg_shim(tracking,
272272
allowed=(PyNodegraph, Nodetable))
273-
273+
274274
cdef CPyReadParser_Object* parser
275275
parser = <CPyReadParser_Object*>read_parser
276276
cdef uint64_t * x = deref(self.c_table).abundance_distribution[CpFastxReader](
@@ -311,14 +311,36 @@ cdef class Hashtable:
311311

312312

313313
cdef class QFCounttable(Hashtable):
314-
def __cinit__(self, int k, int starting_size):
315-
# starting size has to be a power of two
316-
power_of_two = ((starting_size & (starting_size - 1) == 0) and
317-
(starting_size != 0))
314+
"""Count k-mers using a counting quotient filter.
315+
316+
The counting quotient filter (CQF) is an extension of the quotient filter
317+
that supports counting in addition to simple membership testing. A CQF has
318+
better cache locality compared to (Small)Counttable which increases
319+
performance.
320+
321+
Each new k-mer uses one slot, and the number of slots used per k-mer
322+
increases the more often the same k-mer is entered into the CQF. As a result
323+
the CQF can become "full" and will then stop accepting calls to `add` and `count`.
324+
325+
Parameters
326+
----------
327+
k : integer
328+
k-mer size
329+
330+
size : integer
331+
Number of slots used by the counting quotient filter; must be a power of two. This
332+
determines the amount of memory used and how many k-mers can be entered
333+
into the data structure. Each slot uses roughly 1.3 bytes.
334+
"""
335+
def __cinit__(self, int k, uint64_t size):
336+
# size has to be a power of two
337+
power_of_two = ((size & (size - 1) == 0) and
338+
(size != 0))
318339
if not power_of_two:
319-
raise ValueError("starting_size has to be a power of two.")
340+
raise ValueError("size has to be a power of two, not"
341+
" {}.".format(size))
320342
if type(self) is QFCounttable:
321-
self.c_table.reset(<CpHashtable*>new CpQFCounttable(k, int(log(starting_size, 2))))
343+
self.c_table.reset(<CpHashtable*>new CpQFCounttable(k, int(log(size, 2))))
322344

323345
@classmethod
324346
def load(cls, file_name):
@@ -346,5 +368,3 @@ cdef class Nodetable(Hashtable):
346368
if type(self) is Nodetable:
347369
primes = get_n_primes_near_x(n_tables, starting_size)
348370
self.c_table.reset(<CpHashtable*>new CpNodetable(k, primes))
349-
350-

tests/test_script_arguments.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ def test_check_tablespace(graph_type, buckets_per_byte):
109109

110110

111111
@pytest.mark.parametrize('graph_type,exp_buckets', [
112+
('qfcounttable', '2.4 million buckets'),
112113
('countgraph', '3.0 million buckets'),
113114
('smallcountgraph', '6.0 million buckets'),
114115
('nodegraph', '24.0 million buckets'),

0 commit comments

Comments
 (0)