|
1 | | -"""Matching functionality for filtering database samples based on field conditions.""" |
| 1 | +"""Dataset query utilities for filtering, ordering, and slicing samples.""" |
2 | 2 |
|
3 | 3 | from __future__ import annotations |
4 | 4 |
|
|
20 | 20 |
|
21 | 21 |
|
22 | 22 | class DatasetQuery: |
23 | | - """Class for executing querying on a dataset.""" |
| 23 | + """Class for executing a query on a dataset. |
| 24 | +
|
| 25 | + # Filtering, ordering, and slicing samples in a dataset |
| 26 | + Allows filtering, ordering, and slicing of samples in a dataset. |
| 27 | + This class can be accessed via calling `.query()` on a Dataset instance. |
| 28 | + ```python |
| 29 | + dataset: Dataset = ... |
| 30 | + query = dataset.query() |
| 31 | + ``` |
| 32 | + The `match()`, `order_by()`, and `slice()` methods can be chained in this order. |
| 33 | + You can also access the methods directly on the Dataset instance: |
| 34 | + ```python |
| 35 | + dataset.match(...) # shorthand for dataset.query().match(...) |
| 36 | + ``` |
| 37 | +
|
| 38 | + ## match() - Filtering samples |
| 39 | + Filtering is done via the `match()` method. |
| 40 | + ```python |
| 41 | + from lightly_studio.core.dataset_query import SampleField |
| 42 | +
|
| 43 | + query_1 = dataset.query().match(SampleField.width > 100) |
| 44 | + query_2 = dataset.query().match(SampleField.tags.contains('cat')) |
| 45 | + ``` |
| 46 | + AND and OR operators are available for combining multiple conditions. |
| 47 | + ```python |
| 48 | + from lightly_studio.core.dataset_query import SampleField, AND, OR |
| 49 | +
|
| 50 | + query = dataset.query().match( |
| 51 | + AND( |
| 52 | + SampleField.height < 200, |
| 53 | + OR( |
| 54 | + SampleField.file_name == 'image.png', |
| 55 | + SampleField.file_name == 'image2.png', |
| 56 | + ) |
| 57 | + ) |
| 58 | + ) |
| 59 | + ``` |
| 60 | +
|
| 61 | + ## order_by() - Ordering samples |
| 62 | + The results can be ordered by using `order_by()`. For tie-breaking, multiple fields |
| 63 | + can be provided. The first field has the highest priority. The default is |
| 64 | + ascending order. To order in descending order, use `OrderByField(...).desc()`. |
| 65 | + ```python |
| 66 | + from lightly_studio.core.dataset_query import OrderByField, SampleField |
| 67 | + query = query.order_by( |
| 68 | + OrderByField(SampleField.width), |
| 69 | + OrderByField(SampleField.file_name).desc() |
| 70 | + ) |
| 71 | + ``` |
| 72 | +
|
| 73 | + ## slice() - Slicing samples |
| 74 | + Slicing can be applied via `slice()` or bracket notation. |
| 75 | + ```python |
| 76 | + query = query.slice(offset=10, limit=20) |
| 77 | + query = query[10:30] # equivalent to slice(offset=10, limit=20) |
| 78 | + ``` |
| 79 | +
|
| 80 | + # Usage of the filtered, ordered and sliced query |
| 81 | +
|
| 82 | + ## Iterating and converting to list |
| 83 | + Finally, the query can be executed by iterating over it or converting it to a list. |
| 84 | + ```python |
| 85 | + for sample in query: |
| 86 | + print(sample.file_name) |
| 87 | + samples = query.to_list() |
| 88 | + ``` |
| 89 | + The samples returned are instances of the `Sample` class. |
| 90 | +
|
| 91 | + ## Adding tags to matching samples |
| 92 | + The filtered set can also be used to add a tag to all matching samples. |
| 93 | + ```python |
| 94 | + query.add_tag('my_tag') |
| 95 | + ``` |
| 96 | +
|
| 97 | + ## Selecting a subset of samples using smart selection |
| 98 | + A Selection interface can be created from the current query results. It only selects |
| 99 | + from the samples that match the query at the time `selection()` is called. |
| 100 | + ```python |
| 101 | + # Choosing 100 diverse samples from the 'cat' tag. |
| 102 | + # Save them under the tag name "diverse_cats". |
| 103 | + selection = dataset.query().match( |
| 104 | + SampleField.tags.contains('cat') |
| 105 | + ).selection() |
| 106 | + selection.diverse(100, "diverse_cats") |
| 107 | + ``` |
| 108 | +
|
| 109 | + ## Exporting the query results |
| 110 | + An export interface can be created from the current query results. |
| 111 | + ```python |
| 112 | + export = dataset.query().match(...).export() |
| 113 | + export.to_coco_object_detections('/path/to/coco.json') |
| 114 | + ``` |
| 115 | + """ |
24 | 116 |
|
25 | 117 | def __init__(self, dataset: DatasetTable, session: Session) -> None: |
26 | 118 | """Initialize with dataset and database session. |
|
0 commit comments