Skip to content

Commit 39fa610

Browse files
committed
Added dot product everywhere where cosine similarity was used
1 parent 3bd19ef commit 39fa610

File tree

23 files changed

+321
-23
lines changed

23 files changed

+321
-23
lines changed

client-python/elastiknn/api.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ class Similarity(Enum):
1717
L1 = 3
1818
L2 = 4
1919
Cosine = 5
20+
Dot = 6
2021

2122

2223
class Vec:
@@ -144,7 +145,24 @@ def to_dict(self):
144145
"k": self.k
145146
}
146147
}
148+
149+
@dataclass(frozen=True)
150+
class DotLsh(Base):
151+
dims: int
152+
L: int
153+
k: int
147154

155+
def to_dict(self):
156+
return {
157+
"type": "elastiknn_dense_float_vector",
158+
"elastiknn": {
159+
"model": "lsh",
160+
"similarity": "dot",
161+
"dims": self.dims,
162+
"L": self.L,
163+
"k": self.k
164+
}
165+
}
148166
@dataclass(frozen=True)
149167
class L2Lsh(Base):
150168
dims: int
@@ -271,6 +289,27 @@ def with_vec(self, vec: Vec.Base):
271289
return NearestNeighborsQuery.CosineLsh(field=self.field, vec=vec, similarity=self.similarity,
272290
candidates=self.candidates)
273291

292+
@dataclass(frozen=True)
293+
class DotLsh(Base):
294+
field: str
295+
vec: Vec.Base
296+
similarity: Similarity = Similarity.Dot
297+
candidates: int = 1000
298+
299+
def to_dict(self):
300+
return {
301+
"field": self.field,
302+
"model": "lsh",
303+
"similarity": self.similarity.name.lower(),
304+
"candidates": self.candidates,
305+
"vec": self.vec.to_dict()
306+
}
307+
308+
def with_vec(self, vec: Vec.Base):
309+
return NearestNeighborsQuery.DotLsh(field=self.field, vec=vec, similarity=self.similarity,
310+
candidates=self.candidates)
311+
312+
274313
@dataclass(frozen=True)
275314
class L2Lsh(Base):
276315
field: str

client-python/elastiknn/models.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,8 @@ def _mk_mapping_query(self, query_params: dict()) -> (Mapping.Base, NearestNeigh
9191
return Mapping.DenseFloat(self._dims), NearestNeighborsQuery.Exact(field, dummy, Similarity.L2)
9292
elif self._metric == 'cosine':
9393
return Mapping.DenseFloat(self._dims), NearestNeighborsQuery.Exact(field, dummy, Similarity.Cosine)
94+
elif self._metric == 'dot':
95+
return Mapping.DenseFloat(self._dims), NearestNeighborsQuery.Exact(field, dummy, Similarity.Dot)
9496
elif self._metric == 'jaccard':
9597
return Mapping.SparseBool(self._dims), NearestNeighborsQuery.Exact(field, dummy, Similarity.Jaccard)
9698
elif self._metric == 'hamming':
@@ -103,6 +105,9 @@ def _mk_mapping_query(self, query_params: dict()) -> (Mapping.Base, NearestNeigh
103105
elif self._metric == 'cosine':
104106
return Mapping.CosineLsh(self._dims, **self._mapping_params), \
105107
NearestNeighborsQuery.CosineLsh(field, dummy, **query_params)
108+
elif self._metric == 'dot':
109+
return Mapping.DotLsh(self._dims, **self._mapping_params), \
110+
NearestNeighborsQuery.DotLsh(field, dummy, **query_params)
106111
elif self._metric == 'hamming':
107112
return Mapping.CosineLsh(self._dims, **self._mapping_params), \
108113
NearestNeighborsQuery.HammingLsh(field, dummy, **query_params)

client-python/elastiknn/utils.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,12 @@
1313
('exact', 'l1'),
1414
('exact', 'l2'),
1515
('exact', 'cosine'),
16+
('exact', 'dot'),
1617
('exact', 'hamming'),
1718
('exact', 'jaccard'),
1819
('lsh', 'l2'),
1920
('lsh', 'cosine'),
21+
('lsh', 'dot'),
2022
('lsh', 'jaccard'),
2123
('lsh', 'hamming'),
2224
('permutation_lsh', 'cosine'),

docs/_posts/2021-07-30-how-does-elastiknn-work.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,8 @@ The name is a combination of _Elastic_ and _KNN_ (K-Nearest Neighbors).
4343
The full list of features (copied from the home page) is as follows:
4444

4545
- Datatypes to efficiently store dense and sparse numerical vectors in Elasticsearch documents, including multiple vectors per document.
46-
- Exact nearest neighbor queries for five similarity functions: [L1](https://en.wikipedia.org/wiki/Taxicab_geometry), [L2](https://en.wikipedia.org/wiki/Euclidean_distance), [Cosine](https://en.wikipedia.org/wiki/Cosine_similarity), [Jaccard](https://en.wikipedia.org/wiki/Jaccard_index), and [Hamming](https://en.wikipedia.org/wiki/Hamming_distance).
47-
- Approximate queries using [Locality Sensitive Hashing](https://en.wikipedia.org/wiki/Locality-sensitive_hashing) for L2, Cosine, Jaccard, and Hamming similarity.
46+
- Exact nearest neighbor queries for six similarity functions: [L1](https://en.wikipedia.org/wiki/Taxicab_geometry), [L2](https://en.wikipedia.org/wiki/Euclidean_distance), [Cosine](https://en.wikipedia.org/wiki/Cosine_similarity), [Dot](https://en.wikipedia.org/wiki/Dot_product), [Jaccard](https://en.wikipedia.org/wiki/Jaccard_index), and [Hamming](https://en.wikipedia.org/wiki/Hamming_distance).
47+
- Approximate queries using [Locality Sensitive Hashing](https://en.wikipedia.org/wiki/Locality-sensitive_hashing) for L2, Cosine, Dot, Jaccard, and Hamming similarity.
4848
- Integration of nearest neighbor queries with standard Elasticsearch queries.
4949
- Incremental index updates: start with any number of vectors and incrementally create/update/delete more without ever re-building the entire index.
5050
- Implementation based on standard Elasticsearch and Lucene primitives, entirely in the JVM. Indexing and querying scale horizontally with Elasticsearch.
@@ -88,13 +88,13 @@ So Java is used for all the CPU-bound LSH models and Lucene abstractions, and Sc
8888

8989
Elasticsearch requires non-negative scores, with higher scores indicating higher relevance.
9090

91-
Elastiknn supports five vector similarity functions (L1, L2, Cosine, Jaccard, and Hamming).
91+
Elastiknn supports six vector similarity functions (L1, L2, Cosine, Dot, Jaccard, and Hamming).
9292
Three of these are problematic with respect to this scoring requirement.
9393

9494
Specifically, L1 and L2 are generally defined as _distance_ functions, rather than similarity functions,
9595
which means that higher relevance (i.e., lower distance) yields _lower_ scores.
9696
Cosine similarity is defined over $$[-1, 1]$$, and we can't have negative scores.
97-
97+
Dot similarity is defined over $$[-1, 1]$$ for unit-magnitude vectors, and we can't have negative scores; if the vectors have a magnitude of 1, it is equivalent to cosine similarity.
9898
To work around this, Elastiknn applies simple transformations to produce L1, L2, and Cosine _similarity_ in accordance with the Elasticsearch requirements.
9999
The exact transformations are documented [on the API page](/api/#similarity-scoring).
100100

docs/pages/api.md

Lines changed: 67 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,30 @@ PUT /my-index/_mapping
292292
}
293293
}
294294
```
295+
### Dot LSH Mapping
296+
297+
Uses the [Random Projection algorithm](https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Random_projection)
298+
to hash and store dense float vectors such that they support approximate Dot similarity queries. Equivalent to Cosine similarity if the vectors are normalized.
299+
300+
The implementation is influenced by Chapter 3 of [Mining Massive Datasets.](http://www.mmds.org/)
301+
302+
```json
303+
PUT /my-index/_mapping
304+
{
305+
"properties": {
306+
"my_vec": {
307+
"type": "elastiknn_dense_float_vector", # 1
308+
"elastiknn": {
309+
"dims": 100, # 2
310+
"model": "lsh", # 3
311+
"similarity": "dot", # 4
312+
"L": 99, # 5
313+
"k": 1 # 6
314+
}
315+
}
316+
}
317+
}
318+
```
295319

296320
|#|Description|
297321
|:--|:--|
@@ -425,7 +449,7 @@ GET /my-index/_search
425449
### Compatibility of Vector Types and Similarities
426450

427451
Jaccard and Hamming similarity only work with sparse bool vectors.
428-
Cosine,[^note-angular-cosine] L1, and L2 similarity only work with dense float vectors.
452+
Cosine,[^note-angular-cosine] Dot,[^note-dot-product] L1, and L2 similarity only work with dense float vectors.
429453
The following documentation assume this restriction is known.
430454

431455
These restrictions aren't inherent to the types and algorithms, i.e., you could in theory run cosine similarity on sparse vectors.
@@ -446,9 +470,12 @@ The exact transformations are described below.
446470
|Jaccard|N/A|0|1.0|
447471
|Hamming|N/A|0|1.0|
448472
|Cosine[^note-angular-cosine]|`cosine similarity + 1`|0|2|
473+
|Dot[^note-dot-product]|`dot similarity + 1`|0|2|
449474
|L1|`1 / (1 + l1 distance)`|0|1|
450475
|L2|`1 / (1 + l2 distance)`|0|1|
451476

477+
Dot similarity will produce negative scores if the vectors are not normalized.
478+
452479
If you're using the `elastiknn_nearest_neighbors` query with other queries, and the score values are inconvenient (e.g. huge values like 1e6), consider wrapping the query in a [Script Score Query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-script-score-query.html), where you can access and transform the `_score` value.
453480

454481
### Query Vector
@@ -621,6 +648,36 @@ GET /my-index/_search
621648
|5|Number of candidates per segment. See the section on LSH Search Strategy.|
622649
|6|Set to true to use the more-like-this heuristic to pick a subset of hashes. Generally faster but still experimental.|
623650

651+
### Dot LSH Query
652+
653+
Retrieve dense float vectors based on approximate Dot similarity.[^note-dot-product]
654+
655+
```json
656+
GET /my-index/_search
657+
{
658+
"query": {
659+
"elastiknn_nearest_neighbors": {
660+
"field": "my_vec", # 1
661+
"vec": { # 2
662+
"values": [0.1, 0.2, 0.3, ...]
663+
},
664+
"model": "lsh", # 3
665+
"similarity": "dot", # 4
666+
"candidates": 50 # 5
667+
}
668+
}
669+
}
670+
```
671+
672+
|#|Description|
673+
|:--|:--|
674+
|1|Indexed field. Must use `lsh` mapping model with `dot`[^note-dot-product] similarity.|
675+
|2|Query vector. Must be literal dense float or a pointer to an indexed dense float vector.|
676+
|3|Model name.|
677+
|4|Similarity function.|
678+
|5|Number of candidates per segment. See the section on LSH Search Strategy.|
679+
|6|Set to true to use the more-like-this heuristic to pick a subset of hashes. Generally faster but still experimental.|
680+
624681
### L1 LSH Query
625682

626683
Not yet implemented.
@@ -707,12 +764,13 @@ The similarity functions are abbreviated (J: Jaccard, H: Hamming, C: Cosine,[^no
707764

708765
#### elastiknn_dense_float_vector
709766

710-
|Model / Query |Exact |Cosine LSH |L2 LSH |Permutation LSH|
711-
|:-- |:-- |:-- |:-- |:-- |
712-
|Exact (i.e. no model specified) |✔ (C, L1, L2) |x |x |x |
713-
|Cosine LSH |✔ (C, L1, L2) ||x |x |
714-
|L2 LSH |✔ (C, L1, L2) |x ||x |
715-
|Permutation LSH |✔ (C, L1, L2) |x |x ||
767+
|Model / Query |Exact |Cosine LSH |Dot LSH|L2 LSH |Permutation LSH|
768+
|:-- |:-- |:-- |:-- |:-- |:-- |
769+
|Exact (i.e. no model specified) |✔ (C, D, L1, L2) |x |x |x |x |
770+
|Cosine LSH |✔ (C, D, L1, L2) |||x |x |
771+
|Dot LSH |✔ (C, D, L1, L2) |||x |x |
772+
|L2 LSH |✔ (C, D, L1, L2) |x |x ||x |
773+
|Permutation LSH |✔ (C, D, L1, L2) |x |x |x ||
716774

717775
### Running Nearest Neighbors Query on a Filtered Subset of Documents
718776

@@ -860,4 +918,5 @@ PUT /my-index
860918

861919
See the [create index documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-create-index.html) for more details.
862920

863-
[^note-angular-cosine]: Cosine similarity used to be (incorrectly) called "angular" similarity. All references to "angular" were renamed to "Cosine" in 7.13.3.2. You can still use "angular" in the JSON/HTTP API; it will convert to "cosine" internally.
921+
[^note-angular-cosine]: Cosine similarity used to be (incorrectly) called "angular" similarity. All references to "angular" were renamed to "Cosine" in 7.13.3.2. You can still use "angular" in the JSON/HTTP API; it will convert to "cosine" internally.
922+
[^note-dot-product]: Dot product is intended to be used with normalized vectors v, meaning that ||v|| == 1.

docs/pages/index.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ This enables users to combine traditional queries (e.g., "some product") with ve
1515
## Features
1616

1717
- Datatypes to efficiently store dense and sparse numerical vectors in Elasticsearch documents, including multiple vectors per document.
18-
- Exact nearest neighbor queries for five similarity functions: [L1](https://en.wikipedia.org/wiki/Taxicab_geometry), [L2](https://en.wikipedia.org/wiki/Euclidean_distance), [Cosine](https://en.wikipedia.org/wiki/Cosine_similarity), [Jaccard](https://en.wikipedia.org/wiki/Jaccard_index), and [Hamming](https://en.wikipedia.org/wiki/Hamming_distance).
19-
- Approximate queries using [Locality Sensitive Hashing](https://en.wikipedia.org/wiki/Locality-sensitive_hashing) for L2, Cosine, Jaccard, and Hamming similarity.
18+
- Exact nearest neighbor queries for six similarity functions: [L1](https://en.wikipedia.org/wiki/Taxicab_geometry), [L2](https://en.wikipedia.org/wiki/Euclidean_distance), [Cosine](https://en.wikipedia.org/wiki/Cosine_similarity), [Dot](https://en.wikipedia.org/wiki/Dot_product) (for normalized vectors), [Jaccard](https://en.wikipedia.org/wiki/Jaccard_index), and [Hamming](https://en.wikipedia.org/wiki/Hamming_distance).
19+
- Approximate queries using [Locality Sensitive Hashing](https://en.wikipedia.org/wiki/Locality-sensitive_hashing) for L2, Cosine, Dot, Jaccard, and Hamming similarity.
2020
- Integration of nearest neighbor queries with standard Elasticsearch queries.
2121
- Incremental index updates. Start with 1 vector or 1 million vectors and then create/update/delete documents and vectors without ever re-building the entire index.
2222
- Implementation based on standard Elasticsearch and Lucene primitives, entirely in the JVM. Indexing and querying scale horizontally with Elasticsearch.

elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/Mapping.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ object Mapping {
1515

1616
final case class CosineLsh(dims: Int, L: Int, k: Int) extends Mapping
1717

18+
final case class DotLsh(dims: Int, L: Int, k: Int) extends Mapping
19+
1820
final case class L2Lsh(dims: Int, L: Int, k: Int, w: Int) extends Mapping
1921

2022
final case class PermutationLsh(dims: Int, k: Int, repeating: Boolean) extends Mapping

elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/NearestNeighborsQuery.scala

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,14 @@ object NearestNeighborsQuery {
2929
override def similarity: Similarity = Similarity.Cosine
3030
}
3131

32+
final case class DotLsh(field: String, candidates: Int, vec: Vec = Vec.Empty()) extends ApproximateQuery {
33+
override def withVec(v: Vec): NearestNeighborsQuery = copy(vec = v)
34+
35+
override def withCandidates(candidates: Int): ApproximateQuery = copy(candidates = candidates)
36+
37+
override def similarity: Similarity = Similarity.Dot
38+
}
39+
3240
final case class HammingLsh(field: String, candidates: Int, vec: Vec = Vec.Empty()) extends ApproximateQuery {
3341
override def withVec(v: Vec): NearestNeighborsQuery = copy(vec = v)
3442

elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/Similarity.scala

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ sealed trait Similarity
55
object Similarity {
66
case object Cosine extends Similarity
77

8+
case object Dot extends Similarity
9+
810
case object Hamming extends Similarity
911

1012
case object Jaccard extends Similarity
@@ -13,5 +15,5 @@ object Similarity {
1315

1416
case object L2 extends Similarity
1517

16-
val values: Seq[Similarity] = Vector(Cosine, Jaccard, Hamming, L1, L2)
18+
val values: Seq[Similarity] = Vector(Cosine, Dot, Jaccard, Hamming, L1, L2)
1719
}

0 commit comments

Comments
 (0)