From 27c398ee4d57a2dde9a3ad2226dfd95ca91293db Mon Sep 17 00:00:00 2001 From: Joan Codina Date: Fri, 5 Apr 2024 11:43:33 +0200 Subject: [PATCH 01/11] Added dot product everywhere were cosine similarity was used --- client-python/elastiknn/api.py | 39 ++++++++++ client-python/elastiknn/models.py | 5 ++ client-python/elastiknn/utils.py | 2 + .../2021-07-30-how-does-elastiknn-work.md | 8 +- docs/pages/api.md | 75 +++++++++++++++++-- docs/pages/index.md | 4 +- .../com/klibisz/elastiknn/api/Mapping.scala | 2 + .../elastiknn/api/NearestNeighborsQuery.scala | 8 ++ .../klibisz/elastiknn/api/Similarity.scala | 2 +- .../klibisz/elastiknn/api/XContentCodec.scala | 39 ++++++++++ .../elastiknn/api/XContentCodecSuite.scala | 9 ++- .../klibisz/elastiknn/models/DotLshModel.java | 53 +++++++++++++ .../klibisz/elastiknn/models/ExactModel.java | 5 ++ .../vectors/DefaultFloatVectorOps.java | 4 + .../elastiknn/vectors/FloatVectorOps.java | 2 + .../vectors/PanamaFloatVectorOps.java | 4 + .../elastiknn/models/DotLshModelSuite.scala | 29 +++++++ .../elastiknn/mapper/VectorMapper.scala | 1 + .../models/ExactSimilarityFunction.scala | 5 ++ .../klibisz/elastiknn/models/ModelCache.scala | 2 + .../query/ElastiknnQueryBuilder.scala | 16 +++- .../models/ExactSimilarityFunctionSuite.scala | 24 ++++++ .../models/ExactSimilarityReference.scala | 4 +- 23 files changed, 319 insertions(+), 23 deletions(-) create mode 100644 elastiknn-models/src/main/java/com/klibisz/elastiknn/models/DotLshModel.java create mode 100644 elastiknn-models/src/test/scala/com/klibisz/elastiknn/models/DotLshModelSuite.scala diff --git a/client-python/elastiknn/api.py b/client-python/elastiknn/api.py index 4a5c3671b..d933779b3 100644 --- a/client-python/elastiknn/api.py +++ b/client-python/elastiknn/api.py @@ -17,6 +17,7 @@ class Similarity(Enum): L1 = 3 L2 = 4 Cosine = 5 + Dot = 6 class Vec: @@ -144,7 +145,24 @@ def to_dict(self): "k": self.k } } + + @dataclass(frozen=True) + class DotLsh(Base): + dims: int + L: int + k: int 
+ def to_dict(self): + return { + "type": "elastiknn_dense_float_vector", + "elastiknn": { + "model": "lsh", + "similarity": "dot", + "dims": self.dims, + "L": self.L, + "k": self.k + } + } @dataclass(frozen=True) class L2Lsh(Base): dims: int @@ -271,6 +289,27 @@ def with_vec(self, vec: Vec.Base): return NearestNeighborsQuery.CosineLsh(field=self.field, vec=vec, similarity=self.similarity, candidates=self.candidates) + @dataclass(frozen=True) + class DotLsh(Base): + field: str + vec: Vec.Base + similarity: Similarity = Similarity.Dot + candidates: int = 1000 + + def to_dict(self): + return { + "field": self.field, + "model": "lsh", + "similarity": self.similarity.name.lower(), + "candidates": self.candidates, + "vec": self.vec.to_dict() + } + + def with_vec(self, vec: Vec.Base): + return NearestNeighborsQuery.DotLsh(field=self.field, vec=vec, similarity=self.similarity, + candidates=self.candidates) + + @dataclass(frozen=True) class L2Lsh(Base): field: str diff --git a/client-python/elastiknn/models.py b/client-python/elastiknn/models.py index 03291076e..410b582c3 100644 --- a/client-python/elastiknn/models.py +++ b/client-python/elastiknn/models.py @@ -91,6 +91,8 @@ def _mk_mapping_query(self, query_params: dict()) -> (Mapping.Base, NearestNeigh return Mapping.DenseFloat(self._dims), NearestNeighborsQuery.Exact(field, dummy, Similarity.L2) elif self._metric == 'cosine': return Mapping.DenseFloat(self._dims), NearestNeighborsQuery.Exact(field, dummy, Similarity.Cosine) + elif self._metric == 'dot': + return Mapping.DenseFloat(self._dims), NearestNeighborsQuery.Exact(field, dummy, Similarity.Dot) elif self._metric == 'jaccard': return Mapping.SparseBool(self._dims), NearestNeighborsQuery.Exact(field, dummy, Similarity.Jaccard) elif self._metric == 'hamming': @@ -103,6 +105,9 @@ def _mk_mapping_query(self, query_params: dict()) -> (Mapping.Base, NearestNeigh elif self._metric == 'cosine': return Mapping.CosineLsh(self._dims, **self._mapping_params), \ 
NearestNeighborsQuery.CosineLsh(field, dummy, **query_params) + elif self._metric == 'dot': + return Mapping.DotLsh(self._dims, **self._mapping_params), \ + NearestNeighborsQuery.DotLsh(field, dummy, **query_params) elif self._metric == 'hamming': return Mapping.CosineLsh(self._dims, **self._mapping_params), \ NearestNeighborsQuery.HammingLsh(field, dummy, **query_params) diff --git a/client-python/elastiknn/utils.py b/client-python/elastiknn/utils.py index dbba8b32d..897569536 100644 --- a/client-python/elastiknn/utils.py +++ b/client-python/elastiknn/utils.py @@ -13,10 +13,12 @@ ('exact', 'l1'), ('exact', 'l2'), ('exact', 'cosine'), + ('exact', 'dot'), ('exact', 'hamming'), ('exact', 'jaccard'), ('lsh', 'l2'), ('lsh', 'cosine'), + ('lsh', 'dot'), ('lsh', 'jaccard'), ('lsh', 'hamming'), ('permutation_lsh', 'cosine'), diff --git a/docs/_posts/2021-07-30-how-does-elastiknn-work.md b/docs/_posts/2021-07-30-how-does-elastiknn-work.md index 1192aa7f2..8ffe37d74 100644 --- a/docs/_posts/2021-07-30-how-does-elastiknn-work.md +++ b/docs/_posts/2021-07-30-how-does-elastiknn-work.md @@ -43,8 +43,8 @@ The name is a combination of _Elastic_ and _KNN_ (K-Nearest Neighbors). The full list of features (copied from the home page) is as follows: - Datatypes to efficiently store dense and sparse numerical vectors in Elasticsearch documents, including multiple vectors per document. -- Exact nearest neighbor queries for five similarity functions: [L1](https://en.wikipedia.org/wiki/Taxicab_geometry), [L2](https://en.wikipedia.org/wiki/Euclidean_distance), [Cosine](https://en.wikipedia.org/wiki/Cosine_similarity), [Jaccard](https://en.wikipedia.org/wiki/Jaccard_index), and [Hamming](https://en.wikipedia.org/wiki/Hamming_distance). -- Approximate queries using [Locality Sensitive Hashing](https://en.wikipedia.org/wiki/Locality-sensitive_hashing) for L2, Cosine, Jaccard, and Hamming similarity. 
+- Exact nearest neighbor queries for five similarity functions: [L1](https://en.wikipedia.org/wiki/Taxicab_geometry), [L2](https://en.wikipedia.org/wiki/Euclidean_distance), [Cosine](https://en.wikipedia.org/wiki/Cosine_similarity), [Dot](https://en.wikipedia.org/wiki/Dot_product), [Jaccard](https://en.wikipedia.org/wiki/Jaccard_index), and [Hamming](https://en.wikipedia.org/wiki/Hamming_distance). +- Approximate queries using [Locality Sensitive Hashing](https://en.wikipedia.org/wiki/Locality-sensitive_hashing) for L2, Cosine, Dot, Jaccard, and Hamming similarity. - Integration of nearest neighbor queries with standard Elasticsearch queries. - Incremental index updates: start with any number of vectors and incrementally create/update/delete more without ever re-building the entire index. - Implementation based on standard Elasticsearch and Lucene primitives, entirely in the JVM. Indexing and querying scale horizontally with Elasticsearch. @@ -88,13 +88,13 @@ So Java is used for all the CPU-bound LSH models and Lucene abstractions, and Sc Elasticsearch requires non-negative scores, with higher scores indicating higher relevance. -Elastiknn supports five vector similarity functions (L1, L2, Cosine, Jaccard, and Hamming). +Elastiknn supports five vector similarity functions (L1, L2, Cosine,Dot, Jaccard, and Hamming). Three of these are problematic with respect to this scoring requirement. Specifically, L1 and L2 are generally defined as _distance_ functions, rather than similarity functions, which means that higher relevance (i.e., lower distance) yields _lower_ scores. Cosine similarity is defined over $$[-1, 1]$$, and we can't have negative scores. - +Dot similarity is defined over $$[-1, 1]$$, and we can't have negative scores, if vectors have a magnitude of 1, then it's equivalent to cosine similarity. To work around this, Elastiknn applies simple transformations to produce L1, L2, and Cosine _similarity_ in accordance with the Elasticsearch requirements. 
The exact transformations are documented [on the API page](/api/#similarity-scoring). diff --git a/docs/pages/api.md b/docs/pages/api.md index c102c7fd2..3c7e1369b 100644 --- a/docs/pages/api.md +++ b/docs/pages/api.md @@ -292,6 +292,30 @@ PUT /my-index/_mapping } } ``` +### Dot LSH Mapping + +Uses the [Random Projection algorithm](https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Random_projection) +to hash and store dense float vectors such that they support approximate Dot similarity queries. Equivalent to Cosine similarity if the vectors are normalized. + +The implementation is influenced by Chapter 3 of [Mining Massive Datasets.](http://www.mmds.org/) + +```json +PUT /my-index/_mapping +{ + "properties": { + "my_vec": { + "type": "elastiknn_dense_float_vector", # 1 + "elastiknn": { + "dims": 100, # 2 + "model": "lsh", # 3 + "similarity": "dot", # 4 + "L": 99, # 5 + "k": 1 # 6 + } + } + } +} +``` |#|Description| |:--|:--| @@ -425,7 +449,7 @@ GET /my-index/_search ### Compatibility of Vector Types and Similarities Jaccard and Hamming similarity only work with sparse bool vectors. -Cosine,[^note-angular-cosine] L1, and L2 similarity only work with dense float vectors. +Cosine,[^note-angular-cosine] Dot,[^note-dot-product] L1, and L2 similarity only work with dense float vectors. The following documentation assume this restriction is known. These restrictions aren't inherent to the types and algorithms, i.e., you could in theory run cosine similarity on sparse vectors. @@ -446,9 +470,12 @@ The exact transformations are described below. |Jaccard|N/A|0|1.0| |Hamming|N/A|0|1.0| |Cosine[^note-angular-cosine]|`cosine similarity + 1`|0|2| +|Dot[^note-dot-product]|`dot similarity + 1`|0|2| |L1|`1 / (1 + l1 distance)`|0|1| |L2|`1 / (1 + l2 distance)`|0|1| +Dot similarity will produce negative scores if the vectors are not normalized. + If you're using the `elastiknn_nearest_neighbors` query with other queries, and the score values are inconvenient (e.g.
huge values like 1e6), consider wrapping the query in a [Script Score Query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-script-score-query.html), where you can access and transform the `_score` value. ### Query Vector @@ -621,6 +648,36 @@ GET /my-index/_search |5|Number of candidates per segment. See the section on LSH Search Strategy.| |6|Set to true to use the more-like-this heuristic to pick a subset of hashes. Generally faster but still experimental.| +### Dot LSH Query + +Retrieve dense float vectors based on approximate Dot similarity.[^note-dot-product] + +```json +GET /my-index/_search +{ + "query": { + "elastiknn_nearest_neighbors": { + "field": "my_vec", # 1 + "vec": { # 2 + "values": [0.1, 0.2, 0.3, ...] + }, + "model": "lsh", # 3 + "similarity": "dot", # 4 + "candidates": 50 # 5 + } + } +} +``` + +|#|Description| +|:--|:--| +|1|Indexed field. Must use `lsh` mapping model with `dot`[^note-dot-product] similarity.| +|2|Query vector. Must be literal dense float or a pointer to an indexed dense float vector.| +|3|Model name.| +|4|Similarity function.| +|5|Number of candidates per segment. See the section on LSH Search Strategy.| +|6|Set to true to use the more-like-this heuristic to pick a subset of hashes. Generally faster but still experimental.| + ### L1 LSH Query Not yet implemented. @@ -707,12 +764,13 @@ The similarity functions are abbreviated (J: Jaccard, H: Hamming, C: Cosine,[^no #### elastiknn_dense_float_vector -|Model / Query |Exact |Cosine LSH |L2 LSH |Permutation LSH| -|:-- |:-- |:-- |:-- |:-- | -|Exact (i.e. no model specified) |✔ (C, L1, L2) |x |x |x | -|Cosine LSH |✔ (C, L1, L2) |✔ |x |x | -|L2 LSH |✔ (C, L1, L2) |x |✔ |x | -|Permutation LSH |✔ (C, L1, L2) |x |x |✔ | +|Model / Query |Exact |Cosine LSH |Dot LSH|L2 LSH |Permutation LSH| +|:-- |:-- |:-- |:-- |:-- |:-- | +|Exact (i.e.
no model specified) |✔ (C, D, L1, L2) |x |x |x |x | +|Cosine LSH |✔ (C, D, L1, L2) |✔ |✔ |x |x | +|Dot LSH |✔ (C, D, L1, L2) |✔ |✔ |x |x | +|L2 LSH |✔ (C, D, L1, L2) |x |x |✔ |x | +|Permutation LSH |✔ (C, D, L1, L2) |x |x |x |✔ | ### Running Nearest Neighbors Query on a Filtered Subset of Documents @@ -860,4 +918,5 @@ PUT /my-index See the [create index documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-create-index.html) for more details. -[^note-angular-cosine]: Cosine similarity used to be (incorrectly) called "angular" similarity. All references to "angular" were renamed to "Cosine" in 7.13.3.2. You can still use "angular" in the JSON/HTTP API; it will convert to "cosine" internally. \ No newline at end of file +[^note-angular-cosine]: Cosine similarity used to be (incorrectly) called "angular" similarity. All references to "angular" were renamed to "Cosine" in 7.13.3.2. You can still use "angular" in the JSON/HTTP API; it will convert to "cosine" internally. +[^note-dot-product]: Dot product is thought to be used with normalized vectors V, meaning that ||v||==1. \ No newline at end of file diff --git a/docs/pages/index.md b/docs/pages/index.md index 44a049161..dd755fd63 100644 --- a/docs/pages/index.md +++ b/docs/pages/index.md @@ -15,8 +15,8 @@ This enables users to combine traditional queries (e.g., "some product") with ve ## Features - Datatypes to efficiently store dense and sparse numerical vectors in Elasticsearch documents, including multiple vectors per document. -- Exact nearest neighbor queries for five similarity functions: [L1](https://en.wikipedia.org/wiki/Taxicab_geometry), [L2](https://en.wikipedia.org/wiki/Euclidean_distance), [Cosine](https://en.wikipedia.org/wiki/Cosine_similarity), [Jaccard](https://en.wikipedia.org/wiki/Jaccard_index), and [Hamming](https://en.wikipedia.org/wiki/Hamming_distance). 
-- Approximate queries using [Locality Sensitive Hashing](https://en.wikipedia.org/wiki/Locality-sensitive_hashing) for L2, Cosine, Jaccard, and Hamming similarity. +- Exact nearest neighbor queries for six similarity functions: [L1](https://en.wikipedia.org/wiki/Taxicab_geometry), [L2](https://en.wikipedia.org/wiki/Euclidean_distance), [Cosine](https://en.wikipedia.org/wiki/Cosine_similarity), [Dot](https://en.wikipedia.org/wiki/Dot_product) (for normalized vectors), [Jaccard](https://en.wikipedia.org/wiki/Jaccard_index), and [Hamming](https://en.wikipedia.org/wiki/Hamming_distance). +- Approximate queries using [Locality Sensitive Hashing](https://en.wikipedia.org/wiki/Locality-sensitive_hashing) for L2, Cosine, Dot, Jaccard, and Hamming similarity. - Integration of nearest neighbor queries with standard Elasticsearch queries. - Incremental index updates. Start with 1 vector or 1 million vectors and then create/update/delete documents and vectors without ever re-building the entire index. - Implementation based on standard Elasticsearch and Lucene primitives, entirely in the JVM. Indexing and querying scale horizontally with Elasticsearch.
diff --git a/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/Mapping.scala b/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/Mapping.scala index 32752bfde..695253371 100644 --- a/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/Mapping.scala +++ b/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/Mapping.scala @@ -15,6 +15,8 @@ object Mapping { final case class CosineLsh(dims: Int, L: Int, k: Int) extends Mapping + final case class DotLsh(dims: Int, L: Int, k: Int) extends Mapping + final case class L2Lsh(dims: Int, L: Int, k: Int, w: Int) extends Mapping final case class PermutationLsh(dims: Int, k: Int, repeating: Boolean) extends Mapping diff --git a/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/NearestNeighborsQuery.scala b/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/NearestNeighborsQuery.scala index 0157be03c..f6a76bbbb 100644 --- a/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/NearestNeighborsQuery.scala +++ b/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/NearestNeighborsQuery.scala @@ -29,6 +29,14 @@ object NearestNeighborsQuery { override def similarity: Similarity = Similarity.Cosine } + final case class DotLsh(field: String, candidates: Int, vec: Vec = Vec.Empty()) extends ApproximateQuery { + override def withVec(v: Vec): NearestNeighborsQuery = copy(vec = v) + + override def withCandidates(candidates: Int): ApproximateQuery = copy(candidates = candidates) + + override def similarity: Similarity = Similarity.Dot + } + final case class HammingLsh(field: String, candidates: Int, vec: Vec = Vec.Empty()) extends ApproximateQuery { override def withVec(v: Vec): NearestNeighborsQuery = copy(vec = v) diff --git a/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/Similarity.scala b/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/Similarity.scala index b8c2c9440..8df6a9067 100644 --- a/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/Similarity.scala +++ 
b/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/Similarity.scala @@ -1,5 +1,5 @@ package com.klibisz.elastiknn.api enum Similarity { - case Cosine, Hamming, Jaccard, L1, L2 + case Cosine, Dot, Hamming, Jaccard, L1, L2 } diff --git a/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/XContentCodec.scala b/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/XContentCodec.scala index 883a2df66..69a1d4070 100644 --- a/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/XContentCodec.scala +++ b/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/XContentCodec.scala @@ -89,6 +89,7 @@ object XContentCodec { case Similarity.L1 => b.value(Names.L1) case Similarity.L2 => b.value(Names.L2) case Similarity.Cosine => b.value(Names.COSINE) + case Similarity.Dot => b.value(Names.DOT) } () } @@ -206,6 +207,20 @@ object XContentCodec { } } + implicit val dotLshMapping: MappingEncoder[Mapping.DotLsh] = new MappingEncoder[Mapping.DotLsh] { + override protected def vectorType: String = Names.EKNN_DENSE_FLOAT_VECTOR + override def encodeElastiknnObject(t: Mapping.DotLsh, b: XContentBuilder): Unit = { + b.startObject(Names.ELASTIKNN) + b.field(Names.LSH_L, t.L) + b.field(Names.DIMS, t.dims) + b.field(Names.LSH_K, t.k) + b.field(Names.MODEL, Names.LSH) + b.field(Names.SIMILARITY, Names.DOT) + b.endObject() + () + } + } + implicit val l2LshMapping: MappingEncoder[Mapping.L2Lsh] = new MappingEncoder[Mapping.L2Lsh] { override protected def vectorType: String = Names.EKNN_DENSE_FLOAT_VECTOR override def encodeElastiknnObject(t: Mapping.L2Lsh, b: XContentBuilder): Unit = { @@ -242,6 +257,7 @@ object XContentCodec { case m: Mapping.HammingLsh => hammingLshMapping.encodeElastiknnObject(m, b) case m: Mapping.DenseFloat => denseFloatMapping.encodeElastiknnObject(m, b) case m: Mapping.CosineLsh => cosineLshMapping.encodeElastiknnObject(m, b) + case m: Mapping.DotLsh => dotLshMapping.encodeElastiknnObject(m, b) case m: Mapping.L2Lsh => 
l2LshMapping.encodeElastiknnObject(m, b) case m: Mapping.PermutationLsh => permutationLshMapping.encodeElastiknnObject(m, b) } @@ -252,6 +268,7 @@ object XContentCodec { case m: Mapping.HammingLsh => hammingLshMapping.encodeUnsafe(m, b) case m: Mapping.DenseFloat => denseFloatMapping.encodeUnsafe(m, b) case m: Mapping.CosineLsh => cosineLshMapping.encodeUnsafe(m, b) + case m: Mapping.DotLsh => dotLshMapping.encodeUnsafe(m, b) case m: Mapping.L2Lsh => l2LshMapping.encodeUnsafe(m, b) case m: Mapping.PermutationLsh => permutationLshMapping.encodeUnsafe(m, b) } @@ -316,6 +333,21 @@ object XContentCodec { } } + implicit val dotLshQuery: Encoder[NearestNeighborsQuery.DotLsh] = new Encoder[NearestNeighborsQuery.DotLsh] { + override def encodeUnsafe(t: NearestNeighborsQuery.DotLsh, b: XContentBuilder): Unit = { + b.startObject() + b.field(Names.CANDIDATES, t.candidates) + b.field(Names.FIELD, t.field) + b.field(Names.MODEL, Names.LSH) + b.field(Names.SIMILARITY) + similarity.encodeUnsafe(t.similarity, b) + b.field(Names.VEC) + vec.encodeUnsafe(t.vec, b) + b.endObject() + () + } + } + implicit val l2LshQuery: Encoder[NearestNeighborsQuery.L2Lsh] = new Encoder[NearestNeighborsQuery.L2Lsh] { override def encodeUnsafe(t: NearestNeighborsQuery.L2Lsh, b: XContentBuilder): Unit = { b.startObject() @@ -354,6 +386,7 @@ object XContentCodec { case q: NearestNeighborsQuery.JaccardLsh => jaccardLshQuery.encodeUnsafe(q, b) case q: NearestNeighborsQuery.HammingLsh => hammingLshQuery.encodeUnsafe(q, b) case q: NearestNeighborsQuery.CosineLsh => cosineLshQuery.encodeUnsafe(q, b) + case q: NearestNeighborsQuery.DotLsh => dotLshQuery.encodeUnsafe(q, b) case q: NearestNeighborsQuery.L2Lsh => l2LshQuery.encodeUnsafe(q, b) case q: NearestNeighborsQuery.PermutationLsh => permutationLshQuery.encodeUnsafe(q, b) } @@ -441,6 +474,7 @@ object XContentCodec { case Names.L1 => Similarity.L1 case Names.L2 => Similarity.L2 case Names.COSINE => Similarity.Cosine + case Names.DOT => Similarity.Dot case 
Names.ANGULAR => Similarity.Cosine case _ => throw new XContentParseException(unexpectedValue(s1, Names.SIMILARITIES)) } @@ -603,6 +637,8 @@ object XContentCodec { Mapping.L2Lsh(dims, l, k, w) case (Some(Names.EKNN_DENSE_FLOAT_VECTOR), Some(Names.LSH), Some(dims), Some(Similarity.Cosine), Some(l), Some(k), _, _) => Mapping.CosineLsh(dims, l, k) + case (Some(Names.EKNN_DENSE_FLOAT_VECTOR), Some(Names.LSH), Some(dims), Some(Similarity.Dot), Some(l), Some(k), _, _) => + Mapping.DotLsh(dims, l, k) case (Some(Names.EKNN_DENSE_FLOAT_VECTOR), Some(Names.PERMUTATION_LSH), Some(dims), _, _, Some(k), _, Some(repeating)) => Mapping.PermutationLsh(dims, k, repeating) case _ => throw new XContentParseException(unableToConstruct("mapping")) @@ -645,6 +681,8 @@ object XContentCodec { NearestNeighborsQuery.Exact(field, similarity, v) case (Some(candidates), Some(field), Some(Names.LSH), _, Some(Similarity.Cosine), Some(v)) => NearestNeighborsQuery.CosineLsh(field, candidates, v) + case (Some(candidates), Some(field), Some(Names.LSH), _, Some(Similarity.Dot), Some(v)) => + NearestNeighborsQuery.DotLsh(field, candidates, v) case (Some(candidates), Some(field), Some(Names.LSH), _, Some(Similarity.Hamming), Some(v)) => NearestNeighborsQuery.HammingLsh(field, candidates, v) case (Some(candidates), Some(field), Some(Names.LSH), _, Some(Similarity.Jaccard), Some(v)) => @@ -662,6 +700,7 @@ object XContentCodec { val ANGULAR = "angular" val CANDIDATES = "candidates" val COSINE = "cosine" + val DOT = "dot" val DIMS = "dims" val ELASTIKNN = "elastiknn" val EKNN_DENSE_FLOAT_VECTOR = s"${ELASTIKNN_NAME}_dense_float_vector" diff --git a/elastiknn-api4s/src/test/scala/com/klibisz/elastiknn/api/XContentCodecSuite.scala b/elastiknn-api4s/src/test/scala/com/klibisz/elastiknn/api/XContentCodecSuite.scala index 7f15375b6..177abcfa7 100644 --- a/elastiknn-api4s/src/test/scala/com/klibisz/elastiknn/api/XContentCodecSuite.scala +++ 
b/elastiknn-api4s/src/test/scala/com/klibisz/elastiknn/api/XContentCodecSuite.scala @@ -110,13 +110,16 @@ class XContentCodecSuite extends AnyFreeSpec with Matchers { ("L2", Similarity.L2), ("cosine", Similarity.Cosine), ("Cosine", Similarity.Cosine), - ("COSINE", Similarity.Cosine) + ("COSINE", Similarity.Cosine), + ("dot", Similarity.Dot), + ("Dot", Similarity.Dot), + ("DOT", Similarity.Dot) ) } roundtrip[Similarity](Json.fromString(str.toLowerCase), sim) } "errors" in { val ex1 = intercept[XContentParseException](decodeUnsafeFromString[Similarity]("\"wrong\"")) - ex1.getMessage shouldBe "Expected token to be one of [cosine,hamming,jaccard,l1,l2] but found [wrong]" + ex1.getMessage shouldBe "Expected token to be one of [cosine,dot,hamming,jaccard,l1,l2] but found [wrong]" val ex2 = intercept[XContentParseException](decodeUnsafeFromString[Similarity]("99")) ex2.getMessage shouldBe "Expected token to be one of [VALUE_STRING] but found [VALUE_NUMBER]" } @@ -326,7 +329,7 @@ class XContentCodecSuite extends AnyFreeSpec with Matchers { | } |} |""".stripMargin)) - ex2.getMessage shouldBe "Expected token to be one of [cosine,hamming,jaccard,l1,l2] but found [jacard]" + ex2.getMessage shouldBe "Expected token to be one of [cosine,dot,hamming,jaccard,l1,l2] but found [jacard]" } } "HammingLsh" - { diff --git a/elastiknn-models/src/main/java/com/klibisz/elastiknn/models/DotLshModel.java b/elastiknn-models/src/main/java/com/klibisz/elastiknn/models/DotLshModel.java new file mode 100644 index 000000000..b30898d1d --- /dev/null +++ b/elastiknn-models/src/main/java/com/klibisz/elastiknn/models/DotLshModel.java @@ -0,0 +1,53 @@ +package com.klibisz.elastiknn.models; + +import com.klibisz.elastiknn.storage.BitBuffer; +import com.klibisz.elastiknn.vectors.FloatVectorOps; + +import static com.klibisz.elastiknn.storage.ByteBufferSerialization.writeInt; + +import java.util.Random; + +public class DotLshModel implements HashingModel.DenseFloat { + + private final int L; + private
final int k; + private final float[][] planes; + + private final FloatVectorOps vectorOps; + + /** + * Locality sensitive hashing model for Dot similarity. + * Uses the random hyperplanes method described in Mining Massive Datasets chapter 3. + * @param dims length of the vectors hashed by this model + * @param L number of hash tables + * @param k number of hash functions concatenated to form a hash for each table + * @param rng random number generator used to instantiate model parameters + */ + public DotLshModel(int dims, int L, int k, Random rng, FloatVectorOps vectorOps) { + this.L = L; + this.k = k; + this.planes = new float[L * k][dims]; + this.vectorOps = vectorOps; + for (int i = 0; i < this.planes.length; i++) { + for (int j = 0; j < dims; j++) { + this.planes[i][j] = (float) rng.nextGaussian(); + } + } + } + + @Override + public HashAndFreq[] hash(float[] values) { + HashAndFreq[] hashes = new HashAndFreq[L]; + for (int ixL = 0; ixL < L; ixL++) { + BitBuffer.IntBuffer buf = new BitBuffer.IntBuffer(writeInt(ixL)); + for (int ixk = 0; ixk < k; ixk++) { + double dot = vectorOps.dotProduct(planes[ixL * k + ixk], values); + if (dot > 0) buf.putOne(); + else buf.putZero(); + } + hashes[ixL] = HashAndFreq.once(buf.toByteArray()); + } + return hashes; + } + +} diff --git a/elastiknn-models/src/main/java/com/klibisz/elastiknn/models/ExactModel.java b/elastiknn-models/src/main/java/com/klibisz/elastiknn/models/ExactModel.java index fb23f7d2d..39cee7d9a 100644 --- a/elastiknn-models/src/main/java/com/klibisz/elastiknn/models/ExactModel.java +++ b/elastiknn-models/src/main/java/com/klibisz/elastiknn/models/ExactModel.java @@ -38,4 +38,9 @@ public static double l1Similarity(FloatVectorOps floatVectorOps, float[] v1, flo public static double cosineSimilarity(FloatVectorOps floatVectorOps, float[] v1, float[] v2) { return 1 + floatVectorOps.cosineSimilarity(v1, v2); } + + @ForceInline + public static double dotSimilarity(FloatVectorOps floatVectorOps, float[] v1, 
float[] v2) { + return 1 + floatVectorOps.dotSimilarity(v1, v2); + } } diff --git a/elastiknn-models/src/main/java/com/klibisz/elastiknn/vectors/DefaultFloatVectorOps.java b/elastiknn-models/src/main/java/com/klibisz/elastiknn/vectors/DefaultFloatVectorOps.java index 95809a482..93e5362e5 100644 --- a/elastiknn-models/src/main/java/com/klibisz/elastiknn/vectors/DefaultFloatVectorOps.java +++ b/elastiknn-models/src/main/java/com/klibisz/elastiknn/vectors/DefaultFloatVectorOps.java @@ -19,6 +19,10 @@ public double cosineSimilarity(float[] v1, float[] v2) { else return -1; } + public double dotSimilarity(float[] v1, float[] v2) { + return dotProduct(v1, v2); + } + public double dotProduct(float[] v1, float[] v2) { float dotProd = 0f; for (int i = 0; i < v1.length; i++) dotProd += v1[i] * v2[i]; diff --git a/elastiknn-models/src/main/java/com/klibisz/elastiknn/vectors/FloatVectorOps.java b/elastiknn-models/src/main/java/com/klibisz/elastiknn/vectors/FloatVectorOps.java index 485bca4f6..ccda505c8 100644 --- a/elastiknn-models/src/main/java/com/klibisz/elastiknn/vectors/FloatVectorOps.java +++ b/elastiknn-models/src/main/java/com/klibisz/elastiknn/vectors/FloatVectorOps.java @@ -9,4 +9,6 @@ public interface FloatVectorOps { double l1Distance(float[] v1, float[] v2); double cosineSimilarity(float[] v1, float[] v2); + + double dotSimilarity(float[] v1, float[] v2); } diff --git a/elastiknn-models/src/main/java/com/klibisz/elastiknn/vectors/PanamaFloatVectorOps.java b/elastiknn-models/src/main/java/com/klibisz/elastiknn/vectors/PanamaFloatVectorOps.java index da065283b..98c36362f 100644 --- a/elastiknn-models/src/main/java/com/klibisz/elastiknn/vectors/PanamaFloatVectorOps.java +++ b/elastiknn-models/src/main/java/com/klibisz/elastiknn/vectors/PanamaFloatVectorOps.java @@ -49,6 +49,10 @@ public double cosineSimilarity(float[] v1, float[] v2) { else return -1; } + public double dotSimilarity(float[] v1, float[] v2) { + return dotProduct(v1, v2); + } + public double 
dotProduct(float[] v1, float[] v2) { int i = 0; double dotProd = 0d; diff --git a/elastiknn-models/src/test/scala/com/klibisz/elastiknn/models/DotLshModelSuite.scala b/elastiknn-models/src/test/scala/com/klibisz/elastiknn/models/DotLshModelSuite.scala new file mode 100644 index 000000000..2bb4cae59 --- /dev/null +++ b/elastiknn-models/src/test/scala/com/klibisz/elastiknn/models/DotLshModelSuite.scala @@ -0,0 +1,29 @@ +package com.klibisz.elastiknn.models + +import com.klibisz.elastiknn.api.Vec +import com.klibisz.elastiknn.vectors.PanamaFloatVectorOps +import org.scalatest.funsuite.AnyFunSuite +import org.scalatest.matchers.should.Matchers + +import scala.util.Random + +class DotLshModelSuite extends AnyFunSuite with Matchers { + + test("model is dependent of vector magnitude but hashing should not") { + implicit val rng: Random = new Random(0) + val dims = 10 + for { + l <- 1 to 100 by 10 + k <- 1 to 5 + isUnit <- Seq(true, false) + } { + val mlsh = new DotLshModel(dims, l, k, new java.util.Random(0), new PanamaFloatVectorOps) + val vec = Vec.DenseFloat.random(dims, unit = isUnit) + val scaled = (1 to 10).map(m => vec.copy(vec.values.map(_ * m))) + val hashed = scaled.map(v => mlsh.hash(v.values).toList) + scaled.distinct.length shouldBe 10 + hashed.distinct.length shouldBe 1 + } + } + +} diff --git a/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/mapper/VectorMapper.scala b/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/mapper/VectorMapper.scala index 742112615..ea522549b 100644 --- a/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/mapper/VectorMapper.scala +++ b/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/mapper/VectorMapper.scala @@ -53,6 +53,7 @@ object VectorMapper { mapping match { case Mapping.DenseFloat(_) => Try(Seq(ExactQuery.index(field, vec))) case m: Mapping.CosineLsh => Try(HashingQuery.index(field, luceneFieldType, vec, modelCache(m).hash(vec.values))) + case m: Mapping.DotLsh => Try(HashingQuery.index(field, 
luceneFieldType, vec, modelCache(m).hash(vec.values))) case m: Mapping.L2Lsh => Try(HashingQuery.index(field, luceneFieldType, vec, modelCache(m).hash(vec.values))) case m: Mapping.PermutationLsh => Try(HashingQuery.index(field, luceneFieldType, vec, modelCache(m).hash(vec.values))) case _ => Failure(incompatible(mapping, vec)) diff --git a/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/models/ExactSimilarityFunction.scala b/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/models/ExactSimilarityFunction.scala index 269c8012b..fe14af868 100644 --- a/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/models/ExactSimilarityFunction.scala +++ b/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/models/ExactSimilarityFunction.scala @@ -34,4 +34,9 @@ object ExactSimilarityFunction { override def apply(v1: Vec.DenseFloat, v2: StoredVec.DenseFloat): Double = ExactModel.cosineSimilarity(floatVectorOps, v1.values, v2.values) } + final class Dot(floatVectorOps: FloatVectorOps) extends ExactSimilarityFunction[Vec.DenseFloat, StoredVec.DenseFloat] { + override def maxScore: Float = 2f + override def apply(v1: Vec.DenseFloat, v2: StoredVec.DenseFloat): Double = + ExactModel.dotSimilarity(floatVectorOps, v1.values, v2.values) + } } diff --git a/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/models/ModelCache.scala b/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/models/ModelCache.scala index 7d8882c54..41c7b7146 100644 --- a/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/models/ModelCache.scala +++ b/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/models/ModelCache.scala @@ -15,12 +15,14 @@ final class ModelCache(floatVectorOps: FloatVectorOps) { }) private val cosine = cache((m: Mapping.CosineLsh) => new CosineLshModel(m.dims, m.L, m.k, new Random(0), floatVectorOps)) + private val dot = cache((m: Mapping.DotLsh) => new DotLshModel(m.dims, m.L, m.k, new Random(0), floatVectorOps)) private val jaccard = cache((m: Mapping.JaccardLsh) => 
new JaccardLshModel(m.L, m.k, new Random(0))) private val hamming = cache((m: Mapping.HammingLsh) => new HammingLshModel(m.dims, m.L, m.k, new Random(0))) private val l2 = cache((m: Mapping.L2Lsh) => new L2LshModel(m.dims, m.L, m.k, m.w, new Random(0), floatVectorOps)) private val permutation = cache((m: Mapping.PermutationLsh) => new PermutationLshModel(m.k, m.repeating)) def apply(m: Mapping.CosineLsh): CosineLshModel = cosine.get(m) + def apply(m: Mapping.DotLsh): DotLshModel = dot.get(m) def apply(m: Mapping.JaccardLsh): JaccardLshModel = jaccard.get(m) def apply(m: Mapping.HammingLsh): HammingLshModel = hamming.get(m) def apply(m: Mapping.L2Lsh): L2LshModel = l2.get(m) diff --git a/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/query/ElastiknnQueryBuilder.scala b/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/query/ElastiknnQueryBuilder.scala index ab100a684..9c488e189 100644 --- a/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/query/ElastiknnQueryBuilder.scala +++ b/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/query/ElastiknnQueryBuilder.scala @@ -51,21 +51,26 @@ final class ElastiknnQueryBuilder(floatVectorOps: FloatVectorOps, modelCache: Mo case ( Exact(f, Similarity.L1, v: Vec.DenseFloat), - _: Mapping.DenseFloat | _: Mapping.CosineLsh | _: Mapping.L2Lsh | _: Mapping.PermutationLsh + _: Mapping.DenseFloat | _: Mapping.CosineLsh | _: Mapping.DotLsh |_: Mapping.L2Lsh | _: Mapping.PermutationLsh ) => new ExactQuery(f, v, l1) case ( Exact(f, Similarity.L2, v: Vec.DenseFloat), - _: Mapping.DenseFloat | _: Mapping.CosineLsh | _: Mapping.L2Lsh | _: Mapping.PermutationLsh + _: Mapping.DenseFloat | _: Mapping.CosineLsh | _: Mapping.DotLsh | _: Mapping.L2Lsh | _: Mapping.PermutationLsh ) => new ExactQuery(f, v, l2) case ( Exact(f, Similarity.Cosine, v: Vec.DenseFloat), - _: Mapping.DenseFloat | _: Mapping.CosineLsh | _: Mapping.L2Lsh | _: Mapping.PermutationLsh + _: Mapping.DenseFloat | _: Mapping.CosineLsh | _: Mapping.DotLsh | _: 
Mapping.L2Lsh | _: Mapping.PermutationLsh ) => new ExactQuery(f, v, cosine) + case ( + Exact(f, Similarity.Dot, v: Vec.DenseFloat), + _: Mapping.DenseFloat | _: Mapping.CosineLsh | _: Mapping.DotLsh | _: Mapping.L2Lsh | _: Mapping.PermutationLsh + ) => + new ExactQuery(f, v, dot) case (JaccardLsh(f, candidates, v: Vec.SparseBool), m: Mapping.JaccardLsh) => new HashingQuery(f, v, candidates, modelCache(m).hash(v.trueIndices, v.totalIndices), ESF.Jaccard) @@ -75,7 +80,10 @@ final class ElastiknnQueryBuilder(floatVectorOps: FloatVectorOps, modelCache: Mo case (CosineLsh(f, candidates, v: Vec.DenseFloat), m: Mapping.CosineLsh) => new HashingQuery(f, v, candidates, modelCache(m).hash(v.values), cosine) - + + case (DotLsh(f, candidates, v: Vec.DenseFloat), m: Mapping.DotLsh) => + new HashingQuery(f, v, candidates, modelCache(m).hash(v.values), dot) + case (L2Lsh(f, candidates, probes, v: Vec.DenseFloat), m: Mapping.L2Lsh) => new HashingQuery(f, v, candidates, modelCache(m).hash(v.values, probes), l2) diff --git a/elastiknn-plugin/src/test/scala/com/klibisz/elastiknn/models/ExactSimilarityFunctionSuite.scala b/elastiknn-plugin/src/test/scala/com/klibisz/elastiknn/models/ExactSimilarityFunctionSuite.scala index ee813ddc1..96a205d63 100644 --- a/elastiknn-plugin/src/test/scala/com/klibisz/elastiknn/models/ExactSimilarityFunctionSuite.scala +++ b/elastiknn-plugin/src/test/scala/com/klibisz/elastiknn/models/ExactSimilarityFunctionSuite.scala @@ -93,7 +93,31 @@ class ExactSimilarityFunctionSuite extends AnyFunSpec with Matchers { cosine(v2, v2) shouldBe 2d } } + + describe("Dot Similarity") { + val dot = new ExactSimilarityFunction.Dot(new PanamaFloatVectorOps) + + it("matches reference") { + for (_ <- 0 until reps) { + val len = rng.nextInt(4096) + 10 + val v1 = Vec.DenseFloat.random(len) + val v2 = Vec.DenseFloat.random(len) + dot(v1, v2) shouldBe (ExactSimilarityReference.Dot(v1, v2) +- tol) + } + } + + it("handles identity") { + val v1 = Vec.DenseFloat.random(199) + 
dot(v1, v1) shouldBe (2d +- tol)
+      }
+
+      it("handles all zeros") {
+        val v1 = Vec.DenseFloat.random(199)
+        val v2 = Vec.DenseFloat(v1.values.map(_ * 0))
+        dot(v2, v2) shouldBe 1d
+      }
+    }
 
   describe("Jaccard Similarity") {
     it("matches reference") {
diff --git a/elastiknn-plugin/src/test/scala/com/klibisz/elastiknn/models/ExactSimilarityReference.scala b/elastiknn-plugin/src/test/scala/com/klibisz/elastiknn/models/ExactSimilarityReference.scala
index 1c8724de7..2b302b894 100644
--- a/elastiknn-plugin/src/test/scala/com/klibisz/elastiknn/models/ExactSimilarityReference.scala
+++ b/elastiknn-plugin/src/test/scala/com/klibisz/elastiknn/models/ExactSimilarityReference.scala
@@ -19,7 +19,9 @@ object ExactSimilarityReference {
   val Cosine: (Vec.DenseFloat, Vec.DenseFloat) => Double = (v1: Vec.DenseFloat, v2: Vec.DenseFloat) => {
     1 + (1 - cosineDistance(new DenseVector(v1.values.map(_.toDouble)), new DenseVector(v2.values.map(_.toDouble))))
   }
-
+  val Dot: (Vec.DenseFloat, Vec.DenseFloat) => Double = (v1: Vec.DenseFloat, v2: Vec.DenseFloat) => {
+    1 + (1 - dotDistance(new DenseVector(v1.values.map(_.toDouble)), new DenseVector(v2.values.map(_.toDouble))))
+  }
   val Hamming: (Vec.SparseBool, Vec.SparseBool) => Double = (v1: Vec.SparseBool, v2: Vec.SparseBool) => {
     val d1 = new Array[Boolean](v1.totalIndices)
     val d2 = new Array[Boolean](v2.totalIndices)
From 9b6094268042595db9481f7b529c77b39c01e55c Mon Sep 17 00:00:00 2001
From: Joan Codina
Date: Fri, 5 Apr 2024 16:38:52 +0200
Subject: [PATCH 02/11] Found some bugs when trying to build/run tests

---
 .../src/test/scala/com/klibisz/elastiknn/RecallSuite.scala      | 1 +
 .../com/klibisz/elastiknn/query/ElastiknnQueryBuilder.scala     | 2 ++
 .../com/klibisz/elastiknn/models/ExactSimilarityReference.scala | 2 +-
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/elastiknn-plugin-integration-tests/src/test/scala/com/klibisz/elastiknn/RecallSuite.scala 
b/elastiknn-plugin-integration-tests/src/test/scala/com/klibisz/elastiknn/RecallSuite.scala index 123905234..600b128f7 100644 --- a/elastiknn-plugin-integration-tests/src/test/scala/com/klibisz/elastiknn/RecallSuite.scala +++ b/elastiknn-plugin-integration-tests/src/test/scala/com/klibisz/elastiknn/RecallSuite.scala @@ -196,6 +196,7 @@ class RecallSuite extends AsyncFunSuite with Matchers with ElasticAsyncClient wi case Similarity.L1 => denseFloatTestData case Similarity.L2 => denseFloatTestData case Similarity.Cosine => denseFloatUnitTestData + case Similarity.Dot => denseFloatUnitTestData } } { val uuid = UUID.randomUUID().toString diff --git a/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/query/ElastiknnQueryBuilder.scala b/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/query/ElastiknnQueryBuilder.scala index 9c488e189..1958bc3d7 100644 --- a/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/query/ElastiknnQueryBuilder.scala +++ b/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/query/ElastiknnQueryBuilder.scala @@ -12,6 +12,7 @@ import org.elasticsearch.index.query.SearchExecutionContext final class ElastiknnQueryBuilder(floatVectorOps: FloatVectorOps, modelCache: ModelCache) { private val cosine = new ESF.Cosine(floatVectorOps) + private val dot = new ESF.Dot(floatVectorOps) private val l1 = new ESF.L1(floatVectorOps) private val l2 = new ESF.L2(floatVectorOps) @@ -66,6 +67,7 @@ final class ElastiknnQueryBuilder(floatVectorOps: FloatVectorOps, modelCache: Mo _: Mapping.DenseFloat | _: Mapping.CosineLsh | _: Mapping.DotLsh | _: Mapping.L2Lsh | _: Mapping.PermutationLsh ) => new ExactQuery(f, v, cosine) + case ( Exact(f, Similarity.Dot, v: Vec.DenseFloat), _: Mapping.DenseFloat | _: Mapping.CosineLsh | _: Mapping.DotLsh | _: Mapping.L2Lsh | _: Mapping.PermutationLsh diff --git a/elastiknn-plugin/src/test/scala/com/klibisz/elastiknn/models/ExactSimilarityReference.scala 
b/elastiknn-plugin/src/test/scala/com/klibisz/elastiknn/models/ExactSimilarityReference.scala index 2b302b894..aac39115d 100644 --- a/elastiknn-plugin/src/test/scala/com/klibisz/elastiknn/models/ExactSimilarityReference.scala +++ b/elastiknn-plugin/src/test/scala/com/klibisz/elastiknn/models/ExactSimilarityReference.scala @@ -20,7 +20,7 @@ object ExactSimilarityReference { 1 + (1 - cosineDistance(new DenseVector(v1.values.map(_.toDouble)), new DenseVector(v2.values.map(_.toDouble)))) } val Dot: (Vec.DenseFloat, Vec.DenseFloat) => Double = (v1: Vec.DenseFloat, v2: Vec.DenseFloat) => { - 1 + (1 - dotDistance(new DenseVector(v1.values.map(_.toDouble)), new DenseVector(v2.values.map(_.toDouble)))) + 1 + ( new DenseVector(v1.values.map(_.toDouble)) dot new DenseVector(v2.values.map(_.toDouble))) } val Hamming: (Vec.SparseBool, Vec.SparseBool) => Double = (v1: Vec.SparseBool, v2: Vec.SparseBool) => { val d1 = new Array[Boolean](v1.totalIndices) From 2e330ace6139abc7f334ad350f84f7543329bed7 Mon Sep 17 00:00:00 2001 From: Joan Codina Date: Fri, 5 Apr 2024 17:36:50 +0200 Subject: [PATCH 03/11] Update docs/_posts/2021-07-30-how-does-elastiknn-work.md Co-authored-by: Alex Klibisz --- docs/_posts/2021-07-30-how-does-elastiknn-work.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/_posts/2021-07-30-how-does-elastiknn-work.md b/docs/_posts/2021-07-30-how-does-elastiknn-work.md index 8ffe37d74..ff151c9de 100644 --- a/docs/_posts/2021-07-30-how-does-elastiknn-work.md +++ b/docs/_posts/2021-07-30-how-does-elastiknn-work.md @@ -43,7 +43,7 @@ The name is a combination of _Elastic_ and _KNN_ (K-Nearest Neighbors). The full list of features (copied from the home page) is as follows: - Datatypes to efficiently store dense and sparse numerical vectors in Elasticsearch documents, including multiple vectors per document. 
-- Exact nearest neighbor queries for five similarity functions: [L1](https://en.wikipedia.org/wiki/Taxicab_geometry), [L2](https://en.wikipedia.org/wiki/Euclidean_distance), [Cosine](https://en.wikipedia.org/wiki/Cosine_similarity), [Dot](https://en.wikipedia.org/wiki/Dot_product), [Jaccard](https://en.wikipedia.org/wiki/Jaccard_index), and [Hamming](https://en.wikipedia.org/wiki/Hamming_distance). +- Exact nearest neighbor queries for six similarity functions: [L1](https://en.wikipedia.org/wiki/Taxicab_geometry), [L2](https://en.wikipedia.org/wiki/Euclidean_distance), [Cosine](https://en.wikipedia.org/wiki/Cosine_similarity), [Dot](https://en.wikipedia.org/wiki/Dot_product), [Jaccard](https://en.wikipedia.org/wiki/Jaccard_index), and [Hamming](https://en.wikipedia.org/wiki/Hamming_distance). - Approximate queries using [Locality Sensitive Hashing](https://en.wikipedia.org/wiki/Locality-sensitive_hashing) for L2, Cosine, Dot, Jaccard, and Hamming similarity. - Integration of nearest neighbor queries with standard Elasticsearch queries. - Incremental index updates: start with any number of vectors and incrementally create/update/delete more without ever re-building the entire index. From 0a6b2da13c3b970eebead1bbeedf0c9f790aca5c Mon Sep 17 00:00:00 2001 From: Joan Codina Date: Fri, 5 Apr 2024 17:36:59 +0200 Subject: [PATCH 04/11] Update docs/_posts/2021-07-30-how-does-elastiknn-work.md Co-authored-by: Alex Klibisz --- docs/_posts/2021-07-30-how-does-elastiknn-work.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/_posts/2021-07-30-how-does-elastiknn-work.md b/docs/_posts/2021-07-30-how-does-elastiknn-work.md index ff151c9de..30ea7a557 100644 --- a/docs/_posts/2021-07-30-how-does-elastiknn-work.md +++ b/docs/_posts/2021-07-30-how-does-elastiknn-work.md @@ -88,7 +88,7 @@ So Java is used for all the CPU-bound LSH models and Lucene abstractions, and Sc Elasticsearch requires non-negative scores, with higher scores indicating higher relevance. 
-Elastiknn supports five vector similarity functions (L1, L2, Cosine,Dot, Jaccard, and Hamming). +Elastiknn supports six vector similarity functions (L1, L2, Cosine,Dot, Jaccard, and Hamming). Three of these are problematic with respect to this scoring requirement. Specifically, L1 and L2 are generally defined as _distance_ functions, rather than similarity functions, From 6af15cce6238d7f44064cacedd4166fe1f8395cb Mon Sep 17 00:00:00 2001 From: Joan Codina Date: Fri, 5 Apr 2024 17:52:19 +0200 Subject: [PATCH 05/11] Update docs/pages/index.md Co-authored-by: Alex Klibisz --- docs/pages/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/pages/index.md b/docs/pages/index.md index dd755fd63..19926776c 100644 --- a/docs/pages/index.md +++ b/docs/pages/index.md @@ -15,7 +15,7 @@ This enables users to combine traditional queries (e.g., "some product") with ve ## Features - Datatypes to efficiently store dense and sparse numerical vectors in Elasticsearch documents, including multiple vectors per document. -- Exact nearest neighbor queries for five similarity functions: [L1](https://en.wikipedia.org/wiki/Taxicab_geometry), [L2](https://en.wikipedia.org/wiki/Euclidean_distance), [Cosine](https://en.wikipedia.org/wiki/Cosine_similarity), [Dot](https://en.wikipedia.org/wiki/Dot_product) (for normalized vectors), [Jaccard](https://en.wikipedia.org/wiki/Jaccard_index), and [Hamming](https://en.wikipedia.org/wiki/Hamming_distance). +- Exact nearest neighbor queries for six similarity functions: [L1](https://en.wikipedia.org/wiki/Taxicab_geometry), [L2](https://en.wikipedia.org/wiki/Euclidean_distance), [Cosine](https://en.wikipedia.org/wiki/Cosine_similarity), [Dot](https://en.wikipedia.org/wiki/Dot_product) (for normalized vectors), [Jaccard](https://en.wikipedia.org/wiki/Jaccard_index), and [Hamming](https://en.wikipedia.org/wiki/Hamming_distance). 
- Approximate queries using [Locality Sensitive Hashing](https://en.wikipedia.org/wiki/Locality-sensitive_hashing) for L2, Cosine, Dot, Jaccard, and Hamming similarity. - Integration of nearest neighbor queries with standard Elasticsearch queries. - Incremental index updates. Start with 1 vector or 1 million vectors and then create/update/delete documents and vectors without ever re-building the entire index. From 6ffc72513d4fe4403ed8066e02bac3b6959f6dc5 Mon Sep 17 00:00:00 2001 From: Joan Codina Date: Fri, 5 Apr 2024 17:52:46 +0200 Subject: [PATCH 06/11] Update docs/pages/api.md Co-authored-by: Alex Klibisz --- docs/pages/api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/pages/api.md b/docs/pages/api.md index 3c7e1369b..dfe1758fa 100644 --- a/docs/pages/api.md +++ b/docs/pages/api.md @@ -650,7 +650,7 @@ GET /my-index/_search ### Dot LSH Query -Retrieve dense float vectors based on approximate Cosine similarity.[^note-angular-cosine] +Retrieve dense float vectors based on approximate Dot similarity.[^note-dot-cosine] ```json GET /my-index/_search From 6336bf312f1439632f7b18b40cefa1f123fbd19f Mon Sep 17 00:00:00 2001 From: Joan Codina Date: Fri, 5 Apr 2024 17:53:41 +0200 Subject: [PATCH 07/11] Update docs/pages/api.md Co-authored-by: Alex Klibisz --- docs/pages/api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/pages/api.md b/docs/pages/api.md index dfe1758fa..d0321e0e0 100644 --- a/docs/pages/api.md +++ b/docs/pages/api.md @@ -919,4 +919,4 @@ PUT /my-index See the [create index documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-create-index.html) for more details. [^note-angular-cosine]: Cosine similarity used to be (incorrectly) called "angular" similarity. All references to "angular" were renamed to "Cosine" in 7.13.3.2. You can still use "angular" in the JSON/HTTP API; it will convert to "cosine" internally. 
-[^note-dot-product]: Dot product is thought to be used with normalized vectors V, meaning that ||v||==1. \ No newline at end of file +[^note-dot-product]: Dot product is intended to be used with normalized vectors V, meaning that ||v||==1. \ No newline at end of file From 32b6a8d33a1394a430352c72ae960edb5525915e Mon Sep 17 00:00:00 2001 From: Joan Codina Date: Fri, 5 Apr 2024 17:54:41 +0200 Subject: [PATCH 08/11] Update elastiknn-models/src/main/java/com/klibisz/elastiknn/models/DotLshModel.java Co-authored-by: Alex Klibisz --- .../main/java/com/klibisz/elastiknn/models/DotLshModel.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/elastiknn-models/src/main/java/com/klibisz/elastiknn/models/DotLshModel.java b/elastiknn-models/src/main/java/com/klibisz/elastiknn/models/DotLshModel.java index b30898d1d..c7106c7d4 100644 --- a/elastiknn-models/src/main/java/com/klibisz/elastiknn/models/DotLshModel.java +++ b/elastiknn-models/src/main/java/com/klibisz/elastiknn/models/DotLshModel.java @@ -22,7 +22,8 @@ public class DotLshModel implements HashingModel.DenseFloat { * @param L number of hash tables * @param k number of hash functions concatenated to form a hash for each table * @param rng random number generator used to instantiate model parameters - */ + * @param vectorOps instance of FloatVectorOps used to execute vector operations. 
+ */ public DotLshModel(int dims, int L, int k, Random rng, FloatVectorOps vectorOps) { this.L = L; this.k = k; From 3aca421dc34f4fd3b97f1931c6160082a15d92ff Mon Sep 17 00:00:00 2001 From: Joan Codina Date: Fri, 5 Apr 2024 17:55:59 +0200 Subject: [PATCH 09/11] Addd changes to footnote --- docs/_posts/2021-07-30-how-does-elastiknn-work.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/_posts/2021-07-30-how-does-elastiknn-work.md b/docs/_posts/2021-07-30-how-does-elastiknn-work.md index 30ea7a557..0af76dcb0 100644 --- a/docs/_posts/2021-07-30-how-does-elastiknn-work.md +++ b/docs/_posts/2021-07-30-how-does-elastiknn-work.md @@ -94,7 +94,8 @@ Three of these are problematic with respect to this scoring requirement. Specifically, L1 and L2 are generally defined as _distance_ functions, rather than similarity functions, which means that higher relevance (i.e., lower distance) yields _lower_ scores. Cosine similarity is defined over $$[-1, 1]$$, and we can't have negative scores. -Dot similarity is defined over $$[-1, 1]$$, and we can't have negative scores, if vectors have a magnitude of 1, then it's equivalent to cosine similarity. +Dot similarity is defined over $$[-1, 1]$$, If vectors have a magnitude of 1, then it's equivalent to cosine similarity. +Elasticsearch does not allow negative scores. To work around this, Elastiknn applies simple transformations to produce L1, L2, and Cosine _similarity_ in accordance with the Elasticsearch requirements. The exact transformations are documented [on the API page](/api/#similarity-scoring). 
From b3c56762fb2902ade8c8af01185c3ef136c7faf4 Mon Sep 17 00:00:00 2001 From: Joan Codina Date: Fri, 5 Apr 2024 18:35:57 +0200 Subject: [PATCH 10/11] dotSimilarity does not return negative floats --- .../src/main/java/com/klibisz/elastiknn/models/ExactModel.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/elastiknn-models/src/main/java/com/klibisz/elastiknn/models/ExactModel.java b/elastiknn-models/src/main/java/com/klibisz/elastiknn/models/ExactModel.java index 39cee7d9a..b4b8e9779 100644 --- a/elastiknn-models/src/main/java/com/klibisz/elastiknn/models/ExactModel.java +++ b/elastiknn-models/src/main/java/com/klibisz/elastiknn/models/ExactModel.java @@ -41,6 +41,6 @@ public static double cosineSimilarity(FloatVectorOps floatVectorOps, float[] v1, @ForceInline public static double dotSimilarity(FloatVectorOps floatVectorOps, float[] v1, float[] v2) { - return 1 + floatVectorOps.dotSimilarity(v1, v2); + return Math.max(-1.0,1 + floatVectorOps.dotSimilarity(v1, v2)); } } From a75e9834a118c9fcef93cb050fe8b1a96627eb86 Mon Sep 17 00:00:00 2001 From: Joan Codina Date: Wed, 21 Aug 2024 11:46:04 +0200 Subject: [PATCH 11/11] Add some tests --- docs/pages/api.md | 4 +- .../elastiknn/api/XContentCodecSuite.scala | 83 +++++++++++++++++++ .../klibisz/elastiknn/models/ExactModel.java | 2 +- 3 files changed, 86 insertions(+), 3 deletions(-) diff --git a/docs/pages/api.md b/docs/pages/api.md index d0321e0e0..59c8e87b3 100644 --- a/docs/pages/api.md +++ b/docs/pages/api.md @@ -470,11 +470,11 @@ The exact transformations are described below. 
|Jaccard|N/A|0|1.0|
 |Hamming|N/A|0|1.0|
 |Cosine[^note-angular-cosine]|`cosine similarity + 1`|0|2|
-|Dot[^note-dot-product]|`Dot similarity + 1`|0|2|
+|Dot[^note-dot-product]|`min(2,max(0,Dot similarity + 1))`|0|2|
 |L1|`1 / (1 + l1 distance)`|0|1|
 |L2|`1 / (1 + l2 distance)`|0|1|
 
-Dot similirarity will produce negative scores if the vectors are not normalized
+Dot similarity, if the vectors are not normalized, can produce values outside the [-1,1] range; the min/max in the transformation above clamps the resulting score to [0,2] in these cases.
 
 If you're using the `elastiknn_nearest_neighbors` query with other queries, and the score values are inconvenient (e.g. huge values like 1e6), consider wrapping the query in a [Script Score Query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-script-score-query.html), where you can access and transform the `_score` value.
diff --git a/elastiknn-api4s/src/test/scala/com/klibisz/elastiknn/api/XContentCodecSuite.scala b/elastiknn-api4s/src/test/scala/com/klibisz/elastiknn/api/XContentCodecSuite.scala
index 177abcfa7..2f551d5d2 100644
--- a/elastiknn-api4s/src/test/scala/com/klibisz/elastiknn/api/XContentCodecSuite.scala
+++ b/elastiknn-api4s/src/test/scala/com/klibisz/elastiknn/api/XContentCodecSuite.scala
@@ -404,6 +404,42 @@ class XContentCodecSuite extends AnyFreeSpec with Matchers {
         ex1.getMessage shouldBe "Expected [L] to be one of [VALUE_NUMBER] but found [VALUE_STRING]"
       }
     }
+    "DotLsh" - {
+      "roundtrip" in {
+        for {
+          _ <- 1 to 100
+          (dims, l, k) = (rng.nextInt(), rng.nextInt(), rng.nextInt())
+          mapping = Mapping.DotLsh(dims, l, k)
+          expected = Json.obj(
+            "type" -> "elastiknn_dense_float_vector".asJson,
+            "elastiknn" -> Json.obj(
+              "model" -> "lsh".asJson,
+              "dims" -> dims.asJson,
+              "similarity" -> "dot".asJson,
+              "L" -> l.asJson,
+              "k" -> k.asJson
+            )
+          )
+        } {
+          roundtrip[Mapping](expected, mapping)
+        }
+      }
+      "errors" in {
+        val ex1 = intercept[XContentParseException](decodeUnsafeFromString[Mapping]("""
+            |{
+            |  "type": 
"elastiknn_dense_float_vector",
+            |  "elastiknn": {
+            |    "model": "lsh",
+            |    "dims": 33,
+            |    "similarity": "dot",
+            |    "L": "33",
+            |    "k": 3
+            |  }
+            |}
+            |""".stripMargin))
+        ex1.getMessage shouldBe "Expected [L] to be one of [VALUE_NUMBER] but found [VALUE_STRING]"
+      }
+    }
     "L2Lsh" - {
       "roundtrip" in {
         for {
@@ -484,6 +520,24 @@ class XContentCodecSuite extends AnyFreeSpec with Matchers {
     )
 
     def randomVec(): Vec = vecChoices(rng.nextInt(vecChoices.length))()
+
+    val vecNormChoices = Array(
+      () => {
+        val vec = Vec.DenseFloat.random(rng.nextInt(100))
+        val norm = math.sqrt(vec.data.map(x => x * x).sum).toFloat
+        if (norm == 0) vec else Vec.DenseFloat(vec.data.map(_ / norm))
+      },
+      () => Vec.Indexed(s"index${rng.nextInt()}", s"id${rng.nextInt()}", s"field${rng.nextInt()}"),
+      () => {
+        // Generate a sparse boolean vector with exactly one true value
+        val length = rng.nextInt(1000)
+        val index = rng.nextInt(length) // Random index to be set to true
+        Vec.SparseBool(Array(true), Array(index), length)
+      },
+      () => Vec.Empty()
+    )
+
+    def randomNormVec(): Vec = vecNormChoices(rng.nextInt(vecNormChoices.length))()
 
     def randomSimilarity(): Similarity = Similarity.values(rng.nextInt(Similarity.values.length))
 
@@ -545,6 +599,35 @@ class XContentCodecSuite extends AnyFreeSpec with Matchers {
         ex1.getMessage shouldBe "Unable to construct [nearest neighbors query] from parsed JSON"
       }
     }
+    "DotLsh" - {
+      "roundtrip" in {
+        for {
+          _ <- 1 to 100
+          vec = randomNormVec()
+          query = NearestNeighborsQuery.DotLsh(s"field${rng.nextInt()}", rng.nextInt(), vec)
+          expected = Json.obj(
+            "field" -> query.field.asJson,
+            "candidates" -> query.candidates.asJson,
+            "model" -> "lsh".asJson,
+            "similarity" -> "dot".asJson,
+            "vec" -> parse(XContentCodec.encodeUnsafeToString(vec)).fold(fail(_), identity)
+          )
+        } {
+          roundtrip[NearestNeighborsQuery](expected, query)
+        }
+      }
+      "errors" in {
+        val ex1 = intercept[XContentParseException](decodeUnsafeFromString[NearestNeighborsQuery]("""
+            |{
+            |  "field": "vec",
+            |  "model": "lsh",
+            |  "similarity": "dot",
+            |  "vec": [0, 1, 0]
+            |}
+            |""".stripMargin))
+        ex1.getMessage shouldBe "Unable to construct [nearest neighbors query] from parsed JSON"
+      }
+    }
     "HammingLsh" - {
       "roundtrip" in {
         for {
diff --git a/elastiknn-models/src/main/java/com/klibisz/elastiknn/models/ExactModel.java b/elastiknn-models/src/main/java/com/klibisz/elastiknn/models/ExactModel.java
index b4b8e9779..3489322ec 100644
--- a/elastiknn-models/src/main/java/com/klibisz/elastiknn/models/ExactModel.java
+++ b/elastiknn-models/src/main/java/com/klibisz/elastiknn/models/ExactModel.java
@@ -41,6 +41,6 @@ public static double cosineSimilarity(FloatVectorOps floatVectorOps, float[] v1,
 
     @ForceInline
     public static double dotSimilarity(FloatVectorOps floatVectorOps, float[] v1, float[] v2) {
-        return Math.max(-1.0,1 + floatVectorOps.dotSimilarity(v1, v2));
+        return Math.min(2,Math.max(0,1 + floatVectorOps.dotSimilarity(v1, v2)));
     }
 }