
Commit 45b1309

michaeljmarshall authored and michaelsembwever committed
CNDB-15640: Determine if vectors are unit length at insert (#2059)
Fixes: riptano/cndb#15640 In order to lay the ground work for Fused ADC, I want to refactor some of the PQ/BQ logic. The unit length computation needs to move, so I decided to move it out to its own PR. The core idea is that: * some models are documented to provide unit length vectors, and in those cases, we should skip the computational check * otherwise, we should check at runtime until we hit a non-unit length vector, and then we can skip the check and configure the `writePQ` method as needed (I asked chat gpt to provide proof for the config changes proposed in this PR. Here is it's generated description.) Quick rundown of which models spit out normalized vectors (so cosine == dot product, etc.): * **OpenAI (ada-002, v3-small, v3-large)** → already normalized. [OpenAI FAQ](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings) literally says embeddings are unit-length. * **BERT** → depends. The SBERT “-cos-” models add a [`Normalize` layer](https://www.sbert.net/docs/package_reference/layers.html#normalize) so they’re fine; vanilla BERT doesn’t. * **Google Gecko** → normalized out of the box per [Vertex AI docs](https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings). * **NVIDIA QA-4** → nothing in the [NVIDIA NIM model card](https://docs.api.nvidia.com/nim/reference/nvidia-embed-qa-4) about normalization, so assume *not* normalized and handle it yourself. * **Cohere v3** → not explicitly in their [API docs](https://docs.cohere.com/docs/cohere-embed) TL;DR: OpenAI + Gecko are definitely safe, Cohere/BERT/NV need manual normalization due to lack of documentation.
1 parent 623483d commit 45b1309
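For concreteness, here is a minimal, standalone sketch of the two operations the commit message refers to: the unit-length check (same `|v·v - 1| < 0.01` tolerance as the diff below, but over a plain `float[]` rather than jvector's `VectorUtil`) and the manual normalization suggested for models that don't document unit-length output. The class and method names are illustrative, not part of this commit:

```java
// Illustrative helper only; not part of the committed code.
final class VectorNorms
{
    // Same tolerance the committed check uses.
    static final float TOLERANCE = 0.01f;

    // v has unit length exactly when v·v == 1, so comparing the squared
    // norm to 1 avoids an unnecessary sqrt.
    static boolean isUnitLength(float[] v)
    {
        return Math.abs(dotSelf(v) - 1.0f) < TOLERANCE;
    }

    // Manual normalization for models that don't document unit-length output.
    static float[] normalize(float[] v)
    {
        float norm = (float) Math.sqrt(dotSelf(v));
        float[] out = new float[v.length];
        for (int i = 0; i < v.length; i++)
            out[i] = v[i] / norm;
        return out;
    }

    private static float dotSelf(float[] v)
    {
        float dot = 0f;
        for (float x : v)
            dot += x * x;
        return dot;
    }

    public static void main(String[] args)
    {
        float[] raw = { 3f, 4f };                          // |raw| == 5
        System.out.println(isUnitLength(raw));             // false
        System.out.println(isUnitLength(normalize(raw)));  // true: (0.6, 0.8)
    }
}
```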

File tree

2 files changed: +39 -21 lines changed


src/java/org/apache/cassandra/index/sai/disk/vector/CassandraOnHeapGraph.java

Lines changed: 10 additions & 8 deletions

@@ -33,7 +33,6 @@
 import java.util.function.Function;
 import java.util.function.IntUnaryOperator;
 import java.util.function.ToIntFunction;
-import java.util.stream.IntStream;
 
 import com.google.common.annotations.VisibleForTesting;
 import org.cliffc.high_scale_lib.NonBlockingHashMap;
@@ -125,6 +124,7 @@ public enum PQVersion {
     private final InvalidVectorBehavior invalidVectorBehavior;
     private final IntHashSet deletedOrdinals;
     private volatile boolean hasDeletions;
+    private volatile boolean allVectorsAreUnitLength;
 
     // we don't need to explicitly close these since only on-heap resources are involved
     private final ThreadLocal<GraphSearcherAccessManager> searchers;
@@ -158,6 +158,8 @@ public CassandraOnHeapGraph(IndexContext context, boolean forSearching, Memtable
         invalidVectorBehavior = forSearching ? InvalidVectorBehavior.FAIL : InvalidVectorBehavior.IGNORE;
 
         int jvectorVersion = Version.current().onDiskFormat().jvectorFileFormatVersion();
+        // Assume true until we observe otherwise.
+        allVectorsAreUnitLength = true;
         // This is only a warning since it's not a fatal error to write without hierarchy
         if (indexConfig.isHierarchyEnabled() && jvectorVersion < 4)
             logger.warn("Hierarchical graphs configured but node configured with V3OnDiskFormat.JVECTOR_VERSION {}. " +
@@ -269,6 +271,12 @@ public long add(ByteBuffer term, T key)
             var success = postingsByOrdinal.compareAndPut(ordinal, null, postings);
             assert success : "postingsByOrdinal already contains an entry for ordinal " + ordinal;
             bytesUsed += builder.addGraphNode(ordinal, vector);
+
+            // If necessary, check if the vector is unit length.
+            if (!sourceModel.hasKnownUnitLengthVectors() && allVectorsAreUnitLength)
+                if (!(Math.abs(VectorUtil.dotProduct(vector, vector) - 1.0f) < 0.01))
+                    allVectorsAreUnitLength = false;
+
             return bytesUsed;
         }
         else
@@ -560,7 +568,6 @@ private long writePQ(SequentialWriter writer, V5VectorPostingsWriter.RemappedPos
         // Build encoder and compress vectors
         VectorCompressor<?> compressor; // will be null if we can't compress
         CompressedVectors cv = null;
-        boolean containsUnitVectors;
         // limit the PQ computation and encoding to one index at a time -- goal during flush is to
         // evict from memory ASAP so better to do the PQ build (in parallel) one at a time
         synchronized (CassandraOnHeapGraph.class)
@@ -580,15 +587,10 @@ private long writePQ(SequentialWriter writer, V5VectorPostingsWriter.RemappedPos
             // encode (compress) the vectors to save
             if (compressor != null)
                 cv = compressor.encodeAll(new RemappedVectorValues(remapped, remapped.maxNewOrdinal, vectorValues));
-
-            containsUnitVectors = IntStream.range(0, vectorValues.size())
-                                           .parallel()
-                                           .mapToObj(vectorValues::getVector)
-                                           .allMatch(v -> Math.abs(VectorUtil.dotProduct(v, v) - 1.0f) < 0.01);
         }
 
         var actualType = compressor == null ? CompressionType.NONE : preferredCompression.type;
-        writePqHeader(writer, containsUnitVectors, actualType);
+        writePqHeader(writer, allVectorsAreUnitLength, actualType);
         if (actualType == CompressionType.NONE)
            return writer.position();
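A hedged aside on why the boolean passed to `writePqHeader` matters (assumed semantics, not project code): for unit-length vectors, cosine similarity reduces to a plain dot product, so a reader of the PQ file can safely take the cheaper similarity path when the flag is set. The diff also trades the flush-time parallel `IntStream` scan for an incremental per-insert check that short-circuits after the first counterexample.

```java
// Illustration only: cosine(u, v) = u·v / (|u| |v|), which equals
// dot(u, v) whenever |u| == |v| == 1.
final class CosineVsDot
{
    static float dot(float[] a, float[] b)
    {
        float s = 0f;
        for (int i = 0; i < a.length; i++)
            s += a[i] * b[i];
        return s;
    }

    static float norm(float[] a)
    {
        return (float) Math.sqrt(dot(a, a));
    }

    public static void main(String[] args)
    {
        float[] u = { 0.6f, 0.8f };  // unit length: 0.36 + 0.64 == 1
        float[] v = { 0.0f, 1.0f };  // unit length
        float d = dot(u, v);                     // 0.8
        float cosine = d / (norm(u) * norm(v));  // also 0.8
        System.out.println(d + " == " + cosine);
    }
}
```

With this equivalence, the per-insert check costs at most one dot product per vector, and nothing at all once `allVectorsAreUnitLength` flips to false or when the source model is documented to normalize.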
src/java/org/apache/cassandra/index/sai/disk/vector/VectorSourceModel.java

Lines changed: 29 additions & 13 deletions

@@ -31,18 +31,19 @@
 import static org.apache.cassandra.index.sai.disk.vector.VectorCompression.CompressionType.BINARY_QUANTIZATION;
 import static org.apache.cassandra.index.sai.disk.vector.VectorCompression.CompressionType.NONE;
 import static org.apache.cassandra.index.sai.disk.vector.VectorCompression.CompressionType.PRODUCT_QUANTIZATION;
-
 public enum VectorSourceModel
 {
-    ADA002((dimension) -> new VectorCompression(PRODUCT_QUANTIZATION, dimension, 0.125), 1.25),
-    OPENAI_V3_SMALL((dimension) -> new VectorCompression(PRODUCT_QUANTIZATION, dimension, 0.0625), 1.5),
-    OPENAI_V3_LARGE((dimension) -> new VectorCompression(PRODUCT_QUANTIZATION, dimension, 0.0625), 1.25),
-    BERT(COSINE, (dimension) -> new VectorCompression(PRODUCT_QUANTIZATION, dimension, 0.25), __ -> 1.0),
-    GECKO((dimension) -> new VectorCompression(PRODUCT_QUANTIZATION, dimension, 0.125), 1.25),
-    NV_QA_4((dimension) -> new VectorCompression(PRODUCT_QUANTIZATION, dimension, 0.125), 1.25),
-    COHERE_V3((dimension) -> new VectorCompression(PRODUCT_QUANTIZATION, dimension, 0.0625), 1.25),
-
-    OTHER(COSINE, VectorSourceModel::genericCompressionFor, VectorSourceModel::genericOverquery);
+    ADA002((dimension) -> new VectorCompression(PRODUCT_QUANTIZATION, dimension, 0.125), 1.25, true),
+    OPENAI_V3_SMALL((dimension) -> new VectorCompression(PRODUCT_QUANTIZATION, dimension, 0.0625), 1.5, true),
+    OPENAI_V3_LARGE((dimension) -> new VectorCompression(PRODUCT_QUANTIZATION, dimension, 0.0625), 1.25, true),
+    // BERT is not known to have unit length vectors in all cases
+    BERT(COSINE, (dimension) -> new VectorCompression(PRODUCT_QUANTIZATION, dimension, 0.25), __ -> 1.0, false),
+    GECKO((dimension) -> new VectorCompression(PRODUCT_QUANTIZATION, dimension, 0.125), 1.25, true),
+    NV_QA_4((dimension) -> new VectorCompression(PRODUCT_QUANTIZATION, dimension, 0.125), 1.25, false),
+    // Cohere does not officially say they have unit length vectors, but some users report that they do
+    COHERE_V3((dimension) -> new VectorCompression(PRODUCT_QUANTIZATION, dimension, 0.0625), 1.25, false),
+
+    OTHER(COSINE, VectorSourceModel::genericCompressionFor, VectorSourceModel::genericOverquery, false);
 
     /**
      * Default similarity function for this model.
@@ -58,18 +59,33 @@ public enum VectorSourceModel
      */
     public final Function<VectorCompression, Double> overqueryProvider;
 
-    VectorSourceModel(Function<Integer, VectorCompression> compressionProvider, double overqueryFactor)
+    /**
+     * Indicates that the model is known to have unit length vectors. When false, the runtime checks per graph
+     * until a non-unit length vector is found.
+     */
+    private final boolean knownUnitLength;
+
+    VectorSourceModel(Function<Integer, VectorCompression> compressionProvider,
+                      double overqueryFactor,
+                      boolean knownUnitLength)
     {
-        this(DOT_PRODUCT, compressionProvider, __ -> overqueryFactor);
+        this(DOT_PRODUCT, compressionProvider, __ -> overqueryFactor, knownUnitLength);
     }
 
     VectorSourceModel(VectorSimilarityFunction defaultSimilarityFunction,
                       Function<Integer, VectorCompression> compressionProvider,
-                      Function<VectorCompression, Double> overqueryProvider)
+                      Function<VectorCompression, Double> overqueryProvider,
+                      boolean knownUnitLength)
     {
         this.defaultSimilarityFunction = defaultSimilarityFunction;
         this.compressionProvider = compressionProvider;
         this.overqueryProvider = overqueryProvider;
+        this.knownUnitLength = knownUnitLength;
+    }
+
+    public boolean hasKnownUnitLengthVectors()
+    {
+        return knownUnitLength;
+    }
 
     public static VectorSourceModel fromString(String value)
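A hedged usage sketch of the new accessor (the caller below is hypothetical; only the enum constants and `hasKnownUnitLengthVectors()` come from this diff):

```java
import org.apache.cassandra.index.sai.disk.vector.VectorSourceModel;

// Hypothetical caller mirroring the per-insert pattern in CassandraOnHeapGraph.add.
final class UnitLengthPolicyDemo
{
    public static void main(String[] args)
    {
        for (VectorSourceModel model : VectorSourceModel.values())
        {
            // Models flagged true (ADA002, OPENAI_V3_*, GECKO) skip the per-vector
            // check; the rest are verified until the first non-unit-length vector.
            System.out.println(model + " -> check needed: " + !model.hasKnownUnitLengthVectors());
        }
    }
}
```

Flagging `COHERE_V3` as false is the conservative choice here: some users report normalized output, but absent official documentation the per-graph runtime check decides.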
