COH-32065 - Use numpy package to improve performance of Vectors.normalize() API

dhirupandey · dhirupandey · commit c605611cc05c · 2025-04-07T10:58:20.000-07:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -30,6 +30,7 @@ pymitter = ">=0.4,<1.1"
 typing-extensions = ">=4.11,<4.14"
 types-protobuf = "5.29.1.20250403"
 pympler = "1.1"
+numpy = "2.0.2"
 
 [tool.poetry.dev-dependencies]
 pytest = "~8.3"
diff --git a/src/coherence/ai.py b/src/coherence/ai.py
@@ -1,14 +1,14 @@
-# Copyright (c) 2022, 2024, Oracle and/or its affiliates.
+# Copyright (c) 2022, 2025, Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at
 # https://oss.oracle.com/licenses/upl.
 
 from __future__ import annotations
 
 import base64
-import math
 from abc import ABC
 from collections import OrderedDict
 from typing import Any, Dict, List, Optional, TypeVar, Union, cast
+import numpy as np
 
 import jsonpickle
 
@@ -342,19 +342,8 @@ class Vectors:
     EPSILON = 1e-30  # Python automatically handles float precision
 
     @staticmethod
-    def normalize(array: List[float]) -> List[float]:
-        norm = 0.0
-        c_dim = len(array)
-
-        # Calculate the norm (sum of squares)
-        for v in array:
-            norm += v * v
-
-        # Compute the normalization factor (inverse of the square root of the sum of squares)
-        norm = 1.0 / (math.sqrt(norm) + Vectors.EPSILON)
-
-        # Apply the normalization factor to each element in the array
-        for i in range(c_dim):
-            array[i] = array[i] * norm
-
-        return array
+    def normalize_numpy(array: list[float]) -> list[float]:
+        np_array = np.array(array, dtype=np.float64)
+        norm = np.linalg.norm(np_array) + Vectors.EPSILON
+        normalized_array = np_array / norm
+        return normalized_array.tolist()
diff --git a/tests/e2e/test_ai.py b/tests/e2e/test_ai.py
@@ -54,12 +54,12 @@ async def populate_vectors(vectors: NamedCache[int, ValueWithVector]) -> ValueWi
 
     # Assign normalized vectors to the first 5 entries
     for i in range(5):
-        values[i] = ValueWithVector(FloatVector(Vectors.normalize(matches[i])), str(i), i)
+        values[i] = ValueWithVector(FloatVector(Vectors.normalize_numpy(matches[i])), str(i), i)
         await vectors.put(i, values[i])
 
     # Fill the remaining values with random vectors
     for i in range(5, count):
-        values[i] = ValueWithVector(FloatVector(Vectors.normalize(random_floats(DIMENSIONS))), str(i), i)
+        values[i] = ValueWithVector(FloatVector(Vectors.normalize_numpy(random_floats(DIMENSIONS))), str(i), i)
         await vectors.put(i, values[i])
 
     return cast(ValueWithVector, values[0])
@@ -79,13 +79,13 @@ async def populate_document_chunk_vectors(vectors: NamedCache[int, DocumentChunk
 
     # Assign normalized vectors to the first 5 entries
     for i in range(5):
-        values[i] = DocumentChunk(str(i), metadata=None, vector=FloatVector(Vectors.normalize(matches[i])))
+        values[i] = DocumentChunk(str(i), metadata=None, vector=FloatVector(Vectors.normalize_numpy(matches[i])))
         await vectors.put(i, values[i])
 
     # Fill the remaining values with random vectors
     for i in range(5, count):
         values[i] = DocumentChunk(
-            str(i), metadata=None, vector=FloatVector(Vectors.normalize(random_floats(DIMENSIONS)))
+            str(i), metadata=None, vector=FloatVector(Vectors.normalize_numpy(random_floats(DIMENSIONS)))
         )
         await vectors.put(i, values[i])