Commit dbeb411

better bulk data tests
1 parent f3880a9 commit dbeb411

6 files changed (+238 −68 lines)

requirements.txt

Lines changed: 3 additions & 1 deletion
@@ -1,5 +1,5 @@
 cachetools==5.3.3
-certifi==2024.6.2
+certifi==2024.7.4
 chardet==5.2.0
 charset-normalizer==3.3.2
 colorama==0.4.6
@@ -11,6 +11,7 @@ idna==3.7
 iniconfig==2.0.0
 jsonpath-ng==1.6.1
 leb128==1.0.8
+numpy~=2.1.2
 orjson==3.10.7
 packaging==24.1
 pandas==2.2.3
@@ -30,3 +31,4 @@ tox==4.15.1
 tzdata==2024.2
 urllib3==2.2.2
 virtualenv==20.26.2
+

src/module-api/src/bulk_data.py

Lines changed: 41 additions & 4 deletions
@@ -5,16 +5,38 @@
 
 @dataclass
 class BulkData:
+    # BulkData: a list of byte strings
     data: list[bytes]
 
     def __init__(self, data: list[bytes]):
         self.data = data
 
     @staticmethod
     def is_serialized_bulk_data(serialized: bytes) -> bool:
+        """
+        Check whether the byte array is a serialized BulkData.
+
+        A BulkData is serialized as a two-byte header (0x87 0x87) followed by
+        the LEB128-encoded number of items in the list; each item is then
+        encoded as its LEB128-encoded length followed by the item bytes.
+
+        :param serialized: the serialized byte array
+        :return: True if the byte array is a serialized BulkData, False otherwise
+        """
         return len(serialized) > 2 and serialized[0] == 0x87 and serialized[1] == 0x87
 
     def serialize(self) -> bytes:
+        """
+        Serialize the BulkData to a byte array.
+
+        The serialized form is a two-byte header (0x87 0x87) followed by the
+        LEB128-encoded number of items in the list; each item is then encoded
+        as its LEB128-encoded length followed by the item bytes.
+
+        :return: the serialized byte array
+        """
         result = io.BytesIO()
         # Write the header
         result.write(bytes([0x87, 0x87]))
@@ -28,14 +50,29 @@ def serialize(self) -> bytes:
 
     @classmethod
     def deserialize(cls, serialized: bytes) -> 'BulkData':
+        """
+        Deserialize a serialized byte array into a BulkData object.
+
+        The serialized byte array is expected to be a two-byte header
+        (0x87 0x87) followed by the LEB128-encoded number of items; each item
+        is encoded as its LEB128-encoded length followed by the item bytes.
+
+        :param serialized: the serialized byte array
+        :return: the deserialized BulkData object
+        """
         data = io.BytesIO(serialized)
         # read the first two bytes 0x87
-        _header = data.read(2)
+        header = data.read(2)
+        # assert the first two bytes are 0x87
+        assert header == bytes([0x87, 0x87])
         # read the number of items
         num_items, _num_bytes = u.decode_reader(data)
         # Preallocate the list for the result
-        result = [None] * num_items
-        for i in range(num_items):
+        result: list[bytes] = [b'' for _ in range(num_items)]
+        for i in range(num_items):
            item_length, _ = u.decode_reader(data)
-            result[i] = data.read(item_length)
+            b = data.read(item_length)
+            assert len(b) == item_length, f"Item {i} has length {len(b)}, expected {item_length} bytes"
+            result[i] = b
         return cls(result)
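
For context, a minimal round-trip sketch of the wire format exercised above, with hypothetical byte values; it assumes bulk_data.py is importable and that u is the unsigned LEB128 codec from the leb128 package pinned in requirements.txt:

from bulk_data import BulkData

items = [b'\x01\x02\x03', b'\x04\x05\x06', b'\x07' * 10]
blob = BulkData(items).serialize()

# Two-byte magic header, then the LEB128-encoded item count (3 fits in one byte)
assert blob[:2] == b'\x87\x87'
assert blob[2] == 0x03
assert BulkData.is_serialized_bulk_data(blob)

# deserialize() now checks the header and every item length before returning
assert BulkData.deserialize(blob).data == items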

src/module-api/src/kmip_post.py

Lines changed: 0 additions & 1 deletion
@@ -1,6 +1,5 @@
 import json
 import requests
-# import httpx
 from client_configuration import ClientConfiguration
 import logging
 

src/module-api/src/lru_cache.py

Lines changed: 69 additions & 6 deletions
@@ -14,6 +14,20 @@
 
 
 def key_hash(key: bytes | list[bytes]) -> int:
+    """
+    Compute the hash of the key.
+
+    This function computes the hash of the key using the xxh64 algorithm.
+    If the key is a list of byte strings, the hash is the digest of their
+    concatenation; if the key is a single bytes object, it is the digest
+    of the key bytes.
+
+    Args:
+        key: the key to hash
+
+    Returns:
+        the hash of the key
+    """
     h = xxh64()
     if isinstance(key, list):
         for k in key:
@@ -22,39 +36,88 @@ def key_hash(key: bytes | list[bytes]) -> int:
         h.update(key)
     return h.intdigest()
 
-
+###
+# The LRUCache is a least recently used cache. It is used to store the results
+# of the encrypt and decrypt operations in the KMS proxy. The cache is
+# implemented as a dictionary with a limited size (the capacity) and is
+# protected by a lock to prevent concurrent access from multiple threads.
+# The least recently used entry is evicted when the cache reaches capacity.
+###
 class LRUCache:
 
     def __init__(self, capacity):
-        self.cache = dict()
         self.capacity = capacity
-        self.access = deque()
+        self.cache = dict()
+        # The access list tracks the order of access of the cache entries.
+        # The most recently accessed entry is at the end of the list.
+        self.access = deque(maxlen=capacity)
         self.lock = threading.Lock()
 
     def get(self, key: bytes | list[bytes]) -> bytes | None:
+        """
+        Get the value associated with the key from the cache.
+
+        Args:
+            key: the key to look up
+
+        Returns:
+            the value associated with the key if the key is in the cache,
+            None otherwise
+        """
         key = key_hash(key)
         if key not in self.cache:
+            # The key is not in the cache, return None
             return None
         else:
-            # small race condition here with the test on self.cache
-            # but we do not want to delay self.cache
+            # The key is in the cache: move it to the end of the access
+            # list so it becomes the most recently accessed entry
             with self.lock:
                 if self.access[-1] != key:
+                    # The key is not already the most recently accessed:
+                    # remove it from the list and append it at the end
                    self.access.remove(key)
                    self.access.append(key)
+            # Return the value associated with the key
             return self.cache[key]
 
     def put(self, key: bytes | list[bytes], value: bytes):
+        """
+        Put a key/value pair in the cache.
+
+        Args:
+            key: the key to put in the cache
+            value: the value to associate with the key
+
+        Notes:
+            When the cache reaches its capacity, the least recently used
+            entry is removed from the cache.
+        """
         key = key_hash(key)
         with self.lock:
+            # If the key is already in the cache, remove it from the access list
             if key in self.cache:
                 self.access.remove(key)
+            # If the cache is full, remove the least recently used entry
             elif len(self.cache) == self.capacity:
                 oldest = self.access.popleft()
                 del self.cache[oldest]
+            # Store the key/value pair and append the key to the end of the
+            # access list
             self.cache[key] = value
             self.access.append(key)
 
     def print(self):
-        for key in self.access:
+        """
+        Print the content of the cache.
+
+        This method is useful for debugging. It prints the cache as a
+        sequence of key/value pairs, with the most recently accessed entry
+        first.
+        """
+        # Iterate over the access list in reverse order so the most recently
+        # accessed entry is printed first
+        for key in reversed(self.access):
             print(f"{key}: {self.cache[key]}")

tests/bulk_data_test.py

Lines changed: 73 additions & 35 deletions
@@ -1,9 +1,10 @@
 import numpy as np
 import numpy.testing as npt
-from bulk_data import BulkData
 import logging
 import time
 import random
+import unittest
+from bulk_data import BulkData
 
 logger = logging.getLogger(__name__)
 slog = logging.LoggerAdapter(logger, {
@@ -15,22 +16,6 @@
 })
 
 
-def test_bulk_data_test_vector():
-    data = np.array([
-        bytes([0x01, 0x02, 0x03]),
-        bytes([0x04, 0x05, 0x06]),
-        bytes([0x07] * 10)
-    ])
-    bulk_data = BulkData(data)
-    serialized = bulk_data.serialize()
-    assert list(serialized) == [
-        0x87, 0x87, 0x03, 0x03, 0x01, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x0A, 0x07, 0x07,
-        0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07
-    ]
-    deserialized = BulkData.deserialize(serialized)
-    npt.assert_array_equal(data, deserialized.data)
-
-
 def benchmark_bulk_data(bulk_data) -> BulkData:
     t_start = time.perf_counter()
     serialized = bulk_data.serialize()
@@ -48,23 +33,76 @@ def random_bytes() -> bytes:
     return bytes(random.getrandbits(8) for _ in range(64))
 
 
-def test_bulk_data_benchmark():
-    num_samples = 5000000
-    slog.info(f"Testing performance with bulk data of {num_samples} samples")
-    t_start = time.perf_counter()
-    data = np.array([
-        random.randbytes(64) for _ in range(num_samples)
-    ], dtype=np.object_)
-    # check all samples have 64 bytes
-    for item in data:
-        assert len(item) == 64
-    bulk_data = BulkData(data)
-    t_generate = time.perf_counter() - t_start
-    slog.info(f"Generate: {t_generate}s")
+class TestBulkDataDeserialize(unittest.TestCase):
+    def test_valid_serialization_one_item(self):
+        serialized = b'\x87\x87\x01\x03abc'
+        expected = BulkData([b'abc'])
+        self.assertEqual(BulkData.deserialize(serialized), expected)
+
+    def test_valid_serialization_multiple_items(self):
+        serialized = b'\x87\x87\x02\x03abc\x03def'
+        expected = BulkData([b'abc', b'def'])
+        self.assertEqual(BulkData.deserialize(serialized), expected)
+
+    def test_invalid_serialization_incorrect_header(self):
+        serialized = b'\x88\x87\x01\x03abc'
+        with self.assertRaises(AssertionError):
+            BulkData.deserialize(serialized)
+
+    def test_invalid_serialization_incorrect_item_length(self):
+        serialized = b'\x87\x87\x01\x04abc'
+        with self.assertRaises(AssertionError):
+            BulkData.deserialize(serialized)
+
+    def test_invalid_serialization_truncated_data(self):
+        serialized = b'\x87\x87\x01\x03ab'
+        with self.assertRaises(AssertionError):
+            BulkData.deserialize(serialized)
+
+    def test_invalid_serialization_empty_data(self):
+        serialized = b''
+        with self.assertRaises(AssertionError):
+            BulkData.deserialize(serialized)
+
+    def test_bulk_data_test_vector(self):
+        data = np.array([
+            bytes([0x01, 0x02, 0x03]),
+            bytes([0x04, 0x05, 0x06]),
+            bytes([0x07] * 10)
+        ])
+        bulk_data = BulkData(data.tolist())
+        serialized = bulk_data.serialize()
+        assert list(serialized) == [
+            0x87, 0x87, 0x03, 0x03, 0x01, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x0A, 0x07, 0x07,
+            0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07
+        ]
+        deserialized = BulkData.deserialize(serialized)
+        npt.assert_array_equal(data, deserialized.data)
+
+    def test_bulk_data_benchmark(self):
+        num_samples = 1000000
+        slog.info(f"Testing performance with bulk data of {num_samples} samples")
+        t_start = time.perf_counter()
+        data = np.array([
+            random.randbytes(64) for _ in range(num_samples)
+        ], dtype=np.object_)
+        # check all samples have 64 bytes
+        for item in data:
+            assert len(item) == 64
+        bulk_data = BulkData(data.tolist())
+        t_generate = time.perf_counter() - t_start
+        slog.info(f"Generate: {t_generate}s")
+        # serialize+deserialize
+        t_start = time.perf_counter()
+        recovered = benchmark_bulk_data(bulk_data)
+        t_all = time.perf_counter() - t_start
+        slog.info(f"serialize+deserialize: {t_all}s, i.e. {t_all / num_samples * 1000000:.6f}µs per item")
+        self.assertEqual(len(bulk_data.data), len(recovered.data))
+        # sample 100 random items from both lists and check they are equal
+        for _ in range(100):
+            i = random.randint(0, len(bulk_data.data) - 1)
+            assert np.array_equal(bulk_data.data[i], recovered.data[i])
 
-    t_start = time.perf_counter()
-    recovered = benchmark_bulk_data(bulk_data)
-    t_all = time.perf_counter() - t_start
-    slog.info(f"serialize+deserialize: {t_all}s, i.e. {t_all / num_samples * 1000000:.6f}µs per item")
 
-    assert np.array_equal(bulk_data.data, recovered.data)
+if __name__ == '__main__':
+    unittest.main()
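
One design note on these tests: the new validation in deserialize relies on assert statements, which Python removes when run with -O, so the invalid-input tests only hold in non-optimized runs. For callers that want a catchable, deliberate error type, a hedged sketch of a wrapper (safe_deserialize is a hypothetical helper, not part of this commit):

from bulk_data import BulkData

def safe_deserialize(serialized: bytes) -> BulkData:
    # Hypothetical helper: convert the AssertionError raised by
    # BulkData.deserialize into a ValueError that callers can catch.
    if not BulkData.is_serialized_bulk_data(serialized):
        raise ValueError("not a serialized BulkData")
    try:
        return BulkData.deserialize(serialized)
    except AssertionError as exc:
        raise ValueError(f"corrupt BulkData payload: {exc}") from exc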
