Skip to content

[Tracker] Version Ceiling for Openblas dependency #1967

@tarang-jain

Description

@tarang-jain

PR #1965 acts as a workaround to fix CI failures. The root cause is a transitive dependency on OpenBLAS (pulled in through scikit-learn): we are seeing problems with OpenBLAS versions > 0.3.30. We shouldn't plan to keep this ceiling for too long, though, because ceilings like this will make it hard for all-of-RAPIDS environments like the devcontainers to solve as time passes and we upgrade versions of other things.

Here is the log of the failing test that was fixed by the version ceiling:

=================================== FAILURES ===================================
__________________________ test_cagra_vpq_compression __________________________

    def test_cagra_vpq_compression():
        dim = 64
        pq_len = 2
>       run_cagra_build_search_test(
            n_cols=dim, compression=cagra.CompressionParams(pq_dim=dim / pq_len)
        )

tests/test_cagra.py:240: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

n_rows = 10000, n_cols = 64, n_queries = 100, k = 10
dtype = <class 'numpy.float32'>, metric = 'sqeuclidean'
intermediate_graph_degree = 128, graph_degree = 64, build_algo = 'ivf_pq'
array_type = 'device', compare = True, inplace = True, test_extend = False
search_params = SearchParams(type=CAGRA, max_queries=0, itopk_size=64, max_iterations=0, algo=100, team_size=0, search_width=1, min_it...ze=0, hashmap_mode=100, hashmap_min_bitlen=0, hashmap_max_fill_rate=0.5, num_random_samplings=1, rand_xor_mask=1213332)
compression = <cuvs.neighbors.cagra.cagra.CompressionParams object at 0xe9586c9654b0>
serialize = False

    def run_cagra_build_search_test(
        n_rows=10000,
        n_cols=10,
        n_queries=100,
        k=10,
        dtype=np.float32,
        metric="sqeuclidean",
        intermediate_graph_degree=128,
        graph_degree=64,
        build_algo="ivf_pq",
        array_type="device",
        compare=True,
        inplace=True,
        test_extend=False,
        search_params={},
        compression=None,
        serialize=False,
    ):
        dataset = generate_data((n_rows, n_cols), dtype)
        if metric == "inner_product" or metric == "cosine":
            if dtype in [np.int8, np.uint8]:
                pytest.skip("skip normalization for int8/uint8 data")
            dataset = normalize(dataset, norm="l2", axis=1)
        dataset_device = device_ndarray(dataset)
    
        build_params = cagra.IndexParams(
            metric=metric,
            intermediate_graph_degree=intermediate_graph_degree,
            graph_degree=graph_degree,
            build_algo=build_algo,
            compression=compression,
        )
    
        if test_extend:
            dataset_1 = dataset[: n_rows // 2, :]
            dataset_2 = dataset[n_rows // 2 :, :]
            extend_params = cagra.ExtendParams()
            if array_type == "device":
                dataset_1_device = device_ndarray(dataset_1)
                dataset_2_device = device_ndarray(dataset_2)
    
                index = cagra.build(build_params, dataset_1_device)
                index = cagra.extend(extend_params, index, dataset_2_device)
            else:
                index = cagra.build(build_params, dataset_1)
                index = cagra.extend(index, dataset_2)
        else:
            if array_type == "device":
                index = cagra.build(build_params, dataset_device)
            else:
                index = cagra.build(build_params, dataset)
    
        if serialize:
            with tempfile.NamedTemporaryFile(suffix=".bin", delete=False) as f:
                temp_filename = f.name
            cagra.save(temp_filename, index)
            index = cagra.load(temp_filename)
    
        queries = generate_data((n_queries, n_cols), dtype)
        out_idx = np.zeros((n_queries, k), dtype=np.uint32)
        out_dist = np.zeros((n_queries, k), dtype=np.float32)
    
        queries_device = device_ndarray(queries)
        out_idx_device = device_ndarray(out_idx) if inplace else None
        out_dist_device = device_ndarray(out_dist) if inplace else None
    
        search_params = cagra.SearchParams(**search_params)
    
        ret_output = cagra.search(
            search_params,
            index,
            queries_device,
            k,
            neighbors=out_idx_device,
            distances=out_dist_device,
        )
    
        if not inplace:
            out_dist_device, out_idx_device = ret_output
    
        if not compare:
            return
    
        out_idx = out_idx_device.copy_to_host()
        out_dist = out_dist_device.copy_to_host()
    
        # Calculate reference values with sklearn
        skl_metric = {
            "sqeuclidean": "sqeuclidean",
            "inner_product": "cosine",
            "euclidean": "euclidean",
            "cosine": "cosine",
        }[metric]
        nn_skl = NearestNeighbors(
            n_neighbors=k, algorithm="brute", metric=skl_metric
        )
        nn_skl.fit(dataset)
        skl_idx = nn_skl.kneighbors(queries, return_distance=False)
    
        recall = calc_recall(out_idx, skl_idx)
>       assert recall > 0.7
E       assert 0.001 > 0.7

tests/test_cagra.py:122: AssertionError
-------- generated xml file: /__w/cuvs/cuvs/test-results/junit-cuvs.xml --------
================================ tests coverage ================================
_______________ coverage: platform linux, python 3.12.13-final-0 _______________

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug — Something isn't working

    Type

    No type

    Projects

    Status

    Todo

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions