class TimeGeopandasQuery:
    """Benchmark spatial-index lookups of points against a grid of points.

    Each benchmark is parametrized over the number of indexed objects and the
    library providing the spatial index (``geopandas`` or ``dask_geopandas``).

    The ``dask_geopandas`` backend is an optional dependency; the accompanying
    packaging change lists it in the ``benchmark`` extra as
    ``dask_geopandas<=0.4.2``.
    """

    params = (
        [100, 1000, 10000],  # TODO: test for larger number of points
        ["geopandas", "dask_geopandas"],
    )
    param_names = ["num_objects", "lib"]
    # Number of random points looked up in a single timed call.
    query_size = 100
    # Target number of rows per dask partition.
    partition_size = 100  # TODO: expose npartitions as benchmark parameter
    # Fixed seed so that every run queries the same points and timings stay
    # comparable between runs.
    random_seed = 0

    def setup(self, num_objects, lib):
        """Build the query points and the indexed frame for one parameter set.

        Parameters
        ----------
        num_objects
            Number of grid points stored in the indexed data frame.
        lib
            Backend providing the spatial index: ``"geopandas"`` or
            ``"dask_geopandas"``.

        Raises
        ------
        ValueError
            If ``lib`` is not one of the supported backends.
        """
        # Fail early with a clear message instead of leaving ``self.nearest``
        # and ``self.query`` unset, which would only surface later as an
        # AttributeError inside the timed methods.
        if lib not in ("geopandas", "dask_geopandas"):
            raise ValueError(
                f"Unsupported lib {lib!r}; expected 'geopandas' or 'dask_geopandas'."
            )

        # The point / points to query
        self.query_points = self._create_random_points(self.query_size)
        # Geometry
        # TODO: Test clustered points (not grid), and polygons
        geometry = self._create_regular_grid(num_objects=num_objects)

        if lib == "geopandas":
            import geopandas as gpd
            from geopandas.sindex import SpatialIndex

            self.df = gpd.GeoDataFrame(geometry=geometry)
            sindex: SpatialIndex = self.df.sindex
            self.nearest = sindex.nearest
            self.query = sindex.query
        else:  # lib == "dask_geopandas"
            import dask_geopandas
            import geopandas as gpd

            gdf = gpd.GeoDataFrame(geometry=geometry)
            # At least one partition, otherwise one per ``partition_size`` rows.
            npartitions = max(1, len(gdf) // self.partition_size)
            self.df = dask_geopandas.from_geopandas(gdf, npartitions=npartitions)
            # TODO: Instead, save gdf to tempfile and read with dask_geopandas.read_parquet
            # to test larger-than-memory datasets.

            self.nearest = self.df.sindex.nearest
            self.query = self.df.sindex.query

    def _create_regular_grid(self, num_objects):
        """Return ``num_objects`` points laid out on a square grid in [0, 1] x [0, 1].

        The grid has ``ceil(sqrt(num_objects))`` points per side; surplus grid
        nodes beyond ``num_objects`` are dropped from the end.
        """
        import numpy as np
        from shapely.geometry import Point

        n_x = int(np.ceil(np.sqrt(num_objects)))
        coordinates_x = np.linspace(0.0, 1.0, n_x)
        # Stack the two mesh arrays and flatten them into (n_x * n_x, 2) pairs.
        coordinates = np.asarray(np.meshgrid(coordinates_x, coordinates_x)).T.reshape((-1, 2))
        return [Point(x, y) for y, x in coordinates[:num_objects]]

    def _create_random_points(self, num_points):
        """Return ``num_points`` points drawn uniformly from the unit square.

        The generator is seeded with ``random_seed``, so repeated calls yield
        the same points.
        """
        import numpy as np
        from shapely.geometry import Point

        rng = np.random.default_rng(self.random_seed)
        return [Point(x, y) for y, x in rng.random((num_points, 2))]

    def time_geopandas_nearest_point_point(self, num_objects, lib):
        """Time a nearest-neighbour lookup, with distances, for all query points."""
        self.nearest(self.query_points, return_distance=True)

    def time_geopandas_query_point_point(self, num_objects, lib):
        """Time a spatial-index query for all query points."""
        self.query(self.query_points)