[GH-2331] Geopandas: Document differences of sindex compared to gpd + sindex fixes (#2332)

petern48 · web-flow · commit 33c9a3d0cf75 · 2025-09-05T14:34:05.000-07:00
diff --git a/python/sedona/spark/geopandas/geoseries.py b/python/sedona/spark/geopandas/geoseries.py
@@ -751,9 +751,7 @@ def sindex(self) -> SpatialIndex:
         if geometry_column is None:
             raise ValueError("No geometry column found in GeoSeries")
         if self._sindex is None:
-            self._sindex = SpatialIndex(
-                self._internal.spark_frame, column_name=geometry_column
-            )
+            self._sindex = SpatialIndex(self)
         return self._sindex
 
     @property
diff --git a/python/sedona/spark/geopandas/sindex.py b/python/sedona/spark/geopandas/sindex.py
@@ -38,12 +38,23 @@ def __init__(self, geometry, index_type="strtree", column_name=None):
 
         Parameters
         ----------
-        geometry : np.array of Shapely geometries, PySparkDataFrame column, or PySparkDataFrame
+        geometry : np.array of Shapely geometries, GeoSeries, or PySparkDataFrame
         index_type : str, default "strtree"
             The type of spatial index to use.
         column_name : str, optional
             The column name to extract geometry from if `geometry` is a PySparkDataFrame.
+
+        Note: query methods (i.e., query, nearest, intersection) have different behaviors depending on how the index is constructed.
+        When constructed from a np.array, the query methods return indices like the original geopandas.
+        When constructed from a GeoSeries or PySparkDataFrame, the query methods return geometries.
         """
+        from sedona.spark.geopandas import GeoSeries
+
+        if isinstance(geometry, GeoSeries):
+            from sedona.spark.geopandas.geoseries import _get_series_col_name
+
+            column_name = _get_series_col_name(geometry)
+            geometry = geometry._internal.spark_frame
 
         if isinstance(geometry, np.ndarray):
             self.geometry = geometry
@@ -65,7 +76,7 @@ def __init__(self, geometry, index_type="strtree", column_name=None):
             self._build_spark_index(column_name)
         else:
             raise TypeError(
-                "Invalid type for `geometry`. Expected np.array or PySparkDataFrame."
+                "Invalid type for `geometry`. Expected np.array, GeoSeries, or PySparkDataFrame."
             )
 
     def query(self, geometry: BaseGeometry, predicate: str = None, sort: bool = False):
@@ -82,12 +93,18 @@ def query(self, geometry: BaseGeometry, predicate: str = None, sort: bool = Fals
         sort : bool, optional, default False
             Whether to sort the results.
 
+        Note: query() has different behaviors depending on how the index is constructed.
+        When constructed from a np.array, this method returns indices like original geopandas.
+        When constructed from a GeoSeries or PySparkDataFrame, this method returns geometries.
+
         Note: Unlike Geopandas, Sedona does not support geometry input of type np.array or GeoSeries.
+        It is recommended to instead use GeoSeries.intersects directly.
 
         Returns
         -------
         list
-            List of indices of matching geometries.
+            List of geometries if constructed from a GeoSeries or PySparkDataFrame.
+            List of the corresponding indices if constructed from a np.array.
         """
 
         if not isinstance(geometry, BaseGeometry):
@@ -96,7 +113,7 @@ def query(self, geometry: BaseGeometry, predicate: str = None, sort: bool = Fals
             )
 
         log_advice(
-            "`query` returns local list of indices of matching geometries onto driver's memory. "
+            "`query` returns a local list onto driver's memory. "
             "It should only be used if the resulting collection is expected to be small."
         )
 
@@ -170,10 +187,15 @@ def nearest(
 
         Note: Unlike Geopandas, Sedona does not support geometry input of type np.array or GeoSeries.
 
+        Note: nearest() has different behaviors depending on how the index is constructed.
+        When constructed from a np.array, this method returns indices like original geopandas.
+        When constructed from a GeoSeries or PySparkDataFrame, this method returns geometries.
+
         Returns
         -------
         list or tuple
-            List of indices of nearest geometries, optionally with distances.
+            List of geometries if constructed from a GeoSeries or PySparkDataFrame, or list of the
+            corresponding indices if constructed from a np.array; optionally with distances if `return_distance` is set.
         """
 
         if not isinstance(geometry, BaseGeometry):
@@ -194,15 +216,18 @@ def nearest(
             from sedona.spark.core.spatialOperator import KNNQuery
 
             # Execute the KNN query
-            results = KNNQuery.SpatialKnnQuery(self._indexed_rdd, geometry, k, False)
+            geo_data_list = KNNQuery.SpatialKnnQuery(
+                self._indexed_rdd, geometry, k, False
+            )
+
+            # No need to keep the userData field, so convert it directly to a list of geometries
+            geoms_list = [row.geom for row in geo_data_list]
 
             if return_distance:
                 # Calculate distances if requested
-                distances = [
-                    geom.distance(geometry) for geom in [row.geom for row in results]
-                ]
-                return results, distances
-            return results
+                distances = [geom.distance(geometry) for geom in geoms_list]
+                return geoms_list, distances
+            return geoms_list
         else:
             # For local spatial index based on Shapely STRtree
             if k > len(self.geometry):
@@ -220,20 +245,29 @@ def nearest(
 
     def intersection(self, bounds):
         """
-        Find geometries that intersect the given bounding box.
+        Find geometries that intersect the given bounding box. Similar to the Geopandas version,
+        this is a compatibility wrapper for rtree.index.Index.intersection; use query instead.
 
         Parameters
         ----------
         bounds : tuple
             Bounding box as (min_x, min_y, max_x, max_y).
 
+        Note: intersection() has different behaviors depending on how the index is constructed.
+        When constructed from a np.array, this method returns indices like original geopandas.
+        When constructed from a GeoSeries or PySparkDataFrame, this method returns geometries.
+
+        Note: Unlike Geopandas, Sedona does not support geometry input of type np.array or GeoSeries.
+        It is recommended to instead use GeoSeries.intersects directly.
+
         Returns
         -------
         list
-            List of indices of matching geometries.
+            List of geometries if constructed from a GeoSeries or PySparkDataFrame.
+            List of the corresponding indices if constructed from a np.array.
         """
         log_advice(
-            "`intersection` returns local list of indices of matching geometries onto driver's memory. "
+            "`intersection` returns local list of matching geometries onto driver's memory. "
             "It should only be used if the resulting collection is expected to be small."
         )
 
@@ -246,16 +280,7 @@ def intersection(self, bounds):
         bbox = box(*bounds)
 
         if self._is_spark:
-            # For Spark-based spatial index
-            from sedona.spark.core.spatialOperator import RangeQuery
-
-            # Execute the spatial range query with the bounding box
-            result_rdd = RangeQuery.SpatialRangeQuery(
-                self._indexed_rdd, bbox, True, True
-            )
-
-            results = result_rdd.collect()
-            return results
+            return self.query(bbox, predicate="intersects")
         else:
             # For local spatial index based on Shapely STRtree
             try:
diff --git a/python/tests/geopandas/test_sindex.py b/python/tests/geopandas/test_sindex.py
@@ -19,7 +19,7 @@
 import numpy as np
 import shapely
 from pyspark.sql.functions import expr
-from shapely.geometry import Point, Polygon, LineString
+from shapely.geometry import Point, Polygon, LineString, box
 
 from tests.test_base import TestBase
 from sedona.spark.geopandas import GeoSeries
@@ -63,6 +63,31 @@ def setup_method(self):
             ]
         )
 
+    def test_construct_from_geoseries(self):
+        # Construct from a GeoSeries
+        gs = GeoSeries([Point(x, x) for x in range(5)])
+        sindex = SpatialIndex(gs)
+        result = sindex.query(Point(2, 2))
+        # A SpatialIndex constructed from a GeoSeries returns geometries
+        assert result == [Point(2, 2)]
+
+    def test_construct_from_pyspark_dataframe(self):
+        # Construct from PySparkDataFrame
+        df = self.spark.createDataFrame(
+            [(Point(x, x),) for x in range(5)], ["geometry"]
+        )
+        sindex = SpatialIndex(df, column_name="geometry")
+        result = sindex.query(Point(2, 2))
+        assert result == [Point(2, 2)]
+
+    def test_construct_from_nparray(self):
+        # Construct from np.array
+        array = np.array([Point(x, x) for x in range(5)])
+        sindex = SpatialIndex(array)
+        result = sindex.query(Point(2, 2))
+        # Returns indices like original geopandas
+        assert result == np.array([2])
+
     def test_geoseries_sindex_property_exists(self):
         """Test that the sindex property exists on GeoSeries."""
         assert hasattr(self.points, "sindex")
@@ -182,7 +207,7 @@ def test_nearest_method(self):
         assert len(nearest_result) == 1
 
         # The nearest point should have id=2 (POINT(1 1))
-        assert nearest_result[0].geom.wkt == "POINT (1 1)"
+        assert nearest_result[0].wkt == "POINT (1 1)"
 
         # Test finding k=2 nearest neighbors
         nearest_2_results = spark_sindex.nearest(query_point, k=2)
@@ -219,7 +244,7 @@ def test_nearest_spark_with_various_geometries(self):
 
         # Should find polygon containing the point
         assert len(nearest_geom) == 1
-        assert "POLYGON" in nearest_geom[0].geom.wkt
+        assert "POLYGON" in nearest_geom[0].wkt
 
         # Test with linestring query
         query_line = LineString([(1.5, 1.5), (2.5, 2.5)])
@@ -343,7 +368,12 @@ def test_intersection_method(self):
         result_rows = spark_sindex.intersection(bounds)
 
         # Verify correct results are returned
-        assert len(result_rows) >= 2
+        expected = [
+            Polygon([(1, 1), (2, 1), (2, 2), (1, 2), (1, 1)]),
+            Polygon([(2, 2), (3, 2), (3, 3), (2, 3), (2, 2)]),
+            Polygon([(3, 3), (4, 3), (4, 4), (3, 4), (3, 3)]),
+        ]
+        assert result_rows == expected
 
         # Test with bounds that don't intersect any geometry
         empty_bounds = (10, 10, 11, 11)
@@ -353,7 +383,14 @@ def test_intersection_method(self):
         # Test with bounds that cover all geometries
         full_bounds = (-1, -1, 6, 6)
         full_results = spark_sindex.intersection(full_bounds)
-        assert len(full_results) == 5  # Should match all 5 polygons
+        expected = [
+            Polygon([(0, 0), (1, 0), (1, 1), (0, 1), (0, 0)]),
+            Polygon([(1, 1), (2, 1), (2, 2), (1, 2), (1, 1)]),
+            Polygon([(2, 2), (3, 2), (3, 3), (2, 3), (2, 2)]),
+            Polygon([(3, 3), (4, 3), (4, 4), (3, 4), (3, 3)]),
+            Polygon([(4, 4), (5, 4), (5, 5), (4, 5), (4, 4)]),
+        ]
+        assert full_results == expected
 
     def test_intersection_with_points(self):
         """Test the intersection method with point geometries."""
@@ -426,3 +463,11 @@ def test_intersection_with_mixed_geometries(self):
 
         # Verify results
         assert len(results) == 2
+
+    # test from the geopandas docstring
+    def test_geoseries_sindex_intersection(self):
+        gs = GeoSeries([Point(x, x) for x in range(10)])
+        result = gs.sindex.intersection(box(1, 1, 3, 3).bounds)
+        # Unlike original geopandas, this returns geometries instead of indices
+        expected = [Point(1, 1), Point(2, 2), Point(3, 3)]
+        assert result == expected