diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index dda20fe8aeb21..f880310267bc0 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1154,6 +1154,8 @@ Datetimelike - Bug in constructing arrays with a timezone-aware :class:`ArrowDtype` from timezone-naive datetime objects incorrectly treating those as UTC times instead of wall times like :class:`DatetimeTZDtype` (:issue:`61775`) - Bug in retaining frequency in :meth:`value_counts` specifically for :meth:`DatetimeIndex` and :meth:`TimedeltaIndex` (:issue:`33830`) - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`) +- Removed the special casing for sequences of Python ``date`` objects in ``DatetimeIndex.get_indexer`` and related indexing logic. + A sequence containing Python ``date`` objects is no longer implicitly converted to timestamps, so ``date`` objects no longer match the entries of a ``DatetimeIndex``. 
(:issue:`62158`) Timedelta ^^^^^^^^^ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a07226ef0f50a..a44d83773be13 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -40,7 +40,6 @@ no_default, ) from pandas._libs.tslibs import ( - OutOfBoundsDatetime, Timestamp, tz_compare, ) @@ -6336,11 +6335,6 @@ def _maybe_downcast_for_indexing(self, other: Index) -> tuple[Index, Index]: # standardize on UTC return self.tz_convert("UTC"), other.tz_convert("UTC") - elif self.inferred_type == "date" and isinstance(other, ABCDatetimeIndex): - try: - return type(other)(self), other - except OutOfBoundsDatetime: - return self, other elif self.inferred_type == "timedelta" and isinstance(other, ABCTimedeltaIndex): # TODO: we dont have tests that get here return type(other)(self), other @@ -6445,6 +6439,35 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: return dtype.kind == "b" elif is_numeric_dtype(self.dtype): return is_numeric_dtype(dtype) + elif isinstance(dtype, ArrowDtype): + # GH#62158 + import pyarrow as pa + + pa_dtype = dtype.pyarrow_dtype + if dtype.kind != "M": + if self.dtype.kind == "m" and pa.types.is_duration(pa_dtype): + return True + if is_string_dtype(self.dtype) and ( + pa.types.is_string(pa_dtype) or pa.types.is_large_string(pa_dtype) + ): + return True + if self.dtype.kind == "b" and pa.types.is_boolean(pa_dtype): + return True + if is_numeric_dtype(self.dtype) and ( + pa.types.is_integer(pa_dtype) or pa.types.is_floating(pa_dtype) + ): + return True + if is_object_dtype(self.dtype): + return True + return False + if self.dtype.kind != "M": + return False + if pa.types.is_date(pa_dtype): + return False + if pa.types.is_timestamp(pa_dtype): + if (pa_dtype.tz is None) ^ (getattr(self, "tz", None) is None): + return False + return True # TODO: this was written assuming we only get here with object-dtype, # which is no longer correct. Can we specialize for EA? 
return True diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index fd061666c1f00..8afaf10fd9054 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -8,6 +8,10 @@ ABC, abstractmethod, ) +from datetime import ( + date, + datetime, +) from typing import ( TYPE_CHECKING, Any, @@ -69,7 +73,6 @@ if TYPE_CHECKING: from collections.abc import Sequence - from datetime import datetime from pandas._typing import ( Axis, @@ -525,6 +528,20 @@ def _maybe_cast_listlike_indexer(self, keyarr): """ Analogue to maybe_cast_indexer for get_indexer instead of get_loc. """ + # GH#62158: For DatetimeIndex, prevent matching of pure Python `date` objects + # not `datetime`. + if isinstance(self._data, DatetimeArray): + arr = ( + keyarr._values + if isinstance(keyarr, Index) + else np.asarray(keyarr, dtype=object) + ) + if any(isinstance(x, date) and not isinstance(x, datetime) for x in arr): + return ( + keyarr + if isinstance(keyarr, Index) + else Index._simple_new(arr, name=None) + ) try: res = self._data._validate_listlike(keyarr, allow_object=True) except (ValueError, TypeError): diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index d11c2ef8a3a3c..baf6be2c32a78 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1884,6 +1884,30 @@ def test_add_new_column_infer_string(): tm.assert_frame_equal(df, expected) +def test_datetime_indexer_consistency_pyarrow_date32(): + # GH#62158 + pytest.importorskip("pyarrow", minversion="13.0.0") + import pyarrow as pa + + ser = Series(["2016-01-01"], dtype="date32[pyarrow]") + ser3 = ser.astype("datetime64[ns]") + dti = Index(ser3) + + # Make sure we don't treat Arrow date as timestamp + dtype = ser.dtype.pyarrow_dtype + assert not (pa.types.is_timestamp(dtype) and not pa.types.is_date(dtype)) + + with pytest.raises(KeyError): + dti.get_loc(ser[0]) + + # 
get_indexer returns -1 for both Arrow array and object-cast + result = dti.get_indexer(ser.values) + tm.assert_numpy_array_equal(result, np.array([-1], dtype=np.intp)) + + result_obj = dti.get_indexer(ser.values.astype(object)) + tm.assert_numpy_array_equal(result_obj, np.array([-1], dtype=np.intp)) + + class TestSetitemValidation: # This is adapted from pandas/tests/arrays/masked/test_indexing.py def _check_setitem_invalid(self, df, invalid, indexer): diff --git a/pandas/tests/frame/methods/test_asfreq.py b/pandas/tests/frame/methods/test_asfreq.py index 0c91dbb01acaa..8cf9464151dfb 100644 --- a/pandas/tests/frame/methods/test_asfreq.py +++ b/pandas/tests/frame/methods/test_asfreq.py @@ -192,9 +192,10 @@ def test_asfreq_with_date_object_index(self, frame_or_series): ts2 = ts.copy() ts2.index = [x.date() for x in ts2.index] - result = ts2.asfreq("4h", method="ffill") - expected = ts.asfreq("4h", method="ffill") - tm.assert_equal(result, expected) + with pytest.raises( + TypeError, match="Cannot compare Timestamp with datetime.date" + ): + ts2.asfreq("4h", method="ffill") def test_asfreq_with_unsorted_index(self, frame_or_series): # GH#39805 diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index 2f7eaa2c209cd..da8f23fc34b84 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -547,10 +547,11 @@ def test_get_indexer_pyarrow(self, as_td): tm.assert_numpy_array_equal(result2, expected) def test_get_indexer_date_objs(self): + # Behavior for get_indexer with date objects changed in GH#62158. 
rng = date_range("1/1/2000", periods=20) result = rng.get_indexer(rng.map(lambda x: x.date())) - expected = rng.get_indexer(rng) + expected = np.full(len(rng), -1, dtype=np.intp) tm.assert_numpy_array_equal(result, expected) def test_get_indexer(self): @@ -595,17 +596,22 @@ def test_get_indexer(self): idx.get_indexer(idx[[0]], method="nearest", tolerance="foo") @pytest.mark.parametrize( - "target", + "target, expected", [ - [date(2020, 1, 1), Timestamp("2020-01-02")], - [Timestamp("2020-01-01"), date(2020, 1, 2)], + ( + [date(2020, 1, 1), Timestamp("2020-01-02")], + np.array([-1, 1], dtype=np.intp), + ), + ( + [Timestamp("2020-01-01"), Timestamp(date(2020, 1, 2))], + np.array([0, 1], dtype=np.intp), + ), ], ) - def test_get_indexer_mixed_dtypes(self, target): + def test_get_indexer_mixed_dtypes(self, target, expected): # https://github.com/pandas-dev/pandas/issues/33741 values = DatetimeIndex([Timestamp("2020-01-01"), Timestamp("2020-01-02")]) result = values.get_indexer(target) - expected = np.array([0, 1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index a77e55612e23d..b32dde226e7a5 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -865,6 +865,8 @@ def test_datetime_understood(self, unit): tm.assert_series_equal(result, expected) def test_align_date_objects_with_datetimeindex(self): + # GH#62158: v3.0.0 - DatetimeIndex no longer matches Python date labels. + # The result is always all-NaN and the union index. 
rng = date_range("1/1/2000", periods=20) ts = Series(np.random.default_rng(2).standard_normal(20), index=rng) @@ -874,10 +876,20 @@ def test_align_date_objects_with_datetimeindex(self): result = ts + ts2 result2 = ts2 + ts - expected = ts + ts[5:] - expected.index = expected.index._with_freq(None) - tm.assert_series_equal(result, expected) - tm.assert_series_equal(result2, expected) + + date_labels = [x.date() for x in rng[5:]] + expected_index_result = Index(list(rng) + date_labels, dtype=object) + expected_index_result2 = Index(date_labels + list(rng), dtype=object) + + # Length and index checks + assert len(result) == 35 + tm.assert_index_equal(result.index, expected_index_result) + tm.assert_index_equal(result2.index, expected_index_result2) + assert result.index.dtype == object + + # All NaN because there are no matching labels now + assert result.isna().all() + assert result2.isna().all() class TestNamePreservation: