Skip to content

Handling columns with multiple timezones #1309

@ludaavics

Description

@ludaavics

Validating a column of time-zone-aware datetimes fails when the column contains multiple time zones.

  • I have checked that this issue has not already been reported.
  • I have confirmed this bug exists on the latest version of pandera.

Note: Please read this guide detailing how to provide the necessary information for us to reproduce your bug.

Code Sample, a copy-pastable example

# Minimal reproduction: validate (with coercion enabled) a datetime column
# whose values are localized to two different time zones.
import pandera as pa
from pandera.typing import Series
import pandas as pd


class Model(pa.SchemaModel):
    # Target dtype for the column: datetime64[ns, America/Chicago].
    timestamp: Series[pd.DatetimeTZDtype] = pa.Field(
        dtype_kwargs={"unit": "ns", "tz": "America/Chicago"}
    )

    class Config:
        # Coercion is switched on, so validation attempts to convert the
        # column to the target dtype; that coercion step is what fails.
        coerce = True
        strict = False


# Two rows with the same wall-clock time, one localized to America/Chicago
# and the other to America/New_York.
df = pd.DataFrame(
    [
        [pd.to_datetime("2023-03-01 13:00:00").tz_localize("America/Chicago")],
        [pd.to_datetime("2023-03-01 13:00:00").tz_localize("America/New_York")],
    ],
    columns=["timestamp"],
)
# Raises SchemaError ("Error while coercing 'timestamp' to type
# datetime64[ns, America/Chicago]") with an empty failure-case table.
Model.validate(df)


SchemaErrors                              Traceback (most recent call last)
File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/backends/pandas/container.py:81, in DataFrameSchemaBackend.validate(self, check_obj, schema, head, tail, sample, random_state, lazy, inplace)
     80 try:
---> 81     check_obj = parser(check_obj, *args)
     82 except SchemaError as exc:

File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/backends/pandas/container.py:532, in DataFrameSchemaBackend.coerce_dtype(self, check_obj, schema)
    529 if error_handler.collected_errors:
    530     # raise SchemaErrors if this method is called without an
    531     # error_handler
--> 532     raise SchemaErrors(
    533         schema=schema,
    534         schema_errors=error_handler.collected_errors,
    535         data=check_obj,
    536     )
    538 return check_obj

SchemaErrors: Schema Model: A total of 1 schema errors were found.

Error Counts
------------
- SchemaErrorReason.SCHEMA_COMPONENT_CHECK: 1

Schema Error Summary
--------------------
Empty DataFrame
Columns: [failure_cases, n_failure_cases]
Index: []

Usage Tip
---------

Directly inspect all errors by catching the exception:

``
try:
    schema.validate(dataframe, lazy=True)
except SchemaErrors as err:
    err.failure_cases  # dataframe of schema errors
    err.data  # invalid dataframe
``


The above exception was the direct cause of the following exception:

SchemaError                               Traceback (most recent call last)
Cell In[1], line 23
     13         strict = False
     16 df = pd.DataFrame(
     17     [
     18         [pd.to_datetime("2023-03-01 13:00:00").tz_localize("America/Chicago")],
   (...)
     21     columns=["timestamp"],
     22 )
---> 23 Model.validate(df)

File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/api/pandas/model.py:306, in DataFrameModel.validate(cls, check_obj, head, tail, sample, random_state, lazy, inplace)
    291 @classmethod
    292 @docstring_substitution(validate_doc=DataFrameSchema.validate.__doc__)
    293 def validate(
   (...)
    301     inplace: bool = False,
    302 ) -> DataFrameBase[TDataFrameModel]:
    303     """%(validate_doc)s"""
    304     return cast(
    305         DataFrameBase[TDataFrameModel],
--> 306         cls.to_schema().validate(
    307             check_obj, head, tail, sample, random_state, lazy, inplace
    308         ),
    309     )

File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/api/pandas/container.py:366, in DataFrameSchema.validate(self, check_obj, head, tail, sample, random_state, lazy, inplace)
    354     check_obj = check_obj.map_partitions(  # type: ignore [operator]
    355         self._validate,
    356         head=head,
   (...)
    362         meta=check_obj,
    363     )
    364     return check_obj.pandera.add_schema(self)
--> 366 return self._validate(
    367     check_obj=check_obj,
    368     head=head,
    369     tail=tail,
    370     sample=sample,
    371     random_state=random_state,
    372     lazy=lazy,
    373     inplace=inplace,
    374 )

File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/api/pandas/container.py:395, in DataFrameSchema._validate(self, check_obj, head, tail, sample, random_state, lazy, inplace)
    386 if self._is_inferred:
    387     warnings.warn(
    388         f"This {type(self)} is an inferred schema that hasn't been "
    389         "modified. It's recommended that you refine the schema "
   (...)
    392         UserWarning,
    393     )
--> 395 return self.get_backend(check_obj).validate(
    396     check_obj,
    397     schema=self,
    398     head=head,
    399     tail=tail,
    400     sample=sample,
    401     random_state=random_state,
    402     lazy=lazy,
    403     inplace=inplace,
    404 )

File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/backends/pandas/container.py:85, in DataFrameSchemaBackend.validate(self, check_obj, schema, head, tail, sample, random_state, lazy, inplace)
     83         error_handler.collect_error(exc.reason_code, exc)
     84     except SchemaErrors as exc:
---> 85         error_handler.collect_errors(exc)
     87 # We may have modified columns, for example by
     88 # add_missing_columns, so regenerate column info
     89 column_info = self.collect_column_info(check_obj, schema)

File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/error_handlers.py:63, in SchemaErrorHandler.collect_errors(self, schema_errors, original_exc)
     56 """Collect schema errors from a SchemaErrors exception.
     57 
     58 :param reason_code: string representing reason for error.
     59 :param schema_error: ``SchemaError`` object.
     60 :param original_exc: original exception associated with the SchemaError.
     61 """
     62 for schema_error in schema_errors.schema_errors:
---> 63     self.collect_error(
     64         schema_error.reason_code,
     65         schema_error,
     66         original_exc or schema_errors,
     67     )

File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/error_handlers.py:38, in SchemaErrorHandler.collect_error(self, reason_code, schema_error, original_exc)
     31 """Collect schema error, raising exception if lazy is False.
     32 
     33 :param reason_code: string representing reason for error.
     34 :param schema_error: ``SchemaError`` object.
     35 :param original_exc: original exception associated with the SchemaError.
     36 """
     37 if not self._lazy:
---> 38     raise schema_error from original_exc
     40 # delete data of validated object from SchemaError object to prevent
     41 # storing copies of the validated DataFrame/Series for every
     42 # SchemaError collected.
     43 del schema_error.data

File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/backends/pandas/container.py:576, in DataFrameSchemaBackend._coerce_dtype_helper.<locals>._try_coercion(coerce_fn, obj)
    574 def _try_coercion(coerce_fn, obj):
    575     try:
--> 576         return coerce_fn(obj)
    577     except SchemaError as exc:
    578         error_handler.collect_error(
    579             SchemaErrorReason.DATATYPE_COERCION,
    580             exc,
    581         )

File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/api/pandas/array.py:146, in ArraySchema.coerce_dtype(self, check_obj)
    136 def coerce_dtype(
    137     self,
    138     check_obj: Union[pd.Series, pd.Index],
    139 ) -> Union[pd.Series, pd.Index]:
    140     """Coerce type of a pd.Series by type specified in dtype.
    141 
    142     :param pd.Series series: One-dimensional ndarray with axis labels
    143         (including time series).
    144     :returns: ``Series`` with coerced data type
    145     """
--> 146     return self.get_backend(check_obj).coerce_dtype(check_obj, schema=self)

File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/backends/pandas/components.py:194, in ColumnBackend.coerce_dtype(self, check_obj, schema)
    190 # pylint: disable=super-with-arguments
    191 # pylint: disable=fixme
    192 # TODO: use singledispatchmethod here
    193 if is_field(check_obj) or is_index(check_obj):
--> 194     return super(ColumnBackend, self).coerce_dtype(
    195         check_obj,
    196         schema=schema,
    197     )
    198 return check_obj.apply(
    199     lambda x: super(ColumnBackend, self).coerce_dtype(
    200         x,
   (...)
    203     axis="columns",
    204 )

File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/backends/pandas/array.py:177, in ArraySchemaBackend.coerce_dtype(self, check_obj, schema)
    175     return schema.dtype.try_coerce(check_obj)
    176 except ParserError as exc:
--> 177     raise SchemaError(
    178         schema=schema,
    179         data=check_obj,
    180         message=(
    181             f"Error while coercing '{schema.name}' to type "
    182             f"{schema.dtype}: {exc}:\n{exc.failure_cases}"
    183         ),
    184         failure_cases=exc.failure_cases,
    185         check=f"coerce_dtype('{schema.dtype}')",
    186     ) from exc

SchemaError: Error while coercing 'timestamp' to type datetime64[ns, America/Chicago]: Could not coerce <class 'pandas.core.series.Series'> data_container into type datetime64[ns, America/Chicago]:
Empty DataFrame
Columns: [index, failure_case]
Index: []

Expected behavior

All timestamps are converted to the target time zone.

Desktop (please complete the following information):

  • OS: ubuntu
  • Browser: Chrome
  • Version: 0.16.1

Screenshots

If applicable, add screenshots to help explain your problem.

Additional context

Add any other context about the problem here.

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions