Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
CLAUDE.local.md
pandera/_version.py
uv.lock
*.db
Expand Down
1 change: 1 addition & 0 deletions pandera/api/narwhals/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Narwhals API module for pandera."""
130 changes: 130 additions & 0 deletions pandera/api/narwhals/components.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
"""Schema components for narwhals."""

from __future__ import annotations

import logging
from typing import Any, Optional, Type

import narwhals as nw

from pandera.api.base.types import CheckList
from pandera.api.dataframe.components import ComponentSchema
from pandera.api.narwhals.types import (
NarwhalsCheckObjects,
NarwhalsDtypeInputTypes,
)
from pandera.backends.narwhals.register import register_narwhals_backends
from pandera.config import config_context, get_config_context
from pandera.engines import narwhals_engine
from pandera.utils import is_regex

logger = logging.getLogger(__name__)


class Column(ComponentSchema[NarwhalsCheckObjects]):
    """Schema component describing a single column of a narwhals frame.

    The validation hooks defined on this class are currently placeholders:
    they accept their arguments and hand the input back without inspecting it.
    """

    def __init__(
        self,
        dtype: Optional[NarwhalsDtypeInputTypes] = None,
        checks: Optional[CheckList] = None,
        nullable: bool = False,
        unique: bool = False,
        coerce: bool = False,
        required: bool = True,
        name: Optional[str] = None,
        regex: bool = False,
        title: Optional[str] = None,
        description: Optional[str] = None,
        default: Optional[Any] = None,
        metadata: Optional[dict] = None,
        drop_invalid_rows: bool = False,
        **column_kwargs,
    ) -> None:
        """Initialise the column component.

        :param dtype: expected datatype of the column, used for type-checking
            the dataframe. Narwhals datatypes, built-in python types that
            narwhals supports, and pandera narwhals engine datatypes are
            accepted.
        :param checks: checks that verify the validity of the column.
        :param nullable: whether the column may contain null values.
        :param unique: whether the column values must be unique.
        :param coerce: if True, the column is coerced into ``dtype`` when
            ``schema.validate`` is called.
        :param required: whether the column must be present.
        :param name: name of the dataframe column to validate.
        :param regex: whether ``name`` is a regex pattern that should be
            applied to multiple columns of a dataframe.
        :param title: human-readable label for the column.
        :param description: free-form textual description of the column.
        :param default: default value for missing values in the column.
        :param metadata: optional key-value data.
        :param drop_invalid_rows: if True, drop invalid rows on validation.
        :param column_kwargs: additional keyword arguments for the column
            component; stored as-is on ``self.column_kwargs``.
        """
        # Options forwarded to the shared ComponentSchema initialiser.
        component_options = {
            "dtype": dtype,
            "checks": checks,
            "nullable": nullable,
            "unique": unique,
            "coerce": coerce,
            "name": name,
            "title": title,
            "description": description,
            "default": default,
            "metadata": metadata,
            "drop_invalid_rows": drop_invalid_rows,
        }
        super().__init__(**component_options)

        # Column-only settings are not passed to the base initialiser; they
        # are kept directly on the instance.
        self.required = required
        self.regex = regex
        self.column_kwargs = column_kwargs

    # NOTE(review): ``lazy`` defaults to True here, whereas the narwhals
    # DataFrameSchema.validate in pandera/api/narwhals/container.py defaults
    # it to False - confirm the difference is intended.
    def validate(
        self,
        check_obj: NarwhalsCheckObjects,
        head: Optional[int] = None,
        tail: Optional[int] = None,
        sample: Optional[int] = None,
        random_state: Optional[int] = None,
        lazy: bool = True,
        inplace: bool = False,
    ) -> NarwhalsCheckObjects:
        """Validate ``check_obj`` against this column schema.

        Placeholder: every argument other than ``check_obj`` is accepted but
        not used yet, and ``check_obj`` is returned unchanged.

        :param check_obj: narwhals DataFrame or LazyFrame to validate.
        :param head: validate the first n rows. Rows overlapping with `tail`
            or `sample` are de-duplicated.
        :param tail: validate the last n rows. Rows overlapping with `head`
            or `sample` are de-duplicated.
        :param sample: validate a random sample of n rows. Rows overlapping
            with `head` or `tail` are de-duplicated.
        :param random_state: random seed for the ``sample`` argument.
        :param lazy: if True, lazily evaluates dataframe against all
            validation checks and raises a ``SchemaErrors``. Otherwise, raise
            ``SchemaError`` as soon as one occurs.
        :param inplace: if True, applies coercion to the object of
            validation, otherwise creates a copy of the data.
        :returns: validated DataFrame or LazyFrame.
        """
        # No validation logic yet: the input is handed straight back.
        return check_obj

    def _coerce_dtype(self, obj: NarwhalsCheckObjects) -> NarwhalsCheckObjects:
        """Dtype-coercion hook; placeholder that returns ``obj`` unchanged."""
        return obj

    def _check_dtype(self, obj: NarwhalsCheckObjects) -> None:
        """Dtype-check hook; placeholder that performs no check."""
        return None

    def _check_nullable(self, obj: NarwhalsCheckObjects) -> None:
        """Nullable-constraint hook; placeholder that performs no check."""
        return None

    def _check_unique(self, obj: NarwhalsCheckObjects) -> None:
        """Unique-constraint hook; placeholder that performs no check."""
        return None
90 changes: 90 additions & 0 deletions pandera/api/narwhals/container.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
"""DataFrame Schema for Narwhals."""

import warnings
from typing import Optional, Type

from pandera.api.dataframe.container import DataFrameSchema as _DataFrameSchema
from pandera.api.narwhals.types import NarwhalsCheckObjects, NarwhalsFrame
from pandera.api.narwhals.utils import get_validation_depth
from pandera.backends.narwhals.register import register_narwhals_backends
from pandera.config import config_context, get_config_context
from pandera.engines import narwhals_engine


class DataFrameSchema(_DataFrameSchema[NarwhalsCheckObjects]):
    """Schema for validating a narwhals DataFrame or LazyFrame.

    ``validate`` and the private hooks below are currently placeholders that
    return their input without inspecting it.
    """

    def _validate_attributes(self):
        """Run the inherited attribute checks, then warn about no-op options.

        One warning is emitted when ``unique_column_names`` is truthy, and
        another when ``report_duplicates`` is anything other than ``"all"``.
        """
        super()._validate_attributes()

        unique_names_message = (
            "unique_column_names=True will have no effect on validation "
            "since narwhals DataFrames do not support duplicate column "
            "names."
        )
        report_duplicates_message = (
            "Setting report_duplicates to 'exclude_first' or "
            "'exclude_last' will have no effect on validation. With the "
            "narwhals backend, all duplicate values will be reported."
        )

        if self.unique_column_names:
            warnings.warn(unique_names_message)

        if self.report_duplicates != "all":
            warnings.warn(report_duplicates_message)

    @staticmethod
    def register_default_backends(
        check_obj_cls: Type,
    ):  # pylint: disable=unused-argument
        """Register the narwhals backends; ``check_obj_cls`` is not used."""
        register_narwhals_backends()

    def validate(
        self,
        check_obj: NarwhalsFrame,
        head: Optional[int] = None,
        tail: Optional[int] = None,
        sample: Optional[int] = None,
        random_state: Optional[int] = None,
        lazy: bool = False,
        inplace: bool = False,
    ) -> NarwhalsFrame:
        """Validate a narwhals frame against this schema.

        Placeholder: every argument other than ``check_obj`` is accepted but
        not used yet, and ``check_obj`` is returned unchanged.

        :param check_obj: narwhals DataFrame or LazyFrame to validate.
        :param head: validate the first n rows. Rows overlapping with `tail`
            or `sample` are de-duplicated.
        :param tail: validate the last n rows. Rows overlapping with `head`
            or `sample` are de-duplicated.
        :param sample: validate a random sample of n rows. Rows overlapping
            with `head` or `tail` are de-duplicated.
        :param random_state: random seed for the ``sample`` argument.
        :param lazy: if True, lazily evaluates dataframe against all
            validation checks and raises a ``SchemaErrors``. Otherwise, raise
            ``SchemaError`` as soon as one occurs.
        :param inplace: if True, applies coercion to the object of
            validation, otherwise creates a copy of the data.
        :returns: validated DataFrame or LazyFrame.
        """
        # No validation logic yet: the input is handed straight back.
        return check_obj

    def _subsample(
        self,
        check_obj: NarwhalsFrame,
        head: Optional[int] = None,
        tail: Optional[int] = None,
        sample: Optional[int] = None,
        random_state: Optional[int] = None,
    ) -> NarwhalsFrame:
        """Subsampling hook; placeholder that returns ``check_obj`` unchanged.

        ``head``, ``tail``, ``sample`` and ``random_state`` are accepted but
        not used yet.
        """
        return check_obj

    def _coerce_dtype(self, obj: NarwhalsFrame) -> NarwhalsFrame:
        """Dtype-coercion hook; placeholder that returns ``obj`` unchanged."""
        return obj

    def _check_dtype(self, obj: NarwhalsFrame) -> None:
        """Dtype-check hook; placeholder that performs no check."""
        return None
154 changes: 154 additions & 0 deletions pandera/api/narwhals/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
"""Class-based API for Narwhals models."""

import copy
import inspect
from typing import (
Dict,
List,
Optional,
Tuple,
Type,
Union,
cast,
overload,
Any,
)

import narwhals as nw
from typing_extensions import Self

from pandera.api.base.schema import BaseSchema
from pandera.api.checks import Check
from pandera.api.dataframe.model import DataFrameModel as _DataFrameModel
from pandera.api.dataframe.model import get_dtype_kwargs
from pandera.api.dataframe.model_components import FieldInfo
from pandera.api.narwhals.components import Column
from pandera.api.narwhals.container import DataFrameSchema
from pandera.api.dataframe.model_config import BaseConfig
from pandera.api.narwhals.types import NarwhalsFrame
from pandera.engines import narwhals_engine as ne
from pandera.errors import SchemaInitError
from pandera.typing import AnnotationInfo
from pandera.typing.narwhals import DataFrame, LazyFrame, Series
from pandera.utils import docstring_substitution


class DataFrameModel(_DataFrameModel[nw.DataFrame[Any], DataFrameSchema]):
    """Model of a Narwhals :class:`~pandera.api.narwhals.container.DataFrameSchema`.

    Several methods on this class are placeholders; each one says so in its
    own docstring.

    See the :ref:`User Guide <dataframe-models>` for more.
    """

    # NOTE(review): this is the BaseConfig imported from
    # pandera.api.dataframe.model_config; a narwhals-specific BaseConfig is
    # also declared in pandera/api/narwhals/model_config.py - confirm which
    # one is intended here.
    Config: Type[BaseConfig] = BaseConfig  # type: ignore

    @classmethod
    def build_schema_(cls, **kwargs) -> DataFrameSchema:
        """Build a ``DataFrameSchema`` from the model's fields and checks.

        :param kwargs: extra keyword arguments forwarded to
            ``DataFrameSchema``.
        :returns: schema whose columns come from ``_build_columns`` and whose
            checks are ``cls.__root_checks__``.
        """
        model_columns = cls._build_columns(cls.__fields__, cls.__checks__)
        return DataFrameSchema(
            model_columns,
            checks=cls.__root_checks__,
            **kwargs,
        )

    @classmethod
    def _build_columns(  # pylint:disable=too-many-locals
        cls,
        fields: Dict[str, Tuple[AnnotationInfo, FieldInfo]],
        checks: Dict[str, List[Check]],
    ) -> Dict[str, Column]:
        """Create one ``Column`` per model field.

        :param fields: mapping of field name to its
            ``(annotation, field info)`` pair.
        :param checks: mapping of field name to the checks registered for
            that field; a field without an entry gets an empty check list.
        :returns: mapping of field name to the ``Column`` built for it.
        """
        built: Dict[str, Column] = {}
        for column_name, (annotation_info, field_info) in fields.items():
            column_checks = checks.get(column_name, [])

            # Placeholder dtype handling.
            # NOTE(review): the dtype is read from the "dtype" key of
            # get_dtype_kwargs(); whether that helper ever returns such a key
            # is not visible here - verify, otherwise dtype is always None.
            annotation_kwargs = get_dtype_kwargs(annotation_info)

            built[column_name] = Column(
                name=column_name,
                dtype=annotation_kwargs.get("dtype"),
                checks=column_checks,
                nullable=field_info.nullable,
                unique=field_info.unique,
                coerce=field_info.coerce,
                # These two attributes are read defensively, with fallbacks.
                required=getattr(field_info, "required", True),
                regex=field_info.regex,
                title=field_info.title,
                description=field_info.description,
                default=field_info.default,
                metadata=field_info.metadata,
                drop_invalid_rows=getattr(
                    field_info, "drop_invalid_rows", False
                ),
            )

        return built

    @classmethod
    def validate(
        cls,
        check_obj: nw.DataFrame[Any],
        head: Optional[int] = None,
        tail: Optional[int] = None,
        sample: Optional[int] = None,
        random_state: Optional[int] = None,
        lazy: bool = False,
        inplace: bool = False,
    ) -> nw.DataFrame[Any]:
        """Validate a narwhals DataFrame against the model's schema.

        Placeholder: every argument other than ``check_obj`` is accepted but
        not used yet, and ``check_obj`` is returned unchanged.

        :param check_obj: narwhals DataFrame to validate.
        :param head: validate the first n rows.
        :param tail: validate the last n rows.
        :param sample: validate a random sample of n rows.
        :param random_state: random seed for the ``sample`` argument.
        :param lazy: if True, lazily evaluates dataframe against all
            validation checks and raises a ``SchemaErrors``. Otherwise, raise
            ``SchemaError`` as soon as one occurs.
        :param inplace: if True, applies coercion to the object of
            validation, otherwise creates a copy of the data.
        :returns: validated DataFrame.
        """
        # No validation logic yet: the input is handed straight back.
        return check_obj

    @classmethod
    def empty(cls, n_rows: int = 0) -> nw.DataFrame[Any]:
        """Create an empty DataFrame conforming to the schema.

        Not implemented yet.

        :param n_rows: number of rows to create.
        :returns: empty DataFrame.
        :raises NotImplementedError: always.
        """
        message = "Empty DataFrame creation not yet implemented"
        raise NotImplementedError(message)

    @classmethod
    def example(cls, **kwargs: Any) -> nw.DataFrame[Any]:
        """Create an example DataFrame conforming to the schema.

        Not implemented yet.

        :param kwargs: additional keyword arguments.
        :returns: example DataFrame.
        :raises NotImplementedError: always.
        """
        message = "Example DataFrame creation not yet implemented"
        raise NotImplementedError(message)

    @classmethod
    def to_json_schema(cls) -> Dict[str, Any]:
        """Create a JSON schema representation of the model.

        Placeholder: always returns an object schema with no properties.

        :returns: JSON schema dict.
        """
        placeholder_schema: Dict[str, Any] = {
            "type": "object",
            "properties": {},
        }
        return placeholder_schema

    @classmethod
    def from_json_schema(cls, json_schema: Dict[str, Any]) -> Type[Self]:
        """Create a DataFrameModel from a JSON schema.

        Not implemented yet.

        :param json_schema: JSON schema dict.
        :returns: DataFrameModel class.
        :raises NotImplementedError: always.
        """
        message = "JSON schema loading not yet implemented"
        raise NotImplementedError(message)
12 changes: 12 additions & 0 deletions pandera/api/narwhals/model_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""Configuration for Narwhals DataFrameModel."""

from typing import Any, Dict, Optional

from pandera.api.base.model_config import BaseModelConfig


class BaseConfig(BaseModelConfig):
    """Base configuration for the Narwhals DataFrameModel.

    Declares no options of its own yet; narwhals-specific configuration
    options belong here.
    """
Loading
Loading