unionai-oss · cosmicBboy · Jul 16, 2025 · Jul 16, 2025
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
+CLAUDE.local.md
 pandera/_version.py
 uv.lock
 *.db

diff --git a/pandera/api/narwhals/__init__.py b/pandera/api/narwhals/__init__.py
@@ -0,0 +1 @@
+"""Narwhals API module for pandera."""
diff --git a/pandera/api/narwhals/components.py b/pandera/api/narwhals/components.py
@@ -0,0 +1,130 @@
+"""Schema components for narwhals."""
+
+from __future__ import annotations
+
+import logging
+from typing import Any, Optional, Type
+
+import narwhals as nw
+
+from pandera.api.base.types import CheckList
+from pandera.api.dataframe.components import ComponentSchema
+from pandera.api.narwhals.types import (
+    NarwhalsCheckObjects,
+    NarwhalsDtypeInputTypes,
+)
+from pandera.backends.narwhals.register import register_narwhals_backends
+from pandera.config import config_context, get_config_context
+from pandera.engines import narwhals_engine
+from pandera.utils import is_regex
+
+logger = logging.getLogger(__name__)
+
+
+class Column(ComponentSchema[NarwhalsCheckObjects]):
+    """Schema component describing one column of a narwhals frame."""
+
+    def __init__(
+        self,
+        dtype: Optional[NarwhalsDtypeInputTypes] = None,
+        checks: Optional[CheckList] = None,
+        nullable: bool = False,
+        unique: bool = False,
+        coerce: bool = False,
+        required: bool = True,
+        name: Optional[str] = None,
+        regex: bool = False,
+        title: Optional[str] = None,
+        description: Optional[str] = None,
+        default: Optional[Any] = None,
+        metadata: Optional[dict] = None,
+        drop_invalid_rows: bool = False,
+        **column_kwargs,
+    ) -> None:
+        """Create column validator object.
+
+        :param dtype: datatype of the column, used for type-checking. Accepts
+            narwhals datatypes, built-in python types supported by narwhals,
+            and pandera narwhals engine datatypes.
+        :param checks: checks to verify validity of the column.
+        :param nullable: whether or not column can contain null values.
+        :param unique: whether column values should be unique.
+        :param coerce: if True, when schema.validate is called the column will
+            be coerced into the specified dtype.
+        :param required: whether or not column is required to be present.
+        :param name: column name in dataframe to validate.
+        :param regex: whether the ``name`` field should be treated as a regex
+            pattern to apply to multiple columns in a dataframe.
+        :param title: a human-readable label for the column.
+        :param description: an arbitrary textual description of the column.
+        :param default: the default value for missing values in the column.
+        :param metadata: optional key-value metadata for the column.
+        :param drop_invalid_rows: if True, drop invalid rows on validation.
+        :param column_kwargs: extra keyword arguments, stored on the instance.
+        """
+        super().__init__(
+            dtype=dtype,
+            checks=checks,
+            nullable=nullable,
+            unique=unique,
+            coerce=coerce,
+            name=name,
+            title=title,
+            description=description,
+            default=default,
+            metadata=metadata,
+            drop_invalid_rows=drop_invalid_rows,
+        )
+        self.required = required
+        self.regex = regex
+        self.column_kwargs = column_kwargs
+
+    def validate(
+        self,
+        check_obj: NarwhalsCheckObjects,
+        head: Optional[int] = None,
+        tail: Optional[int] = None,
+        sample: Optional[int] = None,
+        random_state: Optional[int] = None,
+        lazy: bool = True,  # NOTE(review): other validate() defaults are False
+        inplace: bool = False,
+    ) -> NarwhalsCheckObjects:
+        """Validate column schema (placeholder: no option is applied yet).
+
+        :param check_obj: narwhals DataFrame or LazyFrame to validate.
+        :param head: validate the first n rows. Rows overlapping with `tail` or
+            `sample` are de-duplicated.
+        :param tail: validate the last n rows. Rows overlapping with `head` or
+            `sample` are de-duplicated.
+        :param sample: validate a random sample of n rows. Rows overlapping
+            with `head` or `tail` are de-duplicated.
+        :param random_state: random seed for the ``sample`` argument.
+        :param lazy: if True, lazily evaluates dataframe against all validation
+            checks and raises a ``SchemaErrors``. Otherwise, raise
+            ``SchemaError`` as soon as one occurs.
+        :param inplace: if True, applies coercion to the object of validation,
+            otherwise creates a copy of the data.
+        :returns: ``check_obj``, unchanged until validation is implemented.
+        """
+        # Placeholder: nothing is validated; check_obj is returned as-is.
+        return check_obj
+
+    def _coerce_dtype(self, obj: NarwhalsCheckObjects) -> NarwhalsCheckObjects:
+        """Coerce dataframe to specified dtype (placeholder, no-op)."""
+        # Placeholder: ``obj`` is returned unchanged, no coercion is applied.
+        return obj
+
+    def _check_dtype(self, obj: NarwhalsCheckObjects) -> None:
+        """Check dataframe dtype (placeholder, nothing is checked yet)."""
+        # Placeholder: empty until the dtype check is implemented.
+        pass
+
+    def _check_nullable(self, obj: NarwhalsCheckObjects) -> None:
+        """Check nullable constraint (placeholder, nothing is checked yet)."""
+        # Placeholder: empty until the nullable check is implemented.
+        pass
+
+    def _check_unique(self, obj: NarwhalsCheckObjects) -> None:
+        """Check unique constraint (placeholder, nothing is checked yet)."""
+        # Placeholder: empty until the uniqueness check is implemented.
+        pass
diff --git a/pandera/api/narwhals/container.py b/pandera/api/narwhals/container.py
@@ -0,0 +1,90 @@
+"""DataFrame Schema for Narwhals."""
+
+import warnings
+from typing import Optional, Type
+
+from pandera.api.dataframe.container import DataFrameSchema as _DataFrameSchema
+from pandera.api.narwhals.types import NarwhalsCheckObjects, NarwhalsFrame
+from pandera.api.narwhals.utils import get_validation_depth
+from pandera.backends.narwhals.register import register_narwhals_backends
+from pandera.config import config_context, get_config_context
+from pandera.engines import narwhals_engine
+
+
+class DataFrameSchema(_DataFrameSchema[NarwhalsCheckObjects]):
+    """A Narwhals DataFrame or LazyFrame validator."""
+
+    def _validate_attributes(self):
+        """Run base checks, then warn about options that are no-ops here."""
+        super()._validate_attributes()
+        if self.unique_column_names:
+            warnings.warn(
+                "unique_column_names=True will have no effect on validation "
+                "since narwhals DataFrames do not support duplicate column "
+                "names."
+            )
+        if self.report_duplicates != "all":
+            warnings.warn(
+                "Setting report_duplicates to 'exclude_first' or "
+                "'exclude_last' will have no effect on validation. With the "
+                "narwhals backend, all duplicate values will be reported."
+            )
+
+    @staticmethod
+    def register_default_backends(
+        check_obj_cls: Type,
+    ):  # pylint: disable=unused-argument
+        """Register the narwhals validation backends (argument unused)."""
+        register_narwhals_backends()
+
+    def validate(
+        self,
+        check_obj: NarwhalsFrame,
+        head: Optional[int] = None,
+        tail: Optional[int] = None,
+        sample: Optional[int] = None,
+        random_state: Optional[int] = None,
+        lazy: bool = False,
+        inplace: bool = False,
+    ) -> NarwhalsFrame:
+        """Validate a narwhals DataFrame against the schema (placeholder).
+
+        :param check_obj: narwhals DataFrame or LazyFrame to validate.
+        :param head: validate the first n rows. Rows overlapping with `tail` or
+            `sample` are de-duplicated.
+        :param tail: validate the last n rows. Rows overlapping with `head` or
+            `sample` are de-duplicated.
+        :param sample: validate a random sample of n rows. Rows overlapping
+            with `head` or `tail` are de-duplicated.
+        :param random_state: random seed for the ``sample`` argument.
+        :param lazy: if True, lazily evaluates dataframe against all validation
+            checks and raises a ``SchemaErrors``. Otherwise, raise
+            ``SchemaError`` as soon as one occurs.
+        :param inplace: if True, applies coercion to the object of validation,
+            otherwise creates a copy of the data.
+        :returns: ``check_obj``, unchanged until validation is implemented.
+        """
+        # Placeholder: nothing is validated; check_obj is returned as-is.
+        return check_obj
+
+    def _subsample(
+        self,
+        check_obj: NarwhalsFrame,
+        head: Optional[int] = None,
+        tail: Optional[int] = None,
+        sample: Optional[int] = None,
+        random_state: Optional[int] = None,
+    ) -> NarwhalsFrame:
+        """Subsample dataframe for validation (placeholder, no-op)."""
+        # Placeholder: head/tail/sample are ignored; the full frame is returned.
+        return check_obj
+
+    def _coerce_dtype(self, obj: NarwhalsFrame) -> NarwhalsFrame:
+        """Coerce dataframe to specified dtypes (placeholder, no-op)."""
+        # Placeholder: ``obj`` is returned unchanged, no coercion is applied.
+        return obj
+
+    def _check_dtype(self, obj: NarwhalsFrame) -> None:
+        """Check dataframe dtypes (placeholder, nothing is checked yet)."""
+        # Placeholder: empty until the dtype check is implemented.
+        pass
diff --git a/pandera/api/narwhals/model.py b/pandera/api/narwhals/model.py
@@ -0,0 +1,154 @@
+"""Class-based API for Narwhals models."""
+
+import copy
+import inspect
+from typing import (
+    Dict,
+    List,
+    Optional,
+    Tuple,
+    Type,
+    Union,
+    cast,
+    overload,
+    Any,
+)
+
+import narwhals as nw
+from typing_extensions import Self
+
+from pandera.api.base.schema import BaseSchema
+from pandera.api.checks import Check
+from pandera.api.dataframe.model import DataFrameModel as _DataFrameModel
+from pandera.api.dataframe.model import get_dtype_kwargs
+from pandera.api.dataframe.model_components import FieldInfo
+from pandera.api.narwhals.components import Column
+from pandera.api.narwhals.container import DataFrameSchema
+from pandera.api.dataframe.model_config import BaseConfig
+from pandera.api.narwhals.types import NarwhalsFrame
+from pandera.engines import narwhals_engine as ne
+from pandera.errors import SchemaInitError
+from pandera.typing import AnnotationInfo
+from pandera.typing.narwhals import DataFrame, LazyFrame, Series
+from pandera.utils import docstring_substitution
+
+
+class DataFrameModel(_DataFrameModel[nw.DataFrame[Any], DataFrameSchema]):
+    """Model of a Narwhals :class:`~pandera.api.narwhals.container.DataFrameSchema`.
+
+    See the :ref:`User Guide <dataframe-models>` for more.
+    """
+    # NOTE(review): BaseConfig is the dataframe one, not narwhals' - intended?
+    Config: Type[BaseConfig] = BaseConfig  # type: ignore
+
+    @classmethod
+    def build_schema_(cls, **kwargs) -> DataFrameSchema:
+        """Build a DataFrameSchema from the model's fields and checks."""
+        return DataFrameSchema(
+            cls._build_columns(cls.__fields__, cls.__checks__),
+            checks=cls.__root_checks__,
+            **kwargs,
+        )
+
+    @classmethod
+    def _build_columns(  # pylint:disable=too-many-locals
+        cls,
+        fields: Dict[str, Tuple[AnnotationInfo, FieldInfo]],
+        checks: Dict[str, List[Check]],
+    ) -> Dict[str, Column]:
+        """Build one :class:`Column` per model field, keyed by field name."""
+        columns: Dict[str, Column] = {}
+        for field_name, (annotation, field) in fields.items():
+            field_checks = checks.get(field_name, [])
+            # Placeholder; NOTE(review): confirm a "dtype" key exists here.
+            dtype_kwargs = get_dtype_kwargs(annotation)
+            columns[field_name] = Column(
+                name=field_name,
+                dtype=dtype_kwargs.get("dtype"),
+                checks=field_checks,
+                nullable=field.nullable,
+                unique=field.unique,
+                coerce=field.coerce,
+                required=getattr(field, "required", True),
+                regex=field.regex,
+                title=field.title,
+                description=field.description,
+                default=field.default,
+                metadata=field.metadata,
+                drop_invalid_rows=getattr(field, "drop_invalid_rows", False),
+            )
+
+        return columns
+
+    @classmethod
+    def validate(
+        cls,
+        check_obj: nw.DataFrame[Any],
+        head: Optional[int] = None,
+        tail: Optional[int] = None,
+        sample: Optional[int] = None,
+        random_state: Optional[int] = None,
+        lazy: bool = False,
+        inplace: bool = False,
+    ) -> nw.DataFrame[Any]:
+        """Validate a narwhals DataFrame against the schema (placeholder).
+
+        :param check_obj: narwhals DataFrame to validate.
+        :param head: validate the first n rows.
+        :param tail: validate the last n rows.
+        :param sample: validate a random sample of n rows.
+        :param random_state: random seed for the ``sample`` argument.
+        :param lazy: if True, lazily evaluates dataframe against all validation
+            checks and raises a ``SchemaErrors``. Otherwise, raise
+            ``SchemaError`` as soon as one occurs.
+        :param inplace: if True, applies coercion to the object of validation,
+            otherwise creates a copy of the data.
+        :returns: ``check_obj``, unchanged until validation is implemented.
+        """
+        # Placeholder: nothing is validated; check_obj is returned as-is.
+        return check_obj
+
+    @classmethod
+    def empty(cls, n_rows: int = 0) -> nw.DataFrame[Any]:
+        """Create an empty DataFrame conforming to the schema.
+
+        :param n_rows: number of rows to create (currently unused).
+        :returns: empty DataFrame.
+        :raises NotImplementedError: always; creation is not implemented yet.
+        """
+        # Placeholder: needs proper narwhals DataFrame creation.
+        raise NotImplementedError(
+            "Empty DataFrame creation not yet implemented"
+        )
+
+    @classmethod
+    def example(cls, **kwargs: Any) -> nw.DataFrame[Any]:
+        """Create an example DataFrame conforming to the schema.
+
+        :param kwargs: additional keyword arguments (currently unused).
+        :returns: example DataFrame.
+        :raises NotImplementedError: always; creation is not implemented yet.
+        """
+        # Placeholder: needs narwhals DataFrame creation with example data.
+        raise NotImplementedError(
+            "Example DataFrame creation not yet implemented"
+        )
+
+    @classmethod
+    def to_json_schema(cls) -> Dict[str, Any]:
+        """Create a JSON schema representation of the model.
+
+        :returns: JSON schema dict (currently a fixed, empty-object stub).
+        """
+        # Placeholder: returns a fixed stub that ignores the model's fields.
+        return {"type": "object", "properties": {}}
+
+    @classmethod
+    def from_json_schema(cls, json_schema: Dict[str, Any]) -> Type[Self]:
+        """Create a DataFrameModel from a JSON schema.
+
+        :param json_schema: JSON schema dict (currently unused).
+        :returns: DataFrameModel class.
+        :raises NotImplementedError: always; loading is not implemented yet.
+        """
+        raise NotImplementedError("JSON schema loading not yet implemented")
diff --git a/pandera/api/narwhals/model_config.py b/pandera/api/narwhals/model_config.py
@@ -0,0 +1,12 @@
+"""Configuration for Narwhals DataFrameModel."""
+
+from typing import Any, Dict, Optional
+
+from pandera.api.base.model_config import BaseModelConfig
+
+
+class BaseConfig(BaseModelConfig):
+    """Base configuration for Narwhals DataFrameModel.
+
+    Narwhals-specific configuration options belong on this class.
+    """