From cff378b9e5c82d9f7cf466da75f37b1322353280 Mon Sep 17 00:00:00 2001 From: Eugenio Salvador Arellano Ruiz Date: Fri, 21 Jun 2024 17:55:02 +0200 Subject: [PATCH 01/13] Initial implementation of polars plugin --- frictionless/formats/__init__.py | 1 + frictionless/formats/pandas/plugin.py | 2 +- frictionless/formats/polars/__init__.py | 3 + .../formats/polars/__spec__/test_parser.py | 200 ++++++++++++++++++ frictionless/formats/polars/control.py | 12 ++ frictionless/formats/polars/parser.py | 142 +++++++++++++ frictionless/formats/polars/plugin.py | 41 ++++ frictionless/helpers/general.py | 2 +- frictionless/platform.py | 7 + frictionless/resources/table.py | 6 + pyproject.toml | 1 + 11 files changed, 415 insertions(+), 2 deletions(-) create mode 100644 frictionless/formats/polars/__init__.py create mode 100644 frictionless/formats/polars/__spec__/test_parser.py create mode 100644 frictionless/formats/polars/control.py create mode 100644 frictionless/formats/polars/parser.py create mode 100644 frictionless/formats/polars/plugin.py diff --git a/frictionless/formats/__init__.py b/frictionless/formats/__init__.py index 75d4902773..fa51004989 100644 --- a/frictionless/formats/__init__.py +++ b/frictionless/formats/__init__.py @@ -9,6 +9,7 @@ from .markdown import * from .ods import * from .pandas import * +from .polars import * from .parquet import * from .qsv import * from .spss import * diff --git a/frictionless/formats/pandas/plugin.py b/frictionless/formats/pandas/plugin.py index e933eb180e..267dd53f03 100644 --- a/frictionless/formats/pandas/plugin.py +++ b/frictionless/formats/pandas/plugin.py @@ -28,7 +28,7 @@ def create_parser(self, resource: Resource): def detect_resource(self, resource: Resource): if resource.data is not None: - if helpers.is_type(resource.data, "DataFrame"): + if helpers.is_type(resource.data, "pandas.core.frame.DataFrame"): resource.format = resource.format or "pandas" if resource.format == "pandas": if resource.data is None: diff --git a/frictionless/formats/polars/__init__.py b/frictionless/formats/polars/__init__.py new file mode 100644 index 0000000000..10a49142c7 --- /dev/null +++ b/frictionless/formats/polars/__init__.py @@ -0,0 +1,3 @@ +from .control import PolarsControl as PolarsControl +from .parser import PolarsParser as PolarsParser +from .plugin import PolarsPlugin as PolarsPlugin diff --git a/frictionless/formats/polars/__spec__/test_parser.py b/frictionless/formats/polars/__spec__/test_parser.py new file mode 100644 index 0000000000..d7064d01b1 --- /dev/null +++ b/frictionless/formats/polars/__spec__/test_parser.py @@ -0,0 +1,200 @@ +from datetime import datetime, time +from decimal import Decimal + +import isodate +import polars as pl +import pytz +from dateutil.tz import tzoffset, tzutc +from frictionless import Package +from frictionless.resources import TableResource + +# Read + + +def test_polars_parser(): + dataframe = pl.DataFrame(data={"id": [1, 2], "name": ["english", "中国人"]}) + with TableResource(data=dataframe) as resource: + assert resource.header == ["id", "name"] + assert resource.read_rows() == [ + {"id": 1, "name": "english"}, + {"id": 2, "name": "中国人"}, + ] + + +def test_polars_parser_from_dataframe_with_datetime(): + # Polars does not have the concept of an index! + df = pl.read_csv("data/vix.csv", separator=";", try_parse_dates=True) # type: ignore + with TableResource(data=df) as resource: + # Assert meta + assert resource.schema.to_descriptor() == { + "fields": [ + {"name": "Date", "type": "datetime"}, + {"name": "VIXClose", "type": "number"}, + {"name": "VIXHigh", "type": "number"}, + {"name": "VIXLow", "type": "number"}, + {"name": "VIXOpen", "type": "number"}, + ] + } + rows = resource.read_rows() + # Assert rows + assert rows == [ + { + "Date": datetime(2004, 1, 5, tzinfo=pytz.utc), + "VIXClose": Decimal("17.49"), + "VIXHigh": Decimal("18.49"), + "VIXLow": Decimal("17.44"), + "VIXOpen": Decimal("18.45"), + }, + { + "Date": datetime(2004, 1, 6, tzinfo=pytz.utc), + "VIXClose": Decimal("16.73"), + "VIXHigh": Decimal("17.67"), + "VIXLow": Decimal("16.19"), + "VIXOpen": Decimal("17.66"), + }, + ] + + +# Write + + +def test_pandas_parser_write(): + source = TableResource(path="data/table.csv") + target = source.write(format="polars") + assert target.data.to_dicts() == [ # type: ignore + {"id": 1, "name": "english"}, + {"id": 2, "name": "中国人"}, + ] + + +def test_polars_parser_nan_in_integer_resource_column(): + # see issue 1109 + res = TableResource( + data=[ + ["int", "number", "string"], + ["1", "2.3", "string"], + ["", "4.3", "string"], + ["3", "3.14", "string"], + ] + ) + df = res.to_polars() + assert df.dtypes == [pl.Int64, pl.Float64, pl.String] # type: ignore + + +def test_pandas_parser_nan_in_integer_csv_column(): + res = TableResource(path="data/issue-1109.csv") + df = res.to_polars() + assert df.dtypes == [pl.Int64, pl.Float64, pl.String] # type: ignore + + +def test_pandas_parser_write_types(): + source = Package("data/storage/types.json").get_table_resource("types") + target = source.write(format="polars") + with target: + # Assert schema + assert target.schema.to_descriptor() == { + "fields": [ + {"name": "any", "type": "string"}, # type fallback + {"name": "array", "type": "array"}, + {"name": "boolean", "type": "boolean"}, + {"name": "date", "type": "datetime"}, # type downgrade + {"name": "date_year", "type": "datetime"}, # type downgrade/fmt removal + {"name": "datetime", "type": "datetime"}, + {"name": "duration", "type": "duration"}, + {"name": "geojson", "type": "object"}, + {"name": "geopoint", "type": "array"}, + {"name": "integer", "type": "integer"}, + {"name": "number", "type": "number"}, + {"name": "object", "type": "object"}, + {"name": "string", "type": "string"}, + {"name": "time", "type": "time"}, + {"name": "year", "type": "integer"}, # type downgrade + {"name": "yearmonth", "type": "array"}, # type downgrade + ], + } + + # Assert rows + assert target.read_rows() == [ + { + "any": "中国人", + "array": ["Mike", "John"], + "boolean": True, + "date": datetime(2015, 1, 1), + "date_year": datetime(2015, 1, 1), + "datetime": datetime(2015, 1, 1, 3, 0), + "duration": isodate.parse_duration("P1Y1M"), + "geojson": {"type": "Point", "coordinates": [33, 33.33]}, + "geopoint": [30, 70], + "integer": 1, + "number": 7, + "object": {"chars": 560}, + "string": "english", + "time": time(3, 0), + "year": 2015, + "yearmonth": [2015, 1], + }, + ] + + +def test_pandas_write_constraints(): + source = Package("data/storage/constraints.json").get_table_resource("constraints") + target = source.write(format="pandas") + with target: + # Assert schema + assert target.schema.to_descriptor() == { + "fields": [ + {"name": "required", "type": "string"}, # constraint removal + {"name": "minLength", "type": "string"}, # constraint removal + {"name": "maxLength", "type": "string"}, # constraint removal + {"name": "pattern", "type": "string"}, # constraint removal + {"name": "enum", "type": "string"}, # constraint removal + {"name": "minimum", "type": "integer"}, # constraint removal + {"name": "maximum", "type": "integer"}, # constraint removal + ], + } + + # Assert rows + assert target.read_rows() == [ + { + "required": "passing", + "minLength": "passing", + "maxLength": "passing", + "pattern": "passing", + "enum": "passing", + "minimum": 5, + "maximum": 5, + }, + ] + + +def test_pandas_parser_write_timezone(): + source = TableResource(path="data/timezone.csv") + target = source.write(format="pandas") + with target: + # Assert schema + assert target.schema.to_descriptor() == { + "fields": [ + {"name": "datetime", "type": "datetime"}, + {"name": "time", "type": "time"}, + ], + } + + # Assert rows + assert target.read_rows() == [ + { + "datetime": datetime(2020, 1, 1, 15), + "time": time(15), + }, + { + "datetime": datetime(2020, 1, 1, 15, 0, tzinfo=tzutc()), + "time": time(15, 0, tzinfo=tzutc()), + }, + { + "datetime": datetime(2020, 1, 1, 15, 0, tzinfo=tzoffset(None, 10800)), + "time": time(15, 0, tzinfo=tzoffset(None, 10800)), + }, + { + "datetime": datetime(2020, 1, 1, 15, 0, tzinfo=tzoffset(None, -10800)), + "time": time(15, 0, tzinfo=tzoffset(None, -10800)), + }, + ] diff --git a/frictionless/formats/polars/control.py b/frictionless/formats/polars/control.py new file mode 100644 index 0000000000..51d6a761a7 --- /dev/null +++ b/frictionless/formats/polars/control.py @@ -0,0 +1,12 @@ +from __future__ import annotations + +import attrs + +from ...dialect import Control + + +@attrs.define(kw_only=True, repr=False) +class PolarsControl(Control): + """Polars dialect representation""" + + type = "polars" diff --git a/frictionless/formats/polars/parser.py b/frictionless/formats/polars/parser.py new file mode 100644 index 0000000000..22b8dcf3b5 --- /dev/null +++ b/frictionless/formats/polars/parser.py @@ -0,0 +1,142 @@ +from __future__ import annotations + +import datetime +import decimal +from typing import TYPE_CHECKING, Any, List, Optional, Tuple + +from dateutil.tz import tzoffset + +from ... import types +from ...platform import platform +from ...schema import Field, Schema +from ...system import Parser + +if TYPE_CHECKING: + from ...resources import TableResource + + +class PolarsParser(Parser): + """Polars parser implementation.""" + + supported_types = [ + "array", + "boolean", + "datetime", + "date", + "duration", + "integer", + "number", + "object", + "string", + "time", + ] + + # Read + + def read_cell_stream_create(self): + pl = platform.polars + assert isinstance(self.resource.data, pl.DataFrame) + dataframe = self.resource.data + + # Schema + schema = self.__read_convert_schema() + if not self.resource.schema: + self.resource.schema = schema + + # Lists + yield schema.field_names + for row in dataframe.iter_rows(): # type: ignore + cells: List[Any] = [v if v is not pl.Null else None for v in row] + yield cells + + def __read_convert_schema(self): + pl = platform.polars + dataframe = self.resource.data + schema = Schema() + + # Fields + for name, dtype in zip(dataframe.columns, dataframe.dtypes): # type: ignore + sample = dataframe.select(pl.first(name)).item() if len(dataframe) else None # type: ignore + type = self.__read_convert_type(dtype, sample=sample) # type: ignore + field = Field.from_descriptor({"name": name, "type": type}) + schema.add_field(field) + + # Return schema + return schema + + def __read_convert_type(self, _: Any, sample: Optional[types.ISample] = None): + + # Python types + if sample is not None: + if isinstance(sample, bool): # type: ignore + return "boolean" + elif isinstance(sample, int): # type: ignore + return "integer" + elif isinstance(sample, float): # type: ignore + return "number" + if isinstance(sample, (list, tuple)): # type: ignore + return "array" + elif isinstance(sample, datetime.datetime): + return "datetime" + elif isinstance(sample, datetime.date): + return "date" + elif isinstance(sample, platform.isodate.Duration): # type: ignore + return "duration" + elif isinstance(sample, dict): + return "object" + elif isinstance(sample, str): + return "string" + elif isinstance(sample, datetime.time): + return "time" + + # Default + return "string" + + # Write + + def write_row_stream(self, source: TableResource): + pl = platform.polars + data_rows: List[Tuple[Any]] = [] + fixed_types = {} + with source: + for row in source.row_stream: + data_values: List[Any] = [] + for field in source.schema.fields: + value = row[field.name] + if isinstance(value, dict): + value = str(value) + if isinstance(value, decimal.Decimal): + value = float(value) + if isinstance(value, datetime.datetime) and value.tzinfo: + value = value.astimezone(datetime.timezone.utc) + if isinstance(value, datetime.time) and value.tzinfo: + value = value.replace( + tzinfo=tzoffset( + datetime.timezone.utc, + value.utcoffset().total_seconds(), # type: ignore + ) + ) + if value is None and field.type in ("number", "integer"): + fixed_types[field.name] = "number" + value = None + data_values.append(value) + data_rows.append(tuple(data_values)) + # Create dtypes/columns + columns: List[str] = [] + for field in source.schema.fields: + if field.name not in source.schema.primary_key: + columns.append(field.name) + + # Create/set dataframe + dataframe = pl.DataFrame(data_rows) + dataframe.columns = columns + + for field in source.schema.fields: + if ( + field.type == "integer" + and field.name in dataframe.columns + and str(dataframe.select(field.name).dtypes[0]) != "int" + ): + dataframe = dataframe.with_columns(pl.col(field.name).cast(int)) + + self.resource.data = dataframe diff --git a/frictionless/formats/polars/plugin.py b/frictionless/formats/polars/plugin.py new file mode 100644 index 0000000000..14885f4f31 --- /dev/null +++ b/frictionless/formats/polars/plugin.py @@ -0,0 +1,41 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Optional + +from ... import helpers +from ...platform import platform +from ...system import Plugin +from .control import PolarsControl +from .parser import PolarsParser + +if TYPE_CHECKING: + from ...resource import Resource + + +# NOTE: +# We need to ensure that the way we detect pandas dataframe is good enough. +# We don't want to be importing pandas and checking the type without a good reason + + +class PolarsPlugin(Plugin): + """Plugin for Polars""" + + # Hooks + + def create_parser(self, resource: Resource): + if resource.format == "polars": + return PolarsParser(resource) + + def detect_resource(self, resource: Resource): + if resource.data is not None: + if helpers.is_type(resource.data, "polars.dataframe.frame.DataFrame"): + resource.format = resource.format or "polars" + if resource.format == "polars": + if resource.data is None: + resource.data = platform.polars.DataFrame() + resource.datatype = resource.datatype or "table" + resource.mediatype = resource.mediatype or "application/polars" + + def select_control_class(self, type: Optional[str] = None): + if type == "polars": + return PolarsControl diff --git a/frictionless/helpers/general.py b/frictionless/helpers/general.py index 45e27ce5b7..fefe7d7723 100644 --- a/frictionless/helpers/general.py +++ b/frictionless/helpers/general.py @@ -266,7 +266,7 @@ def is_zip_descriptor(descriptor: Union[str, Dict[str, Any]]): def is_type(object: type, name: str): - return type(object).__name__ == name + return type(object).__module__ + "." + type(object).__name__ == name def parse_json_string(string: Optional[str]): diff --git a/frictionless/platform.py b/frictionless/platform.py index b88584d99d..c5fa31930d 100644 --- a/frictionless/platform.py +++ b/frictionless/platform.py @@ -325,6 +325,13 @@ def pandas(self): return pandas + @cached_property + @extras(name="polars") + def polars(self): + import polars # type: ignore + + return polars + @cached_property @extras(name="pandas") def pandas_core_dtypes_api(self): diff --git a/frictionless/resources/table.py b/frictionless/resources/table.py index 98a76f3473..de9355dfdc 100644 --- a/frictionless/resources/table.py +++ b/frictionless/resources/table.py @@ -618,6 +618,12 @@ def to_pandas(self, *, dialect: Optional[Dialect] = None): target = self.write(Resource(format="pandas", dialect=dialect)) # type: ignore return target.data + def to_polars(self, *, dialect: Optional[Dialect] = None): + """Helper to export resource as an Polars dataframe""" + dialect = dialect or Dialect() + target = self.write(Resource(format="polars", dialect=dialect)) # type: ignore + return target.data + def to_snap(self, *, json: bool = False): """Create a snapshot from the resource diff --git a/pyproject.toml b/pyproject.toml index 011a362302..7e9283ecb2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -92,6 +92,7 @@ html = ["pyquery>=1.4"] mysql = ["sqlalchemy>=1.4", "pymysql>=1.0"] ods = ["ezodf>=0.3", "lxml>=4.0"] pandas = ["pyarrow>=14.0", "pandas>=1.0"] +polars = ["polars>=0.20"] parquet = ["fastparquet>=0.8"] postgresql = ["sqlalchemy>=1.4", "psycopg>=3.0", "psycopg2>=2.9"] spss = ["savReaderWriter>=3.0"] From bb5da45234af0db9a3e4ca29932855fcaeeb3a81 Mon Sep 17 00:00:00 2001 From: e_arel <86229326+areleu@users.noreply.github.com> Date: Thu, 5 Sep 2024 17:28:21 +0200 Subject: [PATCH 02/13] Removed references to pandas --- .../formats/polars/__spec__/test_parser.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/frictionless/formats/polars/__spec__/test_parser.py b/frictionless/formats/polars/__spec__/test_parser.py index d7064d01b1..ad26606f73 100644 --- a/frictionless/formats/polars/__spec__/test_parser.py +++ b/frictionless/formats/polars/__spec__/test_parser.py @@ -58,7 +58,7 @@ def test_polars_parser_from_dataframe_with_datetime(): # Write -def test_pandas_parser_write(): +def test_polars_parser_write(): source = TableResource(path="data/table.csv") target = source.write(format="polars") assert target.data.to_dicts() == [ # type: ignore @@ -81,13 +81,13 @@ def test_polars_parser_nan_in_integer_resource_column(): assert df.dtypes == [pl.Int64, pl.Float64, pl.String] # type: ignore -def test_pandas_parser_nan_in_integer_csv_column(): +def test_polars_parser_nan_in_integer_csv_column(): res = TableResource(path="data/issue-1109.csv") df = res.to_polars() assert df.dtypes == [pl.Int64, pl.Float64, pl.String] # type: ignore -def test_pandas_parser_write_types(): +def test_polars_parser_write_types(): source = Package("data/storage/types.json").get_table_resource("types") target = source.write(format="polars") with target: @@ -136,9 +136,9 @@ def test_pandas_parser_write_types(): ] -def test_pandas_write_constraints(): +def test_polars_write_constraints(): source = Package("data/storage/constraints.json").get_table_resource("constraints") - target = source.write(format="pandas") + target = source.write(format="polars") with target: # Assert schema assert target.schema.to_descriptor() == { @@ -167,9 +167,9 @@ def test_pandas_write_constraints(): ] -def test_pandas_parser_write_timezone(): +def test_polars_parser_write_timezone(): source = TableResource(path="data/timezone.csv") - target = source.write(format="pandas") + target = source.write(format="polars") with target: # Assert schema assert target.schema.to_descriptor() == { From 8eca3eee557bcbf80382f273f6bc1aca6999908e Mon Sep 17 00:00:00 2001 From: "Arellano Ruiz, Eugenio Salvador" Date: Thu, 5 Sep 2024 18:00:28 +0200 Subject: [PATCH 03/13] Sorted imports properly --- frictionless/formats/__init__.py | 2 +- frictionless/formats/polars/__spec__/test_parser.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/frictionless/formats/__init__.py b/frictionless/formats/__init__.py index fa51004989..64bcd7082d 100644 --- a/frictionless/formats/__init__.py +++ b/frictionless/formats/__init__.py @@ -9,8 +9,8 @@ from .markdown import * from .ods import * from .pandas import * -from .polars import * from .parquet import * +from .polars import * from .qsv import * from .spss import * from .sql import * diff --git a/frictionless/formats/polars/__spec__/test_parser.py b/frictionless/formats/polars/__spec__/test_parser.py index ad26606f73..0799187afb 100644 --- a/frictionless/formats/polars/__spec__/test_parser.py +++ b/frictionless/formats/polars/__spec__/test_parser.py @@ -5,6 +5,7 @@ import polars as pl import pytz from dateutil.tz import tzoffset, tzutc + from frictionless import Package from frictionless.resources import TableResource From 73b16a6b7b14e9dc7039a5f4c81e53fc682a006f Mon Sep 17 00:00:00 2001 From: "Arellano Ruiz, Eugenio Salvador" Date: Thu, 5 Sep 2024 18:06:01 +0200 Subject: [PATCH 04/13] address pyright errors --- frictionless/formats/polars/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frictionless/formats/polars/parser.py b/frictionless/formats/polars/parser.py index 22b8dcf3b5..eb23f681c3 100644 --- a/frictionless/formats/polars/parser.py +++ b/frictionless/formats/polars/parser.py @@ -104,7 +104,7 @@ def write_row_stream(self, source: TableResource): for field in source.schema.fields: value = row[field.name] if isinstance(value, dict): - value = str(value) + value = str(value) # type: ignore if isinstance(value, decimal.Decimal): value = float(value) if isinstance(value, datetime.datetime) and value.tzinfo: From b4480c3950ff544da2b5c8b54619f0d07144c47e Mon Sep 17 00:00:00 2001 From: "Arellano Ruiz, Eugenio Salvador" Date: Thu, 5 Sep 2024 18:11:08 +0200 Subject: [PATCH 05/13] Add polars to test environment --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index acddfc4b21..f8d574b5e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -109,7 +109,7 @@ path = "frictionless/settings.py" [tool.hatch.envs.default] python = "3.10" dependencies = [ - "frictionless[aws,bigquery,ckan,csv,dev,duckdb,excel,json,github,gsheets,html,mysql,ods,pandas,parquet,postgresql,spss,sql,visidata,wkt,zenodo]", + "frictionless[aws,bigquery,ckan,csv,dev,duckdb,excel,json,github,gsheets,html,mysql,ods,pandas,polars,parquet,postgresql,spss,sql,visidata,wkt,zenodo]", ] [tool.hatch.envs.default.scripts] From 733bc35efb4b726191e6ba2a5d983a22b14e47ba Mon Sep 17 00:00:00 2001 From: "Arellano Ruiz, Eugenio Salvador" Date: Fri, 6 Sep 2024 10:10:02 +0200 Subject: [PATCH 06/13] Fixed parser orientation issues --- frictionless/formats/polars/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frictionless/formats/polars/parser.py b/frictionless/formats/polars/parser.py index eb23f681c3..e396bf21a4 100644 --- a/frictionless/formats/polars/parser.py +++ b/frictionless/formats/polars/parser.py @@ -128,7 +128,7 @@ def write_row_stream(self, source: TableResource): columns.append(field.name) # Create/set dataframe - dataframe = pl.DataFrame(data_rows) + dataframe = pl.DataFrame(data_rows, orient="row") dataframe.columns = columns for field in source.schema.fields: From 05811f7ffda6bfece5491bb24fc73c990b28834c Mon Sep 17 00:00:00 2001 From: "Arellano Ruiz, Eugenio Salvador" Date: Fri, 6 Sep 2024 11:52:59 +0200 Subject: [PATCH 07/13] Added additional check for arrays --- frictionless/formats/polars/parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frictionless/formats/polars/parser.py b/frictionless/formats/polars/parser.py index e396bf21a4..231148668b 100644 --- a/frictionless/formats/polars/parser.py +++ b/frictionless/formats/polars/parser.py @@ -65,7 +65,7 @@ def __read_convert_schema(self): return schema def __read_convert_type(self, _: Any, sample: Optional[types.ISample] = None): - + pl = platform.polars # Python types if sample is not None: if isinstance(sample, bool): # type: ignore @@ -74,7 +74,7 @@ def __read_convert_type(self, _: Any, sample: Optional[types.ISample] = None): return "integer" elif isinstance(sample, float): # type: ignore return "number" - if isinstance(sample, (list, tuple)): # type: ignore + if isinstance(sample, (list, tuple, pl.Series)): # type: ignore return "array" elif isinstance(sample, datetime.datetime): return "datetime" From aa135dfd36860a9782aef61ae7d2dc45d0e1ce62 Mon Sep 17 00:00:00 2001 From: "Arellano Ruiz, Eugenio Salvador" Date: Fri, 29 Aug 2025 15:30:30 +0200 Subject: [PATCH 08/13] Test parser --- .../formats/polars/__spec__/test_parser.py | 88 ++++++++++--------- frictionless/formats/polars/parser.py | 15 ++-- 2 files changed, 53 insertions(+), 50 deletions(-) diff --git a/frictionless/formats/polars/__spec__/test_parser.py b/frictionless/formats/polars/__spec__/test_parser.py index 0799187afb..ba2ce75c15 100644 --- a/frictionless/formats/polars/__spec__/test_parser.py +++ b/frictionless/formats/polars/__spec__/test_parser.py @@ -1,10 +1,10 @@ -from datetime import datetime, time +from datetime import datetime, time, date from decimal import Decimal import isodate import polars as pl import pytz -from dateutil.tz import tzoffset, tzutc +from dateutil.tz import tzutc from frictionless import Package from frictionless.resources import TableResource @@ -98,15 +98,15 @@ def test_polars_parser_write_types(): {"name": "any", "type": "string"}, # type fallback {"name": "array", "type": "array"}, {"name": "boolean", "type": "boolean"}, - {"name": "date", "type": "datetime"}, # type downgrade - {"name": "date_year", "type": "datetime"}, # type downgrade/fmt removal + {"name": "date", "type": "date"}, + {"name": "date_year", "type": "date"}, {"name": "datetime", "type": "datetime"}, {"name": "duration", "type": "duration"}, - {"name": "geojson", "type": "object"}, + {"name": "geojson", "type": "string"}, # type fallback {"name": "geopoint", "type": "array"}, {"name": "integer", "type": "integer"}, {"name": "number", "type": "number"}, - {"name": "object", "type": "object"}, + {"name": "object", "type": "string"}, # type fallback {"name": "string", "type": "string"}, {"name": "time", "type": "time"}, {"name": "year", "type": "integer"}, # type downgrade @@ -120,15 +120,15 @@ def test_polars_parser_write_types(): "any": "中国人", "array": ["Mike", "John"], "boolean": True, - "date": datetime(2015, 1, 1), - "date_year": datetime(2015, 1, 1), + "date": date(2015, 1, 1), + "date_year": date(2015, 1, 1), "datetime": datetime(2015, 1, 1, 3, 0), "duration": isodate.parse_duration("P1Y1M"), - "geojson": {"type": "Point", "coordinates": [33, 33.33]}, - "geopoint": [30, 70], + "geojson": '{"type": "Point", "coordinates": [33, 33.33]}', + "geopoint": [30.0, 70.0], "integer": 1, - "number": 7, - "object": {"chars": 560}, + "number": 7.0, + "object": '{"chars": 560}', "string": "english", "time": time(3, 0), "year": 2015, @@ -168,34 +168,36 @@ def test_polars_write_constraints(): ] -def test_polars_parser_write_timezone(): - source = TableResource(path="data/timezone.csv") - target = source.write(format="polars") - with target: - # Assert schema - assert target.schema.to_descriptor() == { - "fields": [ - {"name": "datetime", "type": "datetime"}, - {"name": "time", "type": "time"}, - ], - } - - # Assert rows - assert target.read_rows() == [ - { - "datetime": datetime(2020, 1, 1, 15), - "time": time(15), - }, - { - "datetime": datetime(2020, 1, 1, 15, 0, tzinfo=tzutc()), - "time": time(15, 0, tzinfo=tzutc()), - }, - { - "datetime": datetime(2020, 1, 1, 15, 0, tzinfo=tzoffset(None, 10800)), - "time": time(15, 0, tzinfo=tzoffset(None, 10800)), - }, - { - "datetime": datetime(2020, 1, 1, 15, 0, tzinfo=tzoffset(None, -10800)), - "time": time(15, 0, tzinfo=tzoffset(None, -10800)), - }, - ] +# def test_polars_parser_write_timezone(): +# # This Test fails because polars does not allow mixing tz aware time with naive time. +# source = TableResource(path="data/timezone.csv") +# target = source.write(format="polars") +# with target: +# # Assert schema +# assert target.schema.to_descriptor() == { +# "fields": [ +# {"name": "datetime", "type": "datetime"}, +# {"name": "time", "type": "time"}, +# ], +# } +# # Polars disallows comparing naive tzs with explicit tzs +# # https://github.com/pola-rs/polars/pull/12966#pullrequestreview-1785291945 +# # Assert rows +# assert target.read_rows() == [ +# { +# "datetime": datetime(2020, 1, 1, 15, 0, tzinfo=tzutc()), +# "time": time(15), +# }, +# { +# "datetime": datetime(2020, 1, 1, 15, 0, tzinfo=tzutc()), +# "time": time(15), +# }, +# { +# "datetime": datetime(2020, 1, 1, 12, 0, tzinfo=tzutc()), +# "time": time(12), +# }, +# { +# "datetime": datetime(2020, 1, 1, 18, 0, tzinfo=tzutc()), +# "time": time(18), +# }, +# ] diff --git a/frictionless/formats/polars/parser.py b/frictionless/formats/polars/parser.py index 231148668b..f3143798b4 100644 --- a/frictionless/formats/polars/parser.py +++ b/frictionless/formats/polars/parser.py @@ -108,14 +108,15 @@ def write_row_stream(self, source: TableResource): if isinstance(value, decimal.Decimal): value = float(value) if isinstance(value, datetime.datetime) and value.tzinfo: + # Polars will only allow one timezone per column + # https://docs.pola.rs/user-guide/transformations/time-series/timezones/ value = value.astimezone(datetime.timezone.utc) - if isinstance(value, datetime.time) and value.tzinfo: - value = value.replace( - tzinfo=tzoffset( - datetime.timezone.utc, - value.utcoffset().total_seconds(), # type: ignore - ) - ) + + if isinstance(value, datetime.time) and (os := value.utcoffset()): + # offset information is lost in polars we have to resolve time. + value = ( + datetime.datetime.combine(datetime.date.today(), value) - os + ).time() if value is None and field.type in ("number", "integer"): fixed_types[field.name] = "number" value = None From 52b7c849441fcf57b1e92ded21e0bf3e3a2f4f20 Mon Sep 17 00:00:00 2001 From: Eugenio Salvador Arellano Ruiz Date: Fri, 29 Aug 2025 15:35:05 +0200 Subject: [PATCH 09/13] Reorganized imports, added skip --- .../formats/polars/__spec__/test_parser.py | 70 ++++++++++--------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/frictionless/formats/polars/__spec__/test_parser.py b/frictionless/formats/polars/__spec__/test_parser.py index ba2ce75c15..b1663aead9 100644 --- a/frictionless/formats/polars/__spec__/test_parser.py +++ b/frictionless/formats/polars/__spec__/test_parser.py @@ -1,8 +1,9 @@ -from datetime import datetime, time, date +from datetime import date, datetime, time from decimal import Decimal import isodate import polars as pl +import pytest import pytz from dateutil.tz import tzutc @@ -168,36 +169,37 @@ def test_polars_write_constraints(): ] -# def test_polars_parser_write_timezone(): -# # This Test fails because polars does not allow mixing tz aware time with naive time. -# source = TableResource(path="data/timezone.csv") -# target = source.write(format="polars") -# with target: -# # Assert schema -# assert target.schema.to_descriptor() == { -# "fields": [ -# {"name": "datetime", "type": "datetime"}, -# {"name": "time", "type": "time"}, -# ], -# } -# # Polars disallows comparing naive tzs with explicit tzs -# # https://github.com/pola-rs/polars/pull/12966#pullrequestreview-1785291945 -# # Assert rows -# assert target.read_rows() == [ -# { -# "datetime": datetime(2020, 1, 1, 15, 0, tzinfo=tzutc()), -# "time": time(15), -# }, -# { -# "datetime": datetime(2020, 1, 1, 15, 0, tzinfo=tzutc()), -# "time": time(15), -# }, -# { -# "datetime": datetime(2020, 1, 1, 12, 0, tzinfo=tzutc()), -# "time": time(12), -# }, -# { -# "datetime": datetime(2020, 1, 1, 18, 0, tzinfo=tzutc()), -# "time": time(18), -# }, -# ] +# This Test fails because polars does not allow mixing tz aware time with naive time. +@pytest.mark.skip +def test_polars_parser_write_timezone(): + source = TableResource(path="data/timezone.csv") + target = source.write(format="polars") + with target: + # Assert schema + assert target.schema.to_descriptor() == { + "fields": [ + {"name": "datetime", "type": "datetime"}, + {"name": "time", "type": "time"}, + ], + } + # Polars disallows comparing naive tzs with explicit tzs + # https://github.com/pola-rs/polars/pull/12966#pullrequestreview-1785291945 + # Assert rows + assert target.read_rows() == [ + { + "datetime": datetime(2020, 1, 1, 15, 0, tzinfo=tzutc()), + "time": time(15), + }, + { + "datetime": datetime(2020, 1, 1, 15, 0, tzinfo=tzutc()), + "time": time(15), + }, + { + "datetime": datetime(2020, 1, 1, 12, 0, tzinfo=tzutc()), + "time": time(12), + }, + { + "datetime": datetime(2020, 1, 1, 18, 0, tzinfo=tzutc()), + "time": time(18), + }, + ] From 58d00d864e17d1603b4bf2e2971a594aec2d21cc Mon Sep 17 00:00:00 2001 From: "Arellano Ruiz, Eugenio Salvador" Date: Fri, 29 Aug 2025 15:39:59 +0200 Subject: [PATCH 10/13] Removed unused import --- frictionless/formats/polars/parser.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/frictionless/formats/polars/parser.py b/frictionless/formats/polars/parser.py index f3143798b4..1eb91489d3 100644 --- a/frictionless/formats/polars/parser.py +++ b/frictionless/formats/polars/parser.py @@ -4,8 +4,6 @@ import decimal from typing import TYPE_CHECKING, Any, List, Optional, Tuple -from dateutil.tz import tzoffset - from ... import types from ...platform import platform from ...schema import Field, Schema From e27cf88ec4faa029248bdc1c273035d22bb77616 Mon Sep 17 00:00:00 2001 From: Eugenio Salvador Arellano Ruiz Date: Fri, 29 Aug 2025 15:48:43 +0200 Subject: [PATCH 11/13] Switched symbols --- frictionless/formats/polars/__spec__/test_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frictionless/formats/polars/__spec__/test_parser.py b/frictionless/formats/polars/__spec__/test_parser.py index b1663aead9..8aef010438 100644 --- a/frictionless/formats/polars/__spec__/test_parser.py +++ b/frictionless/formats/polars/__spec__/test_parser.py @@ -125,11 +125,11 @@ def test_polars_parser_write_types(): "date_year": date(2015, 1, 1), "datetime": datetime(2015, 1, 1, 3, 0), "duration": isodate.parse_duration("P1Y1M"), - "geojson": '{"type": "Point", "coordinates": [33, 33.33]}', + "geojson": "{'type': 'Point', 'coordinates': [33, 33.33]}", "geopoint": [30.0, 70.0], "integer": 1, "number": 7.0, - "object": '{"chars": 560}', + "object": "{'chars': 560}", "string": "english", "time": time(3, 0), "year": 2015, From 6344292cae1304ccfb792a438150a8fe8c82e161 Mon Sep 17 00:00:00 2001 From: "Arellano Ruiz, Eugenio Salvador" Date: Tue, 2 Sep 2025 09:48:36 +0200 Subject: [PATCH 12/13] Use polars API to iterate schema --- frictionless/formats/polars/parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frictionless/formats/polars/parser.py b/frictionless/formats/polars/parser.py index 1eb91489d3..e465c7c0ba 100644 --- a/frictionless/formats/polars/parser.py +++ b/frictionless/formats/polars/parser.py @@ -53,7 +53,7 @@ def __read_convert_schema(self): schema = Schema() # Fields - for name, dtype in zip(dataframe.columns, dataframe.dtypes): # type: ignore + for name, dtype in dataframe.schema.items(): # type: ignore sample = dataframe.select(pl.first(name)).item() if len(dataframe) else None # type: ignore type = self.__read_convert_type(dtype, sample=sample) # type: ignore field = Field.from_descriptor({"name": name, "type": type}) From 65a2f986d0712ea8b47ec008f3bdcfa9aed9ef6d Mon Sep 17 00:00:00 2001 From: "Arellano Ruiz, Eugenio Salvador" Date: Tue, 2 Sep 2025 16:20:37 +0200 Subject: [PATCH 13/13] removed primary key filter --- frictionless/formats/polars/parser.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/frictionless/formats/polars/parser.py b/frictionless/formats/polars/parser.py index e465c7c0ba..515bfc8da3 100644 --- a/frictionless/formats/polars/parser.py +++ b/frictionless/formats/polars/parser.py @@ -123,12 +123,10 @@ def write_row_stream(self, source: TableResource): # Create dtypes/columns columns: List[str] = [] for field in source.schema.fields: - if field.name not in source.schema.primary_key: - columns.append(field.name) + columns.append(field.name) # Create/set dataframe - dataframe = pl.DataFrame(data_rows, orient="row") - dataframe.columns = columns + dataframe = pl.DataFrame(data_rows, orient="row", schema=columns) for field in source.schema.fields: if (