
Commit 6b6ef3c

Minor updates for v0.10.6 (#147)
+ minor improvements to ppqio and csvio
+ introduced subsample into eda.describe
+ add dask
+ added DaskParquetIO
1 parent 1c5c2e6 commit 6b6ef3c

7 files changed (+72 −34 lines)

README.md

Lines changed: 3 additions & 2 deletions
@@ -52,6 +52,7 @@ This package **is not**:
 + See `LICENCE.md` for licensing and copyright details
 + See `pyproject.toml` for various package details
 + This uses a logger named `'oreum_core'`, feel free to incorporate or ignore
+  see `__init__.py` for details
 + Hosting:
   + Source code repo on [GitHub](https://github.com/oreum-industries/oreum_core)
   + Source code release on [GitHub](https://github.com/oreum-industries/oreum_core/releases)
@@ -74,11 +75,11 @@ For local development on MacOS
 ### 2.0 Pre-requisite installs via `homebrew`
 
 1. Install Homebrew, see instructions at [https://brew.sh](https://brew.sh)
-2. Install `direnv`, `git`, `git-lfs`, `graphviz`, `zsh`
+2. Install `direnv`, `git`, `git-lfs`, `graphviz`, `tad`, `zsh`
 
 ```zsh
 $> brew update && upgrade
-$> brew install direnv git git-lfs graphviz zsh
+$> brew install direnv git git-lfs graphviz tad zsh
 ```
 
 ### 2.1 Git clone the repo

assets/img/interrogate_badge.svg

Lines changed: 4 additions & 4 deletions

oreum_core.code-workspace

Lines changed: 2 additions & 1 deletion
@@ -10,7 +10,8 @@
         "editor.unusualLineTerminators": "off",
         "cSpell.enabled": true,
         "cSpell.enabledFileTypes": {
-            "jupyter": true
+            "jupyter": true,
+            "*": false,
         }
     }
 }

oreum_core/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@
 
 import logging
 
-__version__ = "0.10.5"
+__version__ = "0.10.6"
 
 # logger goes to null handler by default
 # packages that import oreum_core can override this and direct elsewhere
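The version bump and the existing logger comment suggest a quick smoke test after upgrading. A minimal sketch (not part of the commit): check `__version__` and attach a handler to the `'oreum_core'` logger, which the package leaves on a null handler by default per the README and `__init__.py`.

```python
# Minimal post-upgrade sketch (not part of this commit): verify the bumped
# version and route the package's 'oreum_core' logger somewhere visible,
# since it defaults to a NullHandler.
import logging

import oreum_core

assert oreum_core.__version__ == "0.10.6"

log = logging.getLogger("oreum_core")
log.addHandler(logging.StreamHandler())  # print log records to stderr
log.setLevel(logging.INFO)
```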

oreum_core/curate/data_io.py

Lines changed: 44 additions & 17 deletions
@@ -21,43 +21,46 @@
 import subprocess
 from pathlib import Path
 
+import dask.dataframe as dd
 import pandas as pd
 
 from ..utils.file_io import BaseFileIO
 
 __all__ = [
-    "PandasParquetIO",
+    "DaskParquetIO",
     "PandasCSVIO",
     "PandasExcelIO",
+    "PandasParquetIO",
     "SimpleStringIO",
    "copy_csv2md",
 ]
 
 _log = logging.getLogger(__name__)
 
 
-class PandasParquetIO(BaseFileIO):
-    """Simple helper class to read/write pandas to parquet, including path and
-    extension checking.
+class DaskParquetIO(BaseFileIO):
+    """Simple helper class to read/write dask dataframes to parquet, including
+    path and extension checking.
     """
 
     def __init__(self, *args, **kwargs):
         """Inherit super"""
         super().__init__(*args, **kwargs)
 
-    def read(self, fn: str, *args, **kwargs) -> pd.DataFrame:
-        """Read parquet fn from rootdir, pass args kwargs to pd.read_parquet"""
+    def read(self, fn: str, *args, **kwargs) -> dd.DataFrame:
+        """Read parquet fn from rootdir, pass args kwargs to dask.read_parquet"""
         fn = Path(fn).with_suffix(".parquet")
         fqn = self.get_path_read(fn)
         _log.info(f"Read from {str(fqn.resolve())}")
-        return pd.read_parquet(str(fqn), *args, **kwargs)
+        return dd.read_parquet(path=fqn, *args, **kwargs)
 
-    def write(self, df: pd.DataFrame, fn: str, *args, **kwargs) -> Path:
-        """Accept pandas DataFrame and fn e.g. `df.parquet`, write to fqn"""
-        fqn = self.get_path_write(Path(self.snl.clean(fn)).with_suffix(".parquet"))
-        df.to_parquet(str(fqn), *args, **kwargs)
-        _log.info(f"Written to {str(fqn.resolve())}")
-        return fqn
+    def write(self, ddf: dd.DataFrame, fn: str, *args, **kwargs) -> Path:
+        """Accept dask DataFrame and fn e.g. `df.parquet`, write to fqn"""
+        raise NotImplementedError
+        # fqn = self.get_path_write(Path(self.snl.clean(fn)).with_suffix(".parquet"))
+        # ddf.to_parquet(path=fqn, *args, **kwargs)
+        # _log.info(f"Written to {str(fqn.resolve())}")
+        # return fqn
 
 
 class PandasCSVIO(BaseFileIO):
@@ -74,7 +77,7 @@ def read(self, fn: str, *args, **kwargs) -> pd.DataFrame:
         fn = Path(fn).with_suffix(".csv")
         fqn = self.get_path_read(fn)
         _log.info(f"Read from {str(fqn.resolve())}")
-        return pd.read_csv(str(fqn), *args, **kwargs)
+        return pd.read_csv(fqn, *args, **kwargs)
 
     def write(self, df: pd.DataFrame, fn: str, *args, **kwargs) -> str:
         """Accept pandas DataFrame and fn e.g. `df`, write to fn.csv
@@ -85,7 +88,7 @@ def write(self, df: pd.DataFrame, fn: str, *args, **kwargs) -> str:
             kws.update(quoting=csv.QUOTE_NONNUMERIC)
         if (len(df.index.names) == 1) & (df.index.names[0] is None):
             kws.update(index_label="rowid")
-        df.to_csv(str(fqn), *args, **kws)
+        df.to_csv(fqn, *args, **kws)
         _log.info(f"Written to {str(fqn.resolve())}")
         return fqn
 
@@ -106,12 +109,12 @@ def read(self, fn: str, *args, **kwargs) -> pd.DataFrame:
         fn = Path(fn).with_suffix(".xlsx")
         fqn = self.get_path_read(fn)
         _log.info(f"Read from {str(fqn.resolve())}")
-        return pd.read_excel(str(fqn), *args, **kwargs)
+        return pd.read_excel(fqn, *args, **kwargs)
 
     def write(self, df: pd.DataFrame, fn: str, *args, **kwargs) -> Path:
         """Accept pandas DataFrame and fn e.g. `df.xlsx`, write to fqn."""
         fqn = self.get_path_write(Path(self.snl.clean(fn)).with_suffix(".xlsx"))
-        writer = pd.ExcelWriter(str(fqn), engine="xlsxwriter")
+        writer = pd.ExcelWriter(fqn, engine="xlsxwriter")
         df.to_excel(writer, *args, **kwargs)
         writer.close()
         _log.info(f"Written to {str(fqn.resolve())}")
@@ -142,6 +145,30 @@ def writer_close(self) -> Path:
         return fqn
 
 
+class PandasParquetIO(BaseFileIO):
+    """Simple helper class to read/write pandas to parquet, including path and
+    extension checking.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """Inherit super"""
+        super().__init__(*args, **kwargs)
+
+    def read(self, fn: str, *args, **kwargs) -> pd.DataFrame:
+        """Read parquet fn from rootdir, pass args kwargs to pd.read_parquet"""
+        fn = Path(fn).with_suffix(".parquet")
+        fqn = self.get_path_read(fn)
+        _log.info(f"Read from {str(fqn.resolve())}")
+        return pd.read_parquet(path=fqn, *args, **kwargs)
+
+    def write(self, df: pd.DataFrame, fn: str, *args, **kwargs) -> Path:
+        """Accept pandas DataFrame and fn e.g. `df.parquet`, write to fqn"""
+        fqn = self.get_path_write(Path(self.snl.clean(fn)).with_suffix(".parquet"))
+        df.to_parquet(path=fqn, *args, **kwargs)
+        _log.info(f"Written to {str(fqn.resolve())}")
+        return fqn
+
+
 class SimpleStringIO(BaseFileIO):
     """Helper class to read/write stringlike objects to txt or json files
     Set kind to
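Taken together, the module now exposes an eager `PandasParquetIO` and a lazy `DaskParquetIO` with the same read interface (the dask `write` still raises `NotImplementedError`). A hedged usage sketch follows; the `rootdir` keyword passed to the constructors is an assumption about `BaseFileIO` (check `oreum_core/utils/file_io.py` for the actual signature), and the filename is illustrative.

```python
# Hedged usage sketch for the parquet IO helpers in this commit. Assumes
# BaseFileIO accepts a `rootdir`-style argument (hypothetical -- check
# oreum_core/utils/file_io.py for the real constructor signature).
import dask.dataframe as dd
import pandas as pd

from oreum_core.curate.data_io import DaskParquetIO, PandasParquetIO

pqio = PandasParquetIO(rootdir="data")  # eager: returns a pandas DataFrame
ddio = DaskParquetIO(rootdir="data")    # lazy: returns a dask DataFrame; write() not yet implemented

df: pd.DataFrame = pqio.read("prices")   # ".parquet" suffix is appended for you
ddf: dd.DataFrame = ddio.read("prices")  # nothing is loaded until computed
print(ddf.head())                        # head() triggers a small computation
```

Note the parquet classes now pass `Path` objects straight to `pd.read_parquet(path=...)` and `dd.read_parquet(path=...)` instead of wrapping them in `str()`, matching the same cleanup in `PandasCSVIO` and `PandasExcelIO`.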

oreum_core/eda/describe.py

Lines changed: 17 additions & 9 deletions
@@ -40,6 +40,7 @@ def describe(
     get_cr94: bool = False,
     reset_index: bool = True,
     return_df: bool = False,
+    subsample: bool = False,
     **kwargs,
 ) -> pd.DataFrame | None:
     """Concat transposed topN rows, numerical desc & dtypes
@@ -48,23 +49,30 @@ def describe(
     Assume df has index.
     """
 
+    df = df.copy()
     len_idx = df.index.nlevels
-    note = ""
-    if nfeats + len_idx < df.shape[1]:
-        note = "NOTE: nfeats + index shown {} < width {}".format(
-            nfeats + len_idx, df.shape[1]
-        )
     nbytes = df.values.nbytes
     _log.info(f"Shape: {df.shape}")
     _log.info(f"Memsize: {nbytes // 1e6:,.1f} MB")
     _log.info(f"Index levels: {df.index.names}")
-    _log.info(f"{note}")
+    if nfeats + len_idx < df.shape[1]:
+        _log.info(
+            f"NOTE: nfeats + index shown {nfeats + len_idx}" + f" < width {df.shape[1]}"
+        )
 
     limit *= 1e6
-    if df.values.nbytes > limit:
-        return f"Array memsize {nbytes // 1e6:,.1f} MB > {limit / 1e6:,.1f} MB limit"
+    if nbytes > limit:
+        txt = (
+            f"Array memsize {nbytes // 1e6:,.1f} MB >" + f" {limit / 1e6:,.1f} MB limit"
+        )
+        if subsample:
+            df = df.sample(frac=(limit * 0.99) / nbytes, random_state=42)
+            nobs = min(nobs, len(df))
+            _log.info(txt + f", taking a subsample of {len(df)} rows")
+        else:
+            _log.error(txt)
+            return None
 
-    df = df.copy()
     if reset_index:
         idx_new_names = [f"index: {c}" for c in list(df.index.names)]
         col_names = list(df.columns.values)
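The new `subsample` branch keeps `describe` usable on frames larger than the memory `limit` by sampling just enough rows to fit under it (with ~1% headroom) instead of bailing out. A standalone sketch of that arithmetic, outside the real function:

```python
# Standalone sketch of the subsample arithmetic added to eda.describe:
# if the frame's values exceed `limit` MB, sample a fraction of rows that
# lands just under the limit (99% of it), mirroring the committed logic.
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
df = pd.DataFrame(rng.normal(size=(200_000, 8)))  # ~12.8 MB of float64 values

limit = 2 * 1e6  # 2 MB limit, expressed in bytes
nbytes = df.values.nbytes
if nbytes > limit:
    frac = (limit * 0.99) / nbytes  # fraction of rows that fits under the limit
    df_small = df.sample(frac=frac, random_state=42)
    print(f"{nbytes // 1e6:,.1f} MB > {limit / 1e6:,.1f} MB limit, "
          f"subsampled to {len(df_small):,} rows "
          f"({df_small.values.nbytes / 1e6:,.2f} MB)")
```

When `subsample=False` (the default), the oversize case now logs an error and returns `None` rather than returning a message string as before.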

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@ classifiers = [
 ]
 dependencies = [
     "csv2md>=1.1.2",
+    "dask",
     "fastparquet",  # not available in pandas v2.0 optional deps
     "ftfy>=5.4.1",  # NOTE MacOS has 6.3.1
     "matplotlib>=3.10.0",  # v3.8.0
