
Commit 6b6ef3c

Minor updates for v0.10.6 (#147)
+ minor improvements to ppqio and csvio
+ introduced subsample into eda.describe
+ add dask
+ added DaskParquetIO
1 parent 1c5c2e6 commit 6b6ef3c

7 files changed (+72 −34 lines)

README.md

Lines changed: 3 additions & 2 deletions
@@ -52,6 +52,7 @@ This package **is not**:
 + See `LICENCE.md` for licensing and copyright details
 + See `pyproject.toml` for various package details
 + This uses a logger named `'oreum_core'`, feel free to incorporate or ignore
+  see `__init__.py` for details
 + Hosting:
   + Source code repo on [GitHub](https://github.com/oreum-industries/oreum_core)
   + Source code release on [GitHub](https://github.com/oreum-industries/oreum_core/releases)
@@ -74,11 +75,11 @@ For local development on MacOS
 ### 2.0 Pre-requisite installs via `homebrew`
 
 1. Install Homebrew, see instructions at [https://brew.sh](https://brew.sh)
-2. Install `direnv`, `git`, `git-lfs`, `graphviz`, `zsh`
+2. Install `direnv`, `git`, `git-lfs`, `graphviz`, `tad`, `zsh`
 
 ```zsh
 $> brew update && upgrade
-$> brew install direnv git git-lfs graphviz zsh
+$> brew install direnv git git-lfs graphviz tad zsh
 ```
 
 ### 2.1 Git clone the repo

assets/img/interrogate_badge.svg

Lines changed: 4 additions & 4 deletions

oreum_core.code-workspace

Lines changed: 2 additions & 1 deletion
@@ -10,7 +10,8 @@
         "editor.unusualLineTerminators": "off",
         "cSpell.enabled": true,
         "cSpell.enabledFileTypes": {
-            "jupyter": true
+            "jupyter": true,
+            "*": false,
         }
     }
 }

oreum_core/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@
 
 import logging
 
-__version__ = "0.10.5"
+__version__ = "0.10.6"
 
 # logger goes to null handler by default
 # packages that import oreum_core can override this and direct elsewhere
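The version bump and the existing logger comment suggest a quick smoke test after upgrading. A minimal sketch (not part of the commit): check `__version__` and attach a handler to the `'oreum_core'` logger, which the package leaves on a null handler by default per the README and `__init__.py`.

```python
# Minimal post-upgrade sketch (not part of this commit): verify the bumped
# version and route the package's 'oreum_core' logger somewhere visible,
# since it defaults to a NullHandler.
import logging

import oreum_core

assert oreum_core.__version__ == "0.10.6"

log = logging.getLogger("oreum_core")
log.addHandler(logging.StreamHandler())  # print log records to stderr
log.setLevel(logging.INFO)
```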

oreum_core/curate/data_io.py

Lines changed: 44 additions & 17 deletions
@@ -21,43 +21,46 @@
 import subprocess
 from pathlib import Path
 
+import dask.dataframe as dd
 import pandas as pd
 
 from ..utils.file_io import BaseFileIO
 
 __all__ = [
-    "PandasParquetIO",
+    "DaskParquetIO",
     "PandasCSVIO",
     "PandasExcelIO",
+    "PandasParquetIO",
     "SimpleStringIO",
    "copy_csv2md",
 ]
 
 _log = logging.getLogger(__name__)
 
 
-class PandasParquetIO(BaseFileIO):
-    """Simple helper class to read/write pandas to parquet, including path and
-    extension checking.
+class DaskParquetIO(BaseFileIO):
+    """Simple helper class to read/write dask dataframes to parquet, including
+    path and extension checking.
     """
 
     def __init__(self, *args, **kwargs):
         """Inherit super"""
         super().__init__(*args, **kwargs)
 
-    def read(self, fn: str, *args, **kwargs) -> pd.DataFrame:
-        """Read parquet fn from rootdir, pass args kwargs to pd.read_parquet"""
+    def read(self, fn: str, *args, **kwargs) -> dd.DataFrame:
+        """Read parquet fn from rootdir, pass args kwargs to dask.read_parquet"""
         fn = Path(fn).with_suffix(".parquet")
         fqn = self.get_path_read(fn)
         _log.info(f"Read from {str(fqn.resolve())}")
-        return pd.read_parquet(str(fqn), *args, **kwargs)
+        return dd.read_parquet(path=fqn, *args, **kwargs)
 
-    def write(self, df: pd.DataFrame, fn: str, *args, **kwargs) -> Path:
-        """Accept pandas DataFrame and fn e.g. `df.parquet`, write to fqn"""
-        fqn = self.get_path_write(Path(self.snl.clean(fn)).with_suffix(".parquet"))
-        df.to_parquet(str(fqn), *args, **kwargs)
-        _log.info(f"Written to {str(fqn.resolve())}")
-        return fqn
+    def write(self, ddf: dd.DataFrame, fn: str, *args, **kwargs) -> Path:
+        """Accept dask DataFrame and fn e.g. `df.parquet`, write to fqn"""
+        raise NotImplementedError
+        # fqn = self.get_path_write(Path(self.snl.clean(fn)).with_suffix(".parquet"))
+        # ddf.to_parquet(path=fqn, *args, **kwargs)
+        # _log.info(f"Written to {str(fqn.resolve())}")
+        # return fqn
 
 
 class PandasCSVIO(BaseFileIO):
@@ -74,7 +77,7 @@ def read(self, fn: str, *args, **kwargs) -> pd.DataFrame:
         fn = Path(fn).with_suffix(".csv")
         fqn = self.get_path_read(fn)
         _log.info(f"Read from {str(fqn.resolve())}")
-        return pd.read_csv(str(fqn), *args, **kwargs)
+        return pd.read_csv(fqn, *args, **kwargs)
 
     def write(self, df: pd.DataFrame, fn: str, *args, **kwargs) -> str:
         """Accept pandas DataFrame and fn e.g. `df`, write to fn.csv
@@ -85,7 +88,7 @@ def write(self, df: pd.DataFrame, fn: str, *args, **kwargs) -> str:
             kws.update(quoting=csv.QUOTE_NONNUMERIC)
         if (len(df.index.names) == 1) & (df.index.names[0] is None):
             kws.update(index_label="rowid")
-        df.to_csv(str(fqn), *args, **kws)
+        df.to_csv(fqn, *args, **kws)
         _log.info(f"Written to {str(fqn.resolve())}")
         return fqn
 
@@ -106,12 +109,12 @@ def read(self, fn: str, *args, **kwargs) -> pd.DataFrame:
         fn = Path(fn).with_suffix(".xlsx")
         fqn = self.get_path_read(fn)
         _log.info(f"Read from {str(fqn.resolve())}")
-        return pd.read_excel(str(fqn), *args, **kwargs)
+        return pd.read_excel(fqn, *args, **kwargs)
 
     def write(self, df: pd.DataFrame, fn: str, *args, **kwargs) -> Path:
         """Accept pandas DataFrame and fn e.g. `df.xlsx`, write to fqn."""
         fqn = self.get_path_write(Path(self.snl.clean(fn)).with_suffix(".xlsx"))
-        writer = pd.ExcelWriter(str(fqn), engine="xlsxwriter")
+        writer = pd.ExcelWriter(fqn, engine="xlsxwriter")
         df.to_excel(writer, *args, **kwargs)
         writer.close()
         _log.info(f"Written to {str(fqn.resolve())}")
@@ -142,6 +145,30 @@ def writer_close(self) -> Path:
         return fqn
 
 
+class PandasParquetIO(BaseFileIO):
+    """Simple helper class to read/write pandas to parquet, including path and
+    extension checking.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """Inherit super"""
+        super().__init__(*args, **kwargs)
+
+    def read(self, fn: str, *args, **kwargs) -> pd.DataFrame:
+        """Read parquet fn from rootdir, pass args kwargs to pd.read_parquet"""
+        fn = Path(fn).with_suffix(".parquet")
+        fqn = self.get_path_read(fn)
+        _log.info(f"Read from {str(fqn.resolve())}")
+        return pd.read_parquet(path=fqn, *args, **kwargs)
+
+    def write(self, df: pd.DataFrame, fn: str, *args, **kwargs) -> Path:
+        """Accept pandas DataFrame and fn e.g. `df.parquet`, write to fqn"""
+        fqn = self.get_path_write(Path(self.snl.clean(fn)).with_suffix(".parquet"))
+        df.to_parquet(path=fqn, *args, **kwargs)
+        _log.info(f"Written to {str(fqn.resolve())}")
+        return fqn
+
+
 class SimpleStringIO(BaseFileIO):
     """Helper class to read/write stringlike objects to txt or json files
     Set kind to
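Taken together, the module now exposes an eager `PandasParquetIO` and a lazy `DaskParquetIO` with the same read interface (the dask `write` still raises `NotImplementedError`). A hedged usage sketch follows; the `rootdir` keyword passed to the constructors is an assumption about `BaseFileIO` (check `oreum_core/utils/file_io.py` for the actual signature), and the filename is illustrative.

```python
# Hedged usage sketch for the parquet IO helpers in this commit. Assumes
# BaseFileIO accepts a `rootdir`-style argument (hypothetical -- check
# oreum_core/utils/file_io.py for the real constructor signature).
import dask.dataframe as dd
import pandas as pd

from oreum_core.curate.data_io import DaskParquetIO, PandasParquetIO

pqio = PandasParquetIO(rootdir="data")  # eager: returns a pandas DataFrame
ddio = DaskParquetIO(rootdir="data")    # lazy: returns a dask DataFrame; write() not yet implemented

df: pd.DataFrame = pqio.read("prices")   # ".parquet" suffix is appended for you
ddf: dd.DataFrame = ddio.read("prices")  # nothing is loaded until computed
print(ddf.head())                        # head() triggers a small computation
```

Note the parquet classes now pass `Path` objects straight to `pd.read_parquet(path=...)` and `dd.read_parquet(path=...)` instead of wrapping them in `str()`, matching the same cleanup in `PandasCSVIO` and `PandasExcelIO`.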

oreum_core/eda/describe.py

Lines changed: 17 additions & 9 deletions
@@ -40,6 +40,7 @@ def describe(
     get_cr94: bool = False,
     reset_index: bool = True,
     return_df: bool = False,
+    subsample: bool = False,
     **kwargs,
 ) -> pd.DataFrame | None:
     """Concat transposed topN rows, numerical desc & dtypes
@@ -48,23 +49,30 @@ def describe(
     Assume df has index.
     """
 
+    df = df.copy()
     len_idx = df.index.nlevels
-    note = ""
-    if nfeats + len_idx < df.shape[1]:
-        note = "NOTE: nfeats + index shown {} < width {}".format(
-            nfeats + len_idx, df.shape[1]
-        )
     nbytes = df.values.nbytes
     _log.info(f"Shape: {df.shape}")
     _log.info(f"Memsize: {nbytes // 1e6:,.1f} MB")
     _log.info(f"Index levels: {df.index.names}")
-    _log.info(f"{note}")
+    if nfeats + len_idx < df.shape[1]:
+        _log.info(
+            f"NOTE: nfeats + index shown {nfeats + len_idx}" + f" < width {df.shape[1]}"
+        )
 
     limit *= 1e6
-    if df.values.nbytes > limit:
-        return f"Array memsize {nbytes // 1e6:,.1f} MB > {limit / 1e6:,.1f} MB limit"
+    if nbytes > limit:
+        txt = (
+            f"Array memsize {nbytes // 1e6:,.1f} MB >" + f" {limit / 1e6:,.1f} MB limit"
+        )
+        if subsample:
+            df = df.sample(frac=(limit * 0.99) / nbytes, random_state=42)
+            nobs = min(nobs, len(df))
+            _log.info(txt + f", taking a subsample of {len(df)} rows")
+        else:
+            _log.error(txt)
+            return None
 
-    df = df.copy()
     if reset_index:
         idx_new_names = [f"index: {c}" for c in list(df.index.names)]
         col_names = list(df.columns.values)
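The new `subsample` branch keeps `describe` usable on frames larger than the memory `limit` by sampling just enough rows to fit under it (with ~1% headroom) instead of bailing out. A standalone sketch of that arithmetic, outside the real function:

```python
# Standalone sketch of the subsample arithmetic added to eda.describe:
# if the frame's values exceed `limit` MB, sample a fraction of rows that
# lands just under the limit (99% of it), mirroring the committed logic.
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
df = pd.DataFrame(rng.normal(size=(200_000, 8)))  # ~12.8 MB of float64 values

limit = 2 * 1e6  # 2 MB limit, expressed in bytes
nbytes = df.values.nbytes
if nbytes > limit:
    frac = (limit * 0.99) / nbytes  # fraction of rows that fits under the limit
    df_small = df.sample(frac=frac, random_state=42)
    print(f"{nbytes // 1e6:,.1f} MB > {limit / 1e6:,.1f} MB limit, "
          f"subsampled to {len(df_small):,} rows "
          f"({df_small.values.nbytes / 1e6:,.2f} MB)")
```

When `subsample=False` (the default), the oversize case now logs an error and returns `None` rather than returning a message string as before.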

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@ classifiers = [
 ]
 dependencies = [
     "csv2md>=1.1.2",
+    "dask",
     "fastparquet",  # not available in pandas v2.0 optional deps
     "ftfy>=5.4.1",  # NOTE MacOS has 6.3.1
     "matplotlib>=3.10.0",  # v3.8.0
