Skip to content

Commit a79ca9f

Browse files
authored
Minor updates for v0.10.7 (#148)
* + update version
* + precommit autoupdate
* + ensure import DaskParquetIO into curate namespace
* + improve describe
1 parent 6b6ef3c commit a79ca9f

File tree

4 files changed: +23 additions, -14 deletions

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,7 @@ repos:
3131
- id: no-print-statements
3232
files: ^oreum_core/
3333
- repo: https://github.com/astral-sh/ruff-pre-commit
34-
rev: v0.11.5
34+
rev: v0.11.7
3535
hooks:
3636
- id: ruff # lint
3737
args: [ --fix]

oreum_core/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@
1616

1717
import logging
1818

19-
__version__ = "0.10.6"
19+
__version__ = "0.10.7"
2020

2121
# logger goes to null handler by default
2222
# packages that import oreum_core can override this and direct elsewhere

oreum_core/curate/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@
1616
"""Various classes & functions for data curation"""
1717

1818
from .data_io import (
19+
DaskParquetIO,
1920
PandasCSVIO,
2021
PandasExcelIO,
2122
PandasParquetIO,

oreum_core/eda/describe.py

Lines changed: 20 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@
1616
"""Data Descriptions"""
1717

1818
import logging
19+
from copy import copy
1920

2021
import numpy as np
2122
import pandas as pd
@@ -40,7 +41,7 @@ def describe(
4041
get_cr94: bool = False,
4142
reset_index: bool = True,
4243
return_df: bool = False,
43-
subsample: bool = False,
44+
subsample: bool = True,
4445
**kwargs,
4546
) -> pd.DataFrame | None:
4647
"""Concat transposed topN rows, numerical desc & dtypes
@@ -52,12 +53,13 @@ def describe(
5253
df = df.copy()
5354
len_idx = df.index.nlevels
5455
nbytes = df.values.nbytes
55-
_log.info(f"Shape: {df.shape}")
56+
shape = df.shape
57+
_log.info(f"Shape: {shape}")
5658
_log.info(f"Memsize: {nbytes // 1e6:,.1f} MB")
5759
_log.info(f"Index levels: {df.index.names}")
58-
if nfeats + len_idx < df.shape[1]:
60+
if nfeats + len_idx < shape[1]:
5961
_log.info(
60-
f"NOTE: nfeats + index shown {nfeats + len_idx}" + f" < width {df.shape[1]}"
62+
f"NOTE: nfeats + index shown {nfeats + len_idx}" + f" < width {shape[1]}"
6163
)
6264

6365
limit *= 1e6
@@ -67,6 +69,10 @@ def describe(
6769
)
6870
if subsample:
6971
df = df.sample(frac=(limit * 0.99) / nbytes, random_state=42)
72+
nbytes_pre = copy(nbytes)
73+
shape_pre = copy(shape)
74+
nbytes = df.values.nbytes
75+
shape = df.shape
7076
nobs = min(nobs, len(df))
7177
_log.info(txt + f", taking a subsample of {len(df)} rows")
7278
else:
@@ -157,20 +163,21 @@ def describe(
157163
if return_df:
158164
return dfout
159165
else:
160-
display_fw(
161-
dfout.iloc[: nfeats + len_idx, :],
162-
max_rows=nfeats,
163-
shape=df.shape,
164-
nbytes=nbytes,
165-
**kwargs,
166-
)
166+
kws_out = dict(max_rows=nfeats, shape=shape, nbytes=nbytes)
167+
if subsample:
168+
kws_out["txtadd"] = (
169+
f"subsampled from Shape: {shape_pre},"
170+
+ f" Memsize {nbytes_pre / 1e6:,.1f} MB"
171+
)
172+
display_fw(dfout.iloc[: nfeats + len_idx, :], **kws_out, **kwargs)
167173

168174

169175
def display_fw(df: pd.DataFrame, **kwargs) -> None:
170176
"""Conv fn: contextually display max cols"""
171177

172178
shape = kwargs.pop("shape", df.shape)
173179
nbytes = kwargs.pop("nbytes", df.values.nbytes)
180+
txtadd = kwargs.pop("txtadd", None)
174181

175182
options = {
176183
"display.precision": kwargs.pop("precision", 2),
@@ -185,7 +192,8 @@ def display_fw(df: pd.DataFrame, **kwargs) -> None:
185192

186193
with pd.option_context(*[i for tup in options.items() for i in tup]):
187194
display(df)
188-
display(f"Shape: {shape}, Memsize {nbytes / 1e6:,.1f} MB")
195+
t = f"Shape: {shape}, Memsize {nbytes / 1e6:,.1f} MB"
196+
display(", ".join(filter(None, [t, txtadd])))
189197

190198

191199
def display_ht(df: pd.DataFrame, nrows=3, **kwargs) -> None:

0 commit comments

Comments
 (0)