2 changes: 1 addition & 1 deletion .flake8
@@ -1,5 +1,5 @@
[flake8]
ignore = E203, E266, W291, W293, F401, F403, E501, W503, W605, C901
ignore = E203, E266, W291, W293, F401, F403, E501, W503, W605, C901, E712
max-line-length = 88
max-doc-length = 144
max-complexity = 18
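For context, a minimal sketch (illustrative only, not part of this PR) of the comparison style that the newly ignored flake8 rule E712 flags; explicit `== True` / `== False` comparisons are idiomatic when building elementwise pandas masks, which is a common reason to silence the rule:

```python
import pandas as pd

df = pd.DataFrame({"flag": [True, False, True], "x": [1, 2, 3]})

# E712 flags literal comparisons such as `== False`, preferring truthiness checks,
# but `is False` does not broadcast over a Series, so the literal form stays useful
mask = df["flag"] == False
print(df[mask])
```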
12 changes: 6 additions & 6 deletions .pre-commit-config.yaml
@@ -4,7 +4,7 @@ default_language_version:
default_stages: [commit, push]
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks # general checks
rev: v4.5.0
rev: v4.6.0
hooks:
- id: check-added-large-files
args: ['--maxkb=1024']
@@ -31,7 +31,7 @@ repos:
- id: no-print-statements
files: ^oreum_core/
- repo: https://github.com/psf/black # black formatter
rev: 23.12.1
rev: 24.8.0
hooks:
- id: black
files: ^oreum_core/
@@ -41,26 +41,26 @@ repos:
- id: isort
files: ^oreum_core/
- repo: https://github.com/pycqa/flake8 # flake8 linter
rev: 7.0.0
rev: 7.1.0
hooks:
- id: flake8
files: ^oreum_core/
- repo: https://github.com/pycqa/bandit # basic security checks for python code
rev: 1.7.6
rev: 1.7.9
hooks:
- id: bandit
files: ^oreum_core/
args: ["--config", "pyproject.toml"]
additional_dependencies: ["bandit[toml]"]
- repo: https://github.com/econchick/interrogate # check for docstrings
rev: 1.5.0
rev: 1.7.0
hooks:
- id: interrogate
files: ^oreum_core/
args: [--config, pyproject.toml]
pass_filenames: false # see https://github.com/econchick/interrogate/issues/60#issuecomment-1180262851
- repo: https://gitlab.com/iam-cms/pre-commit-hooks # apply Apache2 header
rev: v0.4.0
rev: v0.6.0
hooks:
- id: apache-license
files: ^oreum_core/
347 changes: 176 additions & 171 deletions LICENSES_THIRD_PARTY.md

8 changes: 4 additions & 4 deletions assets/img/interrogate_badge.svg
2 changes: 1 addition & 1 deletion oreum_core/__init__.py
@@ -15,7 +15,7 @@
"""Core tools for use on projects by Oreum Industries"""
import logging

__version__ = "0.8.1"
__version__ = "0.9.0"

# logger goes to null handler by default
# packages that import oreum_core can override this and direct elsewhere
8 changes: 7 additions & 1 deletion oreum_core/curate/__init__.py
@@ -14,7 +14,13 @@

# curate/
"""Various classes & functions for data curation"""
from .data_io import PandasCSVIO, PandasParquetIO, SimpleStringIO, copy_csv2md
from .data_io import (
PandasCSVIO,
PandasExcelIO,
PandasParquetIO,
SimpleStringIO,
copy_csv2md,
)
from .data_transform import (
DatasetReshaper,
DatatypeConverter,
2 changes: 1 addition & 1 deletion oreum_core/curate/data_io.py
@@ -102,7 +102,7 @@ def __init__(self, *args, **kwargs):

def read(self, fn: str, *args, **kwargs) -> pd.DataFrame:
"""Read excel fn from rootdir, pass args kwargs to pd.read_excel"""
fn = Path(fn).with_suffix('.xslx')
fn = Path(fn).with_suffix('.xlsx')
fqn = self.get_path_read(fn)
_log.info(f'Read from {str(fqn.resolve())}')
return pd.read_excel(str(fqn), *args, **kwargs)
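A hedged usage sketch of the newly exported PandasExcelIO; the constructor arguments are assumed from the BaseFileIO pattern and are not shown in this diff. The suffix fix above means reads now resolve to a real `.xlsx` path:

```python
from oreum_core.curate import PandasExcelIO

# assumed constructor usage: BaseFileIO subclasses typically take a root directory;
# the exact kwargs are not confirmed by this diff
xlio = PandasExcelIO(rootdir=['data'])

# read() forces the '.xlsx' suffix (previously the misspelled '.xslx') and
# forwards remaining args / kwargs to pd.read_excel
df = xlio.read('policies_2024', sheet_name='raw')
```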
1 change: 1 addition & 0 deletions oreum_core/curate/data_transform.py
@@ -228,6 +228,7 @@ def create_dfcmb(self, df: pd.DataFrame, ftsd: dict) -> pd.DataFrame:
dfcmb = pd.DataFrame(index=[0])
fts_factor = ftsd.get('fcat', []) + ftsd.get('fbool', [])
for ft in fts_factor:
ft = ft[2:-1] if ft[:2] == 'F(' else ft
colnames_pre = list(dfcmb.columns.values)
s = pd.Series(np.unique(df[ft]), name=ft)
dfcmb = pd.concat([dfcmb, s], axis=1, join='outer', ignore_index=True)
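The single added line above unwraps a patsy-style `F(...)` marker from factor feature names before the dataframe lookup; a minimal illustration (toy values, not from the repo):

```python
# toy illustration of the added guard: 'F(region)' -> 'region', plain names pass through
for ft in ['F(region)', 'is_active']:
    ft = ft[2:-1] if ft[:2] == 'F(' else ft
    print(ft)
```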
2 changes: 1 addition & 1 deletion oreum_core/eda/__init__.py
@@ -27,7 +27,7 @@
tril_nan,
)
from .describe import describe, display_fw, display_ht, get_fts_by_dtype
from .eda_io import FigureIO, display_image_file, output_data_dict
from .eda_io import FigureIO, output_data_dict
from .plot import ( # plot_umap,; plot_r2_range,; plot_r2_range_pair,
plot_accuracy,
plot_binary_performance,
42 changes: 24 additions & 18 deletions oreum_core/eda/describe.py
@@ -36,6 +36,7 @@ def describe(
limit: int = 50, # MB
get_mode: bool = False,
get_counts: bool = True,
get_cr94: bool = False,
reset_index: bool = True,
return_df: bool = False,
**kwargs,
@@ -68,7 +69,12 @@
df = df.reset_index()

# start with pandas describe, add on dtypes
dfdesc = df.describe(include='all').T
quantiles = [0.25, 0.5, 0.75] # the default
percentile_names = ['25%', '50%', '75%']
if get_cr94:
quantiles = [0.03] + quantiles + [0.97]
percentile_names = ['3%'] + percentile_names + ['97%']
dfdesc = df.describe(include='all', percentiles=quantiles).T

dfout = pd.concat((dfdesc, df.dtypes), axis=1, join='outer', sort=False)
dfout = dfout.loc[df.columns.values]
@@ -100,23 +106,23 @@
dfout.loc[ft, 'min'] = df[ft].value_counts().index.min()
dfout.loc[ft, 'max'] = df[ft].value_counts().index.max()

fts_out_all = [
'dtype',
'count_null',
'count_inf',
'count_zero',
'count_unique',
'top',
'freq',
'sum',
'mean',
'std',
'min',
'25%',
'50%',
'75%',
'max',
]
fts_out_all = (
[
'dtype',
'count_null',
'count_inf',
'count_zero',
'count_unique',
'top',
'freq',
'sum',
'mean',
'std',
'min',
]
+ percentile_names
+ ['max']
)
fts_out = [f for f in fts_out_all if f in dfout.columns.values]

# add mode and mode count WARNING takes forever for large arrays (>10k row)
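A hedged usage sketch of the new `get_cr94` flag (names taken from the diff; the extra columns follow from the percentiles list built above, roughly a 94% central range):

```python
import numpy as np
import pandas as pd
from oreum_core.eda import describe

rng = np.random.default_rng(42)
df = pd.DataFrame({'x': rng.normal(size=1_000)})

# get_cr94=True passes percentiles=[0.03, 0.25, 0.5, 0.75, 0.97] to df.describe,
# so the summary gains '3%' and '97%' columns alongside the defaults
dfd = describe(df, get_cr94=True, return_df=True)
print([c for c in dfd.columns if c.endswith('%')])
```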
87 changes: 51 additions & 36 deletions oreum_core/eda/eda_io.py
@@ -21,16 +21,24 @@
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import figure

from ..curate.data_io import PandasExcelIO
from ..utils.file_io import BaseFileIO
from .describe import describe, get_fts_by_dtype

__all__ = ['FigureIO', 'display_image_file', 'output_data_dict']
__all__ = ['FigureIO', 'output_data_dict']

_log = logging.getLogger(__name__)

sns.set_theme(
style='darkgrid',
palette='muted',
context='notebook',
rc={'figure.dpi': 72, 'savefig.dpi': 144, 'figure.figsize': (12, 4)},
)


class FigureIO(BaseFileIO):
"""Helper class to save matplotlib.figure.Figure objects to image file"""
@@ -47,41 +55,48 @@ def write(self, f: figure.Figure, fn: str, *args, **kwargs) -> Path:
_log.info(f'Written to {str(fqn.resolve())}')
return fqn


def display_image_file(
fqn: str, title: str = None, figsize: tuple = (12, 6)
) -> figure.Figure:
"""Hacky way to display pre-created image file in a Notebook
such that nbconvert can see it and render to PDF
Force to max width 16 inches, for fullwidth render in live Notebook and PDF

NOTE:
Alternatives are bad
1. This one is entirely missed by nbconvert at render to PDF
# <img src="img.jpg" style="float:center; width:900px" />

2. This one causes following markdown to render monospace in PDF
# from IPython.display import Image
# Image("./assets/img/oreum_eloss_blueprint3.jpg", retina=True)
"""
img = mpimg.imread(fqn)
f, axs = plt.subplots(1, 1, figsize=figsize)
_ = axs.imshow(img)
ax = plt.gca()
_ = ax.grid(False)
_ = ax.set_frame_on(False)
_ = plt.tick_params(
top=False,
bottom=False,
left=False,
right=False,
labelleft=False,
labelbottom=False,
)
if title is not None:
_ = f.suptitle(f'{title}', y=1.0)
_ = f.tight_layout()
return f
def read(
self,
fqn: Path = None,
fn: str = None,
extension: str = '.png',
title: str = None,
figsize: tuple = (12, 4),
) -> figure.Figure:
"""Hacky way to display pre-created image file in a Notebook such that
nbconvert can see it and render to PDF
If fqn is not supplied, this builds the path from fn via get_path_read
Render according to usual rcParams (set at module-level)
NOTE:
All the alternatives are bad
1. This one is entirely missed by nbconvert at render to PDF
# <img src="img.jpg" style="float:center; width:900px" />

2. This one causes following markdown to render monospace in PDF
# from IPython.display import Image
# Image("./assets/img/oreum_eloss_blueprint3.jpg", retina=True)
"""
if fn is not None:
fqn = self.get_path_read(Path(self.snl.clean(fn)).with_suffix(extension))
img = mpimg.imread(fqn)
f, axs = plt.subplots(1, 1, figsize=figsize)
_ = axs.imshow(img)
ax = plt.gca()
_ = ax.grid(False)
_ = ax.set_frame_on(False)
_ = plt.tick_params(
top=False,
bottom=False,
left=False,
right=False,
labelleft=False,
labelbottom=False,
)
if title is not None:
_ = f.suptitle(f'{title}', fontsize=12, y=1.0)
_ = f.tight_layout()
_log.info(f'Read image from {str(fqn.resolve())}')
return f


def output_data_dict(
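A hedged usage sketch of the relocated image display, now a FigureIO.read method rather than the removed module-level display_image_file; the constructor arguments are assumed from the BaseFileIO pattern:

```python
from oreum_core.eda import FigureIO

# assumed constructor usage; only read()'s signature is confirmed by this diff
fio = FigureIO(rootdir=['plots'])

# builds '<rootdir>/loss_dist.png' via get_path_read, imshows it on a frameless
# Axes and returns the Figure so nbconvert can render it into a PDF
f = fio.read(fn='loss_dist', extension='.png', title='Loss distribution')
```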
13 changes: 10 additions & 3 deletions oreum_core/eda/plot.py
@@ -15,7 +15,6 @@
# eda.plot.py
"""EDA Plotting"""
import logging
from textwrap import wrap
from typing import Literal

import matplotlib.pyplot as plt
@@ -66,6 +65,13 @@
RSD = 42
rng = np.random.default_rng(seed=RSD)

sns.set_theme(
style='darkgrid',
palette='muted',
context='notebook',
rc={'figure.dpi': 72, 'savefig.dpi': 144, 'figure.figsize': (12, 4)},
)


def _get_kws_styling() -> dict:
"""Common styling kws for plots"""
@@ -867,8 +873,9 @@ def plot_estimate(
arr_overplot: np.array = None,
**kwargs,
) -> figure.Figure:
"""Plot distribution for estimates, either PPC or bootstrapped, no grouping
Optional overplot bootstrapped dfboot"""
"""Plot distribution for univariate estimates, either PPC or bootstrapped
no grouping. Optionally overplot bootstrapped dfboot"""
# TODO: Extend this to multivariate grouping
txtadd = kwargs.pop('txtadd', None)
sty = _get_kws_styling()
clr = color if color is not None else sns.color_palette()[0]
1 change: 1 addition & 0 deletions oreum_core/model_pymc/__init__.py
@@ -32,6 +32,7 @@
from .describe import (
describe_dist,
extract_yobs_yhat,
get_mdlvt_specific_nm,
get_summary,
model_desc,
print_rvs,
11 changes: 6 additions & 5 deletions oreum_core/model_pymc/base.py
@@ -63,10 +63,10 @@ def __init__(self, **kwargs):
chains=4,
cores=4,
target_accept=0.8,
idata_kwargs={
"log_likelihood": True, # usually useful
## TODO only in 5.16 "log_prior": True, # possibly useful?
},
idata_kwargs=dict(
log_likelihood=True, # usually useful
log_prior=True, # possibly useful?
),
progressbar=True,
)
self.rvs_for_posterior_plots = []
@@ -263,7 +263,8 @@ def update_idata(self, idata: az.InferenceData, replace: bool = False) -> None:

def debug(self):
"""Convenience to run debug on logp and random, and
assert no MeasurableVariable nodes in the graph"""
assert no MeasurableVariable nodes in the graph
TODO catch these outputs in the log"""
if self.model is not None:
assert_no_rvs(self.model.logp())
_ = self.model.debug(fn='logp', verbose=True)
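For context, a minimal standalone sketch (a toy model of my own, not this package's API) of how such sampler defaults reach pm.sample; per the removed TODO, the log_prior entry needs pymc >= 5.16:

```python
import pymc as pm

with pm.Model():
    mu = pm.Normal('mu', 0.0, 1.0)
    y = pm.Normal('y', mu, 1.0, observed=[0.1, -0.3, 0.7])
    idata = pm.sample(
        draws=500, tune=500, chains=4, cores=4, target_accept=0.8,
        # store pointwise log_likelihood and log_prior groups in the InferenceData
        idata_kwargs=dict(log_likelihood=True, log_prior=True),
        progressbar=True,
    )
```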
11 changes: 8 additions & 3 deletions oreum_core/model_pymc/calc.py
@@ -24,7 +24,8 @@
import pytensor.gradient as tg
import pytensor.tensor as pt
from arviz import InferenceData, dict_to_dataset
from fastprogress import progress_bar

# from fastprogress import progress_bar
from pymc.backends.arviz import _DefaultTrace, coords_and_dims_for_inferencedata
from pymc.model import Model, modelcontext
from pymc.pytensorf import PointFunc
@@ -459,6 +460,10 @@ def compute_log_likelihood_for_potential(
orig: https://github.com/pymc-devs/pymc/blob/92278278d4a8b78f17ed0f101eb29d0d9982eb45/pymc/stats/log_likelihood.py#L29C1-L128C31
discussion: https://discourse.pymc.io/t/using-a-random-variable-as-observed/7184/10

IMPORTANT NOTE 2024-08-04 in the intervening time, the source function that
this copies / modifies has changed hugely - it's going to cause substantial
pain to update :S

---

Compute elemwise log_likelihood of model given InferenceData with posterior group
@@ -529,8 +534,8 @@
n_pts = len(posterior_pts)
loglike_dict = _DefaultTrace(n_pts)
indices = range(n_pts)
if progressbar:
indices = progress_bar(indices, total=n_pts, display=progressbar)
# if progressbar:
# indices = progress_bar(indices, total=n_pts, display=progressbar)

for idx in indices:
loglikes_pts = elemwise_loglike_fn(posterior_pts[idx])