2 changes: 1 addition & 1 deletion .flake8
@@ -1,5 +1,5 @@
[flake8]
ignore = E203, E266, W291, W293, F401, F403, E501, W503, W605, C901
ignore = E203, E266, W291, W293, F401, F403, E501, W503, W605, C901, E712
max-line-length = 88
max-doc-length = 144
max-complexity = 18
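For context, a minimal sketch (illustrative only, not part of this PR) of the comparison style that the newly ignored flake8 rule E712 flags; explicit `== True` / `== False` comparisons are idiomatic when building elementwise pandas masks, which is a common reason to silence the rule:

```python
import pandas as pd

df = pd.DataFrame({"flag": [True, False, True], "x": [1, 2, 3]})

# E712 flags literal comparisons such as `== False`, preferring truthiness checks,
# but `is False` does not broadcast over a Series, so the literal form stays useful
mask = df["flag"] == False
print(df[mask])
```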
12 changes: 6 additions & 6 deletions .pre-commit-config.yaml
@@ -4,7 +4,7 @@ default_language_version:
default_stages: [commit, push]
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks # general checks
rev: v4.5.0
rev: v4.6.0
hooks:
- id: check-added-large-files
args: ['--maxkb=1024']
@@ -31,7 +31,7 @@ repos:
- id: no-print-statements
files: ^oreum_core/
- repo: https://github.com/psf/black # black formatter
rev: 23.12.1
rev: 24.8.0
hooks:
- id: black
files: ^oreum_core/
@@ -41,26 +41,26 @@ repos:
- id: isort
files: ^oreum_core/
- repo: https://github.com/pycqa/flake8 # flake8 linter
rev: 7.0.0
rev: 7.1.0
hooks:
- id: flake8
files: ^oreum_core/
- repo: https://github.com/pycqa/bandit # basic security checks for python code
rev: 1.7.6
rev: 1.7.9
hooks:
- id: bandit
files: ^oreum_core/
args: ["--config", "pyproject.toml"]
additional_dependencies: ["bandit[toml]"]
- repo: https://github.com/econchick/interrogate # check for docstrings
rev: 1.5.0
rev: 1.7.0
hooks:
- id: interrogate
files: ^oreum_core/
args: [--config, pyproject.toml]
pass_filenames: false # see https://github.com/econchick/interrogate/issues/60#issuecomment-1180262851
- repo: https://gitlab.com/iam-cms/pre-commit-hooks # apply Apache2 header
rev: v0.4.0
rev: v0.6.0
hooks:
- id: apache-license
files: ^oreum_core/
347 changes: 176 additions & 171 deletions LICENSES_THIRD_PARTY.md

8 changes: 4 additions & 4 deletions assets/img/interrogate_badge.svg
2 changes: 1 addition & 1 deletion oreum_core/__init__.py
@@ -15,7 +15,7 @@
"""Core tools for use on projects by Oreum Industries"""
import logging

__version__ = "0.8.1"
__version__ = "0.9.0"

# logger goes to null handler by default
# packages that import oreum_core can override this and direct elsewhere
8 changes: 7 additions & 1 deletion oreum_core/curate/__init__.py
@@ -14,7 +14,13 @@

# curate/
"""Various classes & functions for data curation"""
from .data_io import PandasCSVIO, PandasParquetIO, SimpleStringIO, copy_csv2md
from .data_io import (
PandasCSVIO,
PandasExcelIO,
PandasParquetIO,
SimpleStringIO,
copy_csv2md,
)
from .data_transform import (
DatasetReshaper,
DatatypeConverter,
2 changes: 1 addition & 1 deletion oreum_core/curate/data_io.py
@@ -102,7 +102,7 @@ def __init__(self, *args, **kwargs):

def read(self, fn: str, *args, **kwargs) -> pd.DataFrame:
"""Read excel fn from rootdir, pass args kwargs to pd.read_excel"""
fn = Path(fn).with_suffix('.xslx')
fn = Path(fn).with_suffix('.xlsx')
fqn = self.get_path_read(fn)
_log.info(f'Read from {str(fqn.resolve())}')
return pd.read_excel(str(fqn), *args, **kwargs)
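A hedged usage sketch of the newly exported PandasExcelIO; the constructor arguments are assumed from the BaseFileIO pattern and are not shown in this diff. The suffix fix above means reads now resolve to a real `.xlsx` path:

```python
from oreum_core.curate import PandasExcelIO

# assumed constructor usage: BaseFileIO subclasses typically take a root directory;
# the exact kwargs are not confirmed by this diff
xlio = PandasExcelIO(rootdir=['data'])

# read() forces the '.xlsx' suffix (previously the misspelled '.xslx') and
# forwards remaining args / kwargs to pd.read_excel
df = xlio.read('policies_2024', sheet_name='raw')
```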
1 change: 1 addition & 0 deletions oreum_core/curate/data_transform.py
@@ -228,6 +228,7 @@ def create_dfcmb(self, df: pd.DataFrame, ftsd: dict) -> pd.DataFrame:
dfcmb = pd.DataFrame(index=[0])
fts_factor = ftsd.get('fcat', []) + ftsd.get('fbool', [])
for ft in fts_factor:
ft = ft[2:-1] if ft[:2] == 'F(' else ft
colnames_pre = list(dfcmb.columns.values)
s = pd.Series(np.unique(df[ft]), name=ft)
dfcmb = pd.concat([dfcmb, s], axis=1, join='outer', ignore_index=True)
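The single added line above unwraps a patsy-style `F(...)` marker from factor feature names before the dataframe lookup; a minimal illustration (toy values, not from the repo):

```python
# toy illustration of the added guard: 'F(region)' -> 'region', plain names pass through
for ft in ['F(region)', 'is_active']:
    ft = ft[2:-1] if ft[:2] == 'F(' else ft
    print(ft)
```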
2 changes: 1 addition & 1 deletion oreum_core/eda/__init__.py
@@ -27,7 +27,7 @@
tril_nan,
)
from .describe import describe, display_fw, display_ht, get_fts_by_dtype
from .eda_io import FigureIO, display_image_file, output_data_dict
from .eda_io import FigureIO, output_data_dict
from .plot import ( # plot_umap,; plot_r2_range,; plot_r2_range_pair,
plot_accuracy,
plot_binary_performance,
42 changes: 24 additions & 18 deletions oreum_core/eda/describe.py
@@ -36,6 +36,7 @@ def describe(
limit: int = 50, # MB
get_mode: bool = False,
get_counts: bool = True,
get_cr94: bool = False,
reset_index: bool = True,
return_df: bool = False,
**kwargs,
@@ -68,7 +69,12 @@
df = df.reset_index()

# start with pandas describe, add on dtypes
dfdesc = df.describe(include='all').T
quantiles = [0.25, 0.5, 0.75] # the default
percentile_names = ['25%', '50%', '75%']
if get_cr94:
quantiles = [0.03] + quantiles + [0.97]
percentile_names = ['3%'] + percentile_names + ['97%']
dfdesc = df.describe(include='all', percentiles=quantiles).T

dfout = pd.concat((dfdesc, df.dtypes), axis=1, join='outer', sort=False)
dfout = dfout.loc[df.columns.values]
@@ -100,23 +106,23 @@
dfout.loc[ft, 'min'] = df[ft].value_counts().index.min()
dfout.loc[ft, 'max'] = df[ft].value_counts().index.max()

fts_out_all = [
'dtype',
'count_null',
'count_inf',
'count_zero',
'count_unique',
'top',
'freq',
'sum',
'mean',
'std',
'min',
'25%',
'50%',
'75%',
'max',
]
fts_out_all = (
[
'dtype',
'count_null',
'count_inf',
'count_zero',
'count_unique',
'top',
'freq',
'sum',
'mean',
'std',
'min',
]
+ percentile_names
+ ['max']
)
fts_out = [f for f in fts_out_all if f in dfout.columns.values]

# add mode and mode count WARNING takes forever for large arrays (>10k row)
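A hedged usage sketch of the new `get_cr94` flag (names taken from the diff; the extra columns follow from the percentiles list built above, roughly a 94% central range):

```python
import numpy as np
import pandas as pd
from oreum_core.eda import describe

rng = np.random.default_rng(42)
df = pd.DataFrame({'x': rng.normal(size=1_000)})

# get_cr94=True passes percentiles=[0.03, 0.25, 0.5, 0.75, 0.97] to df.describe,
# so the summary gains '3%' and '97%' columns alongside the defaults
dfd = describe(df, get_cr94=True, return_df=True)
print([c for c in dfd.columns if c.endswith('%')])
```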
87 changes: 51 additions & 36 deletions oreum_core/eda/eda_io.py
@@ -21,16 +21,24 @@
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import figure

from ..curate.data_io import PandasExcelIO
from ..utils.file_io import BaseFileIO
from .describe import describe, get_fts_by_dtype

__all__ = ['FigureIO', 'display_image_file', 'output_data_dict']
__all__ = ['FigureIO', 'output_data_dict']

_log = logging.getLogger(__name__)

sns.set_theme(
style='darkgrid',
palette='muted',
context='notebook',
rc={'figure.dpi': 72, 'savefig.dpi': 144, 'figure.figsize': (12, 4)},
)


class FigureIO(BaseFileIO):
"""Helper class to save matplotlib.figure.Figure objects to image file"""
@@ -47,41 +55,48 @@ def write(self, f: figure.Figure, fn: str, *args, **kwargs) -> Path:
_log.info(f'Written to {str(fqn.resolve())}')
return fqn


def display_image_file(
fqn: str, title: str = None, figsize: tuple = (12, 6)
) -> figure.Figure:
"""Hacky way to display pre-created image file in a Notebook
such that nbconvert can see it and render to PDF
Force to max width 16 inches, for fullwidth render in live Notebook and PDF

NOTE:
Alternatives are bad
1. This one is entirely missed by nbconvert at render to PDF
# <img src="img.jpg" style="float:center; width:900px" />

2. This one causes following markdown to render monospace in PDF
# from IPython.display import Image
# Image("./assets/img/oreum_eloss_blueprint3.jpg", retina=True)
"""
img = mpimg.imread(fqn)
f, axs = plt.subplots(1, 1, figsize=figsize)
_ = axs.imshow(img)
ax = plt.gca()
_ = ax.grid(False)
_ = ax.set_frame_on(False)
_ = plt.tick_params(
top=False,
bottom=False,
left=False,
right=False,
labelleft=False,
labelbottom=False,
)
if title is not None:
_ = f.suptitle(f'{title}', y=1.0)
_ = f.tight_layout()
return f
def read(
self,
fqn: Path = None,
fn: str = None,
extension: str = '.png',
title: str = None,
figsize: tuple = (12, 4),
) -> figure.Figure:
"""Hacky way to display pre-created image file in a Notebook such that
nbconvert can see it and render to PDF
If fqn is not supplied, this builds the path from fn via get_path_read
Render according to usual rcParams (set at module-level)
NOTE:
All the alternatives are bad
1. This one is entirely missed by nbconvert at render to PDF
# <img src="img.jpg" style="float:center; width:900px" />

2. This one causes following markdown to render monospace in PDF
# from IPython.display import Image
# Image("./assets/img/oreum_eloss_blueprint3.jpg", retina=True)
"""
if fn is not None:
fqn = self.get_path_read(Path(self.snl.clean(fn)).with_suffix(extension))
img = mpimg.imread(fqn)
f, axs = plt.subplots(1, 1, figsize=figsize)
_ = axs.imshow(img)
ax = plt.gca()
_ = ax.grid(False)
_ = ax.set_frame_on(False)
_ = plt.tick_params(
top=False,
bottom=False,
left=False,
right=False,
labelleft=False,
labelbottom=False,
)
if title is not None:
_ = f.suptitle(f'{title}', fontsize=12, y=1.0)
_ = f.tight_layout()
_log.info(f'Read image from {str(fqn.resolve())}')
return f


def output_data_dict(
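A hedged usage sketch of the relocated image display, now a FigureIO.read method rather than the removed module-level display_image_file; the constructor arguments are assumed from the BaseFileIO pattern:

```python
from oreum_core.eda import FigureIO

# assumed constructor usage; only read()'s signature is confirmed by this diff
fio = FigureIO(rootdir=['plots'])

# builds '<rootdir>/loss_dist.png' via get_path_read, imshows it on a frameless
# Axes and returns the Figure so nbconvert can render it into a PDF
f = fio.read(fn='loss_dist', extension='.png', title='Loss distribution')
```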
13 changes: 10 additions & 3 deletions oreum_core/eda/plot.py
@@ -15,7 +15,6 @@
# eda.plot.py
"""EDA Plotting"""
import logging
from textwrap import wrap
from typing import Literal

import matplotlib.pyplot as plt
@@ -66,6 +65,13 @@
RSD = 42
rng = np.random.default_rng(seed=RSD)

sns.set_theme(
style='darkgrid',
palette='muted',
context='notebook',
rc={'figure.dpi': 72, 'savefig.dpi': 144, 'figure.figsize': (12, 4)},
)


def _get_kws_styling() -> dict:
"""Common styling kws for plots"""
@@ -867,8 +873,9 @@ def plot_estimate(
arr_overplot: np.array = None,
**kwargs,
) -> figure.Figure:
"""Plot distribution for estimates, either PPC or bootstrapped, no grouping
Optional overplot bootstrapped dfboot"""
"""Plot distribution for univariate estimates, either PPC or bootstrapped
no grouping. Optionally overplot bootstrapped dfboot"""
# TODO: Extend this to multivariate grouping
txtadd = kwargs.pop('txtadd', None)
sty = _get_kws_styling()
clr = color if color is not None else sns.color_palette()[0]
1 change: 1 addition & 0 deletions oreum_core/model_pymc/__init__.py
@@ -32,6 +32,7 @@
from .describe import (
describe_dist,
extract_yobs_yhat,
get_mdlvt_specific_nm,
get_summary,
model_desc,
print_rvs,
11 changes: 6 additions & 5 deletions oreum_core/model_pymc/base.py
@@ -63,10 +63,10 @@ def __init__(self, **kwargs):
chains=4,
cores=4,
target_accept=0.8,
idata_kwargs={
"log_likelihood": True, # usually useful
## TODO only in 5.16 "log_prior": True, # possibly useful?
},
idata_kwargs=dict(
log_likelihood=True, # usually useful
log_prior=True, # possibly useful?
),
progressbar=True,
)
self.rvs_for_posterior_plots = []
@@ -263,7 +263,8 @@ def update_idata(self, idata: az.InferenceData, replace: bool = False) -> None:

def debug(self):
"""Convenience to run debug on logp and random, and
assert no MeasurableVariable nodes in the graph"""
assert no MeasurableVariable nodes in the graph
TODO catch these outputs in the log"""
if self.model is not None:
assert_no_rvs(self.model.logp())
_ = self.model.debug(fn='logp', verbose=True)
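For context, a minimal standalone sketch (a toy model of my own, not this package's API) of how such sampler defaults reach pm.sample; per the removed TODO, the log_prior entry needs pymc >= 5.16:

```python
import pymc as pm

with pm.Model():
    mu = pm.Normal('mu', 0.0, 1.0)
    y = pm.Normal('y', mu, 1.0, observed=[0.1, -0.3, 0.7])
    idata = pm.sample(
        draws=500, tune=500, chains=4, cores=4, target_accept=0.8,
        # store pointwise log_likelihood and log_prior groups in the InferenceData
        idata_kwargs=dict(log_likelihood=True, log_prior=True),
        progressbar=True,
    )
```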
11 changes: 8 additions & 3 deletions oreum_core/model_pymc/calc.py
@@ -24,7 +24,8 @@
import pytensor.gradient as tg
import pytensor.tensor as pt
from arviz import InferenceData, dict_to_dataset
from fastprogress import progress_bar

# from fastprogress import progress_bar
from pymc.backends.arviz import _DefaultTrace, coords_and_dims_for_inferencedata
from pymc.model import Model, modelcontext
from pymc.pytensorf import PointFunc
@@ -459,6 +460,10 @@ def compute_log_likelihood_for_potential(
orig: https://github.com/pymc-devs/pymc/blob/92278278d4a8b78f17ed0f101eb29d0d9982eb45/pymc/stats/log_likelihood.py#L29C1-L128C31
discussion: https://discourse.pymc.io/t/using-a-random-variable-as-observed/7184/10

IMPORTANT NOTE 2024-08-04 in the intervening time, the source function that
this copies / modifies has changed hugely - it's going to cause substantial
pain to update :S

---

Compute elemwise log_likelihood of model given InferenceData with posterior group
@@ -529,8 +534,8 @@
n_pts = len(posterior_pts)
loglike_dict = _DefaultTrace(n_pts)
indices = range(n_pts)
if progressbar:
indices = progress_bar(indices, total=n_pts, display=progressbar)
# if progressbar:
# indices = progress_bar(indices, total=n_pts, display=progressbar)

for idx in indices:
loglikes_pts = elemwise_loglike_fn(posterior_pts[idx])