Merged
Changes from 6 commits
10 changes: 7 additions & 3 deletions docs/analysis.rst
@@ -11,7 +11,7 @@ The analysis module provides tools to characterize the type of holes.

The MNAR case is the trickiest: the user must first consider whether their missing-data mechanism is MNAR. In the meantime, we assume that the missing-data mechanism is ignorable (i.e., it is not MNAR). If an MNAR mechanism is suspected, please see the article :ref:`An approach to test for MNAR [1]<Noonan-article>` for relevant actions.

-Then Qolmat proposes a test to determine whether the missing-data mechanism is MCAR or MAR.
+Then Qolmat proposes two tests to determine whether the missing-data mechanism is MCAR or MAR.

2. How to use the results
-------------------------
@@ -45,12 +45,16 @@ The MCAR missing-data mechanism means that there is independence between the pre
a. Little's Test
^^^^^^^^^^^^^^^^

-The best-known MCAR test is the :ref:`Little [2]<Little-article>` test, and it has been implemented in :class:`LittleTest`. Keep in mind that Little's test is designed to test the homogeneity of means across the missing patterns and will not detect heterogeneity of covariance across missing patterns.
+The best-known MCAR test is the :ref:`Little [1]<Little-article>` test, and it has been implemented in :class:`LittleTest`. Keep in mind that Little's test is designed to test the homogeneity of means across the missing patterns and will not detect heterogeneity of covariance across missing patterns.
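
For a quick illustration, here is a minimal sketch of running the test, assuming that :class:`LittleTest` can be imported from ``qolmat.analysis.holes_characterization``, accepts a ``random_state``, and exposes a ``test`` method returning a p-value (check these names against the installed Qolmat version):

.. code-block:: python

    import numpy as np
    import pandas as pd

    from qolmat.analysis.holes_characterization import LittleTest

    # Toy MCAR data: holes drawn independently of the underlying values.
    rng = np.random.default_rng(42)
    df = pd.DataFrame(rng.normal(size=(200, 2)), columns=["var1", "var2"])
    df.loc[rng.random(200) < 0.2, "var1"] = np.nan

    little_test = LittleTest(random_state=42)
    p_value = little_test.test(df)
    # A small p-value rejects homogeneity of means across missing
    # patterns, i.e. provides evidence against MCAR.
    print(f"Little's test p-value: {p_value:.3f}")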

b. PKLM Test
^^^^^^^^^^^^

-The :ref:`PKLM [2]<PKLM-article>` (Projected Kullback-Leibler MCAR) test compares the distributions of different missing patterns on random projections in the variable space of the data. This recent test applies to mixed-type data. It is not implemented yet in Qolmat.
+The :ref:`PKLM [2]<PKLM-article>` (Projected Kullback-Leibler MCAR) test compares the distributions of different missing patterns on random projections in the variable space of the data. This recent test applies to mixed-type data. The :class:`PKLMTest` is now implemented in Qolmat.
+To carry out this test, we perform random projections in the variable space of the data. Each random projection yields a fully observed sub-matrix and an associated set of missing patterns.
+The idea is then to compare the distributions of these missing patterns through the Kullback-Leibler divergence.
+To do this, the distribution of each pattern is estimated using Random Forests.
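
As a quick illustration, here is a minimal sketch under similar assumptions (:class:`PKLMTest` importable from ``qolmat.analysis.holes_characterization``, a ``test`` method returning a p-value, and a ``random_state`` argument; all other constructor arguments left at their defaults):

.. code-block:: python

    import numpy as np
    import pandas as pd

    from qolmat.analysis.holes_characterization import PKLMTest

    # Toy MAR data: missingness in "var1" depends on the observed "var2",
    # so an MCAR test should tend to reject.
    rng = np.random.default_rng(42)
    columns = ["var1", "var2", "var3", "var4"]
    df = pd.DataFrame(rng.normal(size=(200, 4)), columns=columns)
    df.loc[df["var2"] > 0.5, "var1"] = np.nan

    pklm_test = PKLMTest(random_state=42)
    p_value = pklm_test.test(df)
    print(f"PKLM test p-value: {p_value:.3f}")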


References
----------
Binary file modified docs/images/schema_qolmat.png
25 changes: 19 additions & 6 deletions examples/tutorials/plot_tuto_benchmark_TS.py
@@ -78,7 +78,9 @@
ratio_masked = 0.1

imputer_median = imputers.ImputerSimple(groups=("station",), strategy="median")
-imputer_interpol = imputers.ImputerInterpolation(groups=("station",), method="linear")
+imputer_interpol = imputers.ImputerInterpolation(
+    groups=("station",), method="linear"
+)
imputer_residuals = imputers.ImputerResiduals(
groups=("station",),
period=365,
@@ -103,7 +105,10 @@
)

generator_holes = missing_patterns.EmpiricalHoleGenerator(
-    n_splits=4, groups=("station",), subset=cols_to_impute, ratio_masked=ratio_masked
+    n_splits=4,
+    groups=("station",),
+    subset=cols_to_impute,
+    ratio_masked=ratio_masked,
)

dict_imputers = {
@@ -142,11 +147,17 @@
# Aotizhongxin

df_plot = df[cols_to_impute]
-dfs_imputed = {name: imp.fit_transform(df_plot) for name, imp in dict_imputers.items()}
+dfs_imputed = {
+    name: imp.fit_transform(df_plot) for name, imp in dict_imputers.items()
+}
station = "Aotizhongxin"
df_station = df_plot.loc[station]
-dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()}
-fig, axs = plt.subplots(3, 1, sharex=True, figsize=(10, 3 * len(cols_to_impute)))
+dfs_imputed_station = {
+    name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()
+}
+fig, axs = plt.subplots(
+    3, 1, sharex=True, figsize=(10, 3 * len(cols_to_impute))
+)
for col, ax in zip(cols_to_impute, axs.flatten()):
    values_orig = df_station[col]
    ax.plot(values_orig, ".", color="black", label="original")
@@ -174,7 +185,9 @@
fig = plt.figure(figsize=(10, 10))
i_plot = 1
for i, col in enumerate(cols_to_impute[:-1]):
-    for i_imputer, (name_imputer, df_imp) in enumerate(dfs_imputed_station.items()):
+    for i_imputer, (name_imputer, df_imp) in enumerate(
+        dfs_imputed_station.items()
+    ):
        ax = fig.add_subplot(n_columns, n_imputers, i_plot)
        plot.compare_covariances(
            df_station,
18 changes: 14 additions & 4 deletions examples/tutorials/plot_tuto_diffusion_models.py
@@ -66,7 +66,11 @@
df_data_valid = df_data.iloc[:500]

tabddpm = ImputerDiffusion(
-    model=TabDDPM(), epochs=10, batch_size=100, x_valid=df_data_valid, print_valid=True
+    model=TabDDPM(),
+    epochs=10,
+    batch_size=100,
+    x_valid=df_data_valid,
+    print_valid=True,
)
tabddpm = tabddpm.fit(df_data)

@@ -150,8 +154,12 @@
# reconstruction errors (mae) but increases distribution distance (KL_columnwise).

dict_imputers = {
"num_sampling=5": ImputerDiffusion(model=TabDDPM(num_sampling=5), epochs=10, batch_size=100),
"num_sampling=10": ImputerDiffusion(model=TabDDPM(num_sampling=10), epochs=10, batch_size=100),
"num_sampling=5": ImputerDiffusion(
model=TabDDPM(num_sampling=5), epochs=10, batch_size=100
),
"num_sampling=10": ImputerDiffusion(
model=TabDDPM(num_sampling=10), epochs=10, batch_size=100
),
}

comparison = comparator.Comparator(
@@ -196,7 +204,9 @@
# but requires a longer training/inference time.

dict_imputers = {
"tabddpm": ImputerDiffusion(model=TabDDPM(num_sampling=5), epochs=10, batch_size=100),
"tabddpm": ImputerDiffusion(
model=TabDDPM(num_sampling=5), epochs=10, batch_size=100
),
"tsddpm": ImputerDiffusion(
model=TsDDPM(num_sampling=5, is_rolling=False),
epochs=10,
25 changes: 20 additions & 5 deletions examples/tutorials/plot_tuto_hole_generator.py
@@ -14,6 +14,7 @@
It consists of hourly air-pollutant data from 12 Chinese nationally controlled
air-quality monitoring sites.
"""

from typing import List

import matplotlib
@@ -49,7 +50,9 @@
# Missing values are in white, while observed ones are in black.

plt.figure(figsize=(15, 4))
-plt.imshow(df.notna().values.T, aspect="auto", cmap="binary", interpolation="none")
+plt.imshow(
+    df.notna().values.T, aspect="auto", cmap="binary", interpolation="none"
+)
plt.yticks(range(len(df.columns)), df.columns)
plt.xlabel("Samples", fontsize=12)
plt.grid(False)
@@ -96,7 +99,9 @@ def visualise_missing_values(df_init: pd.DataFrame, df_mask: pd.DataFrame):
    colorsList = [(0.9, 0, 0), (0, 0, 0), (0.8, 0.8, 0.8)]
    custom_cmap = matplotlib.colors.ListedColormap(colorsList)
    plt.figure(figsize=(15, 4))
-    plt.imshow(df_tot.values.T, aspect="auto", cmap=custom_cmap, interpolation="none")
+    plt.imshow(
+        df_tot.values.T, aspect="auto", cmap=custom_cmap, interpolation="none"
+    )
    plt.yticks(range(len(df_tot.columns)), df_tot.columns)
    plt.xlabel("Samples", fontsize=12)
    plt.grid(False)
@@ -156,7 +161,9 @@ def plot_cdf(
    _, axs = plt.subplots(1, df.shape[1], sharey=True, figsize=(15, 3))

    hole_sizes_original = get_holes_sizes_column_wise(df.to_numpy())
-    for ind, (hole_original, col) in enumerate(zip(hole_sizes_original, df.columns)):
+    for ind, (hole_original, col) in enumerate(
+        zip(hole_sizes_original, df.columns)
+    ):
        sorted_data = np.sort(hole_original)
        cdf = np.arange(1, len(sorted_data) + 1) / len(sorted_data)
        axs[ind].plot(sorted_data, cdf, c="gray", lw=2, label="original")
@@ -166,7 +173,9 @@ def plot_cdf(
        array_mask[array_mask == True] = np.nan
        hole_sizes_created = get_holes_sizes_column_wise(array_mask.to_numpy())

-        for ind, (hole_created, col) in enumerate(zip(hole_sizes_created, df.columns)):
+        for ind, (hole_created, col) in enumerate(
+            zip(hole_sizes_created, df.columns)
+        ):
            sorted_data = np.sort(hole_created)
            cdf = np.arange(1, len(sorted_data) + 1) / len(sorted_data)
            axs[ind].plot(sorted_data, cdf, c=color, lw=2, label=label)
@@ -309,7 +318,13 @@

plot_cdf(
    df,
-    [uniform_mask, geometric_mask, empirical_mask, multi_markov_mask, grouped_mask],
+    [
+        uniform_mask,
+        geometric_mask,
+        empirical_mask,
+        multi_markov_mask,
+        grouped_mask,
+    ],
    ["uniform", "geometric", "empirical", "multi markov", "grouped"],
    ["tab:orange", "tab:blue", "tab:green", "tab:pink", "tab:olive"],
)