
Commit be61d64

Julien Roussel authored and committed
tutos made reproducible
1 parent 3185f8c · commit be61d64

19 files changed: +135, -83 lines

examples/tutorials/plot_tuto_benchmark_TS.py
Lines changed: 6 additions & 0 deletions

@@ -25,6 +25,10 @@
 from qolmat.benchmark import comparator, missing_patterns
 from qolmat.imputations import imputers
 from qolmat.utils import data, plot
+from sklearn import utils as sku
+
+seed = 1234
+rng = sku.check_random_state(seed)
 
 # %%
 # 1. Data
@@ -96,6 +100,7 @@
     n_iter_ou=15,
     dt=1e-3,
     p=1,
+    random_state=rng
 )
 imputer_mice = imputers.ImputerMICE(
     groups=("station",),
@@ -109,6 +114,7 @@
     groups=("station",),
     subset=cols_to_impute,
     ratio_masked=ratio_masked,
+    random_state=rng
 )
 
 dict_imputers = {
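The seeding pattern this commit applies to every tutorial is visible in this first file: a module-level seed is turned into a shared numpy `RandomState` with `sklearn.utils.check_random_state`, and that single generator is then passed as `random_state` to each stochastic qolmat object. A minimal sketch of the idea (the commented imputer/generator calls only mirror lines from the diffs and are illustrative, not extra API):

```python
# Minimal sketch of the reproducibility pattern used throughout the tutorials.
from sklearn import utils as sku

seed = 1234
rng = sku.check_random_state(seed)  # an int seed yields a seeded np.random.RandomState

# The same rng object is then reused by every component that draws random numbers,
# e.g. (mirroring the diffs):
#   imputer = imputers.ImputerRpcaNoisy(random_state=rng)
#   generator = missing_patterns.UniformHoleGenerator(n_splits=2, random_state=rng)

print(type(rng).__name__)   # RandomState
print(rng.randint(0, 10))   # deterministic for a fixed seed across runs
```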

examples/tutorials/plot_tuto_categorical.py
Lines changed: 7 additions & 2 deletions

@@ -8,12 +8,16 @@
 """
 
 from sklearn.pipeline import Pipeline
+from sklearn import utils as sku
 
 from qolmat.benchmark import comparator, missing_patterns
 from qolmat.imputations import imputers, preprocessing
 from qolmat.imputations.imputers import ImputerRegressor
 from qolmat.utils import data
 
+seed = 1234
+rng = sku.check_random_state(seed)
+
 # %%
 # 1. Titanic dataset
 # ---------------------------------------------------------------
@@ -39,7 +43,7 @@
 
 cols_num = df.select_dtypes(include="number").columns
 cols_cat = df.select_dtypes(exclude="number").columns
-imputer_rpca = imputers.ImputerRpcaNoisy()
+imputer_rpca = imputers.ImputerRpcaNoisy(random_state=rng)
 ohe = preprocessing.OneHotEncoderProjector(
     handle_unknown="ignore",
     handle_missing="return_nan",
@@ -58,7 +62,7 @@
 # - manage missing features (native to the HistGradientBoosting)
 
 pipestimator = preprocessing.make_robust_MixteHGB(avoid_new=True)
-imputer_hgb = ImputerRegressor(estimator=pipestimator, handler_nan="none")
+imputer_hgb = ImputerRegressor(estimator=pipestimator, handler_nan="none", random_state=rng)
 imputer_wrap_hgb = preprocessing.WrapperTransformer(imputer_hgb, bt)
 
 # %%
@@ -79,6 +83,7 @@
     subset=cols_to_impute,
     ratio_masked=ratio_masked,
     sample_proportional=False,
+    random_state=rng
 )
 metrics = ["rmse", "accuracy"]
 

examples/tutorials/plot_tuto_diffusion_models.py
Lines changed: 14 additions & 10 deletions

@@ -9,12 +9,15 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+from sklearn import utils as sku
 
 from qolmat.benchmark import comparator, missing_patterns
-from qolmat.imputations.diffusions.ddpms import TabDDPM, TsDDPM
 from qolmat.imputations.imputers_pytorch import ImputerDiffusion
 from qolmat.utils import data
 
+seed = 1234
+rng = sku.check_random_state(seed)
+
 logging.basicConfig(
     format="%(asctime)s %(levelname)-8s %(message)s",
     level=logging.INFO,
@@ -31,7 +34,7 @@
 # For this tutorial, we only use a small subset of this data
 # 1000 rows and 2 features (TEMP, PRES).
 
-df_data = data.get_data_corrupted("Beijing")
+df_data = data.get_data_corrupted("Beijing", random_state=rng)
 df_data = df_data[["TEMP", "PRES"]].iloc[:1000]
 df_data.index = df_data.index.set_levels(
     [df_data.index.levels[0], pd.to_datetime(df_data.index.levels[1])]
@@ -75,6 +78,7 @@
     batch_size=100,
     x_valid=df_data_valid,
     print_valid=True,
+    random_state=rng,
 )
 tabddpm = tabddpm.fit(df_data)
 
@@ -159,19 +163,19 @@
 # reconstruction errors (mae) but increases distribution distance (kl_columnwise).
 
 dict_imputers = {
-    "num_sampling=5": ImputerDiffusion(epochs=10, batch_size=100, num_sampling=5),
-    "num_sampling=10": ImputerDiffusion(epochs=10, batch_size=100, num_sampling=10),
+    "num_sampling=5": ImputerDiffusion(epochs=10, batch_size=100, num_sampling=5, random_state=rng),
+    "num_sampling=10": ImputerDiffusion(epochs=10, batch_size=100, num_sampling=10, random_state=rng),
 }
 
 comparison = comparator.Comparator(
     dict_imputers,
     selected_columns=df_data.columns,
-    generator_holes=missing_patterns.UniformHoleGenerator(n_splits=2),
+    generator_holes=missing_patterns.UniformHoleGenerator(n_splits=2, random_state=rng),
     metrics=["mae", "kl_columnwise"],
 )
 results = comparison.compare(df_data)
 
-results.groupby(axis=0, level=0).mean().groupby(axis=0, level=0).mean()
+results.groupby(level=0).mean().groupby(level=0).mean()
 
 # %%
 # 4. Hyperparameters for TsDDPM
@@ -205,7 +209,7 @@
 # but requires a longer training/inference time.
 
 dict_imputers = {
-    "tabddpm": ImputerDiffusion(model="TabDDPM", epochs=10, batch_size=100, num_sampling=5
+    "tabddpm": ImputerDiffusion(model="TabDDPM", epochs=10, batch_size=100, num_sampling=5, random_state=rng
     ),
     "tsddpm": ImputerDiffusion(
         model="TsDDPM",
@@ -214,19 +218,19 @@
         index_datetime="date",
         freq_str="5D",
         num_sampling=5,
-        is_rolling=False
+        is_rolling=False, random_state=rng
     ),
 }
 
 comparison = comparator.Comparator(
     dict_imputers,
     selected_columns=df_data.columns,
-    generator_holes=missing_patterns.UniformHoleGenerator(n_splits=2),
+    generator_holes=missing_patterns.UniformHoleGenerator(n_splits=2, random_state=rng),
     metrics=["mae", "kl_columnwise"],
 )
 results = comparison.compare(df_data)
 
-results.groupby(axis=0, level=0).mean().groupby(axis=0, level=0).mean()
+results.groupby(level=0).mean().groupby(level=0).mean()
 
 # %%
 # [1] Ho, Jonathan, Ajay Jain, and Pieter Abbeel. `Denoising diffusion probabilistic models.
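Besides the seeding, this file also drops the `axis` keyword from `results.groupby(...)`: grouping by an index level already operates on rows, and recent pandas releases deprecate the `axis` argument of `DataFrame.groupby`. A short sketch on a made-up results frame (the column and index labels are illustrative, not qolmat's actual benchmark output):

```python
# Sketch: groupby(level=0) on the row index is equivalent to the old
# groupby(axis=0, level=0) form, without the deprecated axis keyword.
import pandas as pd

results = pd.DataFrame(
    {"mae": [0.10, 0.20, 0.30, 0.40]},
    index=pd.MultiIndex.from_product([["TEMP", "PRES"], ["split0", "split1"]]),
)

# Old form (deprecated): results.groupby(axis=0, level=0).mean()
print(results.groupby(level=0).mean())  # one averaged row per first-level label
```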

examples/tutorials/plot_tuto_hole_generator.py
Lines changed: 13 additions & 8 deletions

@@ -21,10 +21,15 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+from sklearn import utils as sku
+from torch import rand
 
 from qolmat.benchmark import missing_patterns
 from qolmat.utils import data
 
+seed = 1234
+rng = sku.check_random_state(seed)
+
 # %%
 # 1. Data
 # ---------------------------------------------------------------
@@ -42,7 +47,7 @@
 columns = ["TEMP", "PRES", "DEWP", "RAIN", "WSPM"]
 df_data = df_data[columns]
 
-df = data.add_holes(df_data, ratio_masked=0.2, mean_size=120)
+df = data.add_holes(df_data, ratio_masked=0.2, mean_size=120, random_state=rng)
 cols_to_impute = df.columns
 
 # %%
@@ -169,8 +174,8 @@ def plot_cdf(
         axs[ind].plot(sorted_data, cdf, c="gray", lw=2, label="original")
 
     for df_mask, label, color in zip(list_df_mask, labels, colors):
-        array_mask = df_mask.copy()
-        array_mask[array_mask == True] = np.nan
+        array_mask = df_mask.astype(float).copy()
+        array_mask[df_mask] = np.nan
         hole_sizes_created = get_holes_sizes_column_wise(array_mask.to_numpy())
 
         for ind, (hole_created, col) in enumerate(
@@ -197,7 +202,7 @@ def plot_cdf(
 # Note this class is more suited for tabular datasets.
 
 uniform_generator = missing_patterns.UniformHoleGenerator(
-    n_splits=1, subset=df.columns, ratio_masked=0.1
+    n_splits=1, subset=df.columns, ratio_masked=0.1, random_state=rng
 )
 uniform_mask = uniform_generator.split(df)[0]
 
@@ -223,7 +228,7 @@ def plot_cdf(
 # :class:`~qolmat.benchmark.missing_patterns.UniformHoleGenerator` class.
 
 geometric_generator = missing_patterns.GeometricHoleGenerator(
-    n_splits=1, subset=cols_to_impute, ratio_masked=0.1
+    n_splits=1, subset=cols_to_impute, ratio_masked=0.1, random_state=rng
 )
 geometric_mask = geometric_generator.split(df)[0]
 
@@ -249,7 +254,7 @@ def plot_cdf(
 # is learned on each group: here on each station.
 
 empirical_generator = missing_patterns.EmpiricalHoleGenerator(
-    n_splits=1, subset=df.columns, ratio_masked=0.1, groups=("station",)
+    n_splits=1, subset=df.columns, ratio_masked=0.1, groups=("station",), random_state=rng
 )
 empirical_mask = empirical_generator.split(df)[0]
 
@@ -274,7 +279,7 @@ def plot_cdf(
 # :class:`~qolmat.benchmark.missing_patterns.MultiMarkovHoleGenerator` class.
 
 multi_markov_generator = missing_patterns.MultiMarkovHoleGenerator(
-    n_splits=1, subset=df.columns, ratio_masked=0.1
+    n_splits=1, subset=df.columns, ratio_masked=0.1, random_state=rng
 )
 multi_markov_mask = multi_markov_generator.split(df)[0]
 
@@ -297,7 +302,7 @@ def plot_cdf(
 # :class:`~qolmat.benchmark.missing_patterns.GroupedHoleGenerator` class.
 
 grouped_generator = missing_patterns.GroupedHoleGenerator(
-    n_splits=1, subset=df.columns, ratio_masked=0.1, groups=("station",)
+    n_splits=1, subset=df.columns, ratio_masked=0.1, groups=("station",), random_state=rng
 )
 grouped_mask = grouped_generator.split(df)[0]
 
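The two-line change inside `plot_cdf` is a dtype fix rather than a seeding change: writing `np.nan` into a boolean DataFrame forces an implicit upcast that newer pandas versions warn about and will eventually refuse, and comparing `== True` is redundant. A toy sketch of the new logic (the column names and values are made up):

```python
# Sketch: convert the boolean hole mask to float first, then use the boolean
# frame itself as the indexer; masked cells become NaN, the rest stay 0.0.
import numpy as np
import pandas as pd

df_mask = pd.DataFrame({"TEMP": [True, False, True], "PRES": [False, False, True]})

array_mask = df_mask.astype(float).copy()  # True -> 1.0, False -> 0.0
array_mask[df_mask] = np.nan               # holes become NaN
print(array_mask)
```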

examples/tutorials/plot_tuto_mcar.py
Lines changed: 3 additions & 2 deletions

@@ -13,18 +13,19 @@
 import numpy as np
 import pandas as pd
 from scipy.stats import norm
+from sklearn import utils as sku
 
 from qolmat.analysis.holes_characterization import LittleTest, PKLMTest
 from qolmat.benchmark.missing_patterns import UniformHoleGenerator
 
 plt.rcParams.update({"font.size": 12})
-
+seed = 1234
+rng = sku.check_random_state(seed)
 
 # %%
 # Generating random data
 # ----------------------
 
-rng = np.random.RandomState(42)
 data = rng.multivariate_normal(mean=[0, 0], cov=[[1, 0], [0, 1]], size=200)
 df = pd.DataFrame(data=data, columns=["Column 1", "Column 2"])
 
examples/tutorials/plot_tuto_mean_median.py
Lines changed: 7 additions & 3 deletions

@@ -4,7 +4,7 @@
 
 In this tutorial, we show how to use the Qolmat comparator
 (:class:`~qolmat.benchmark.comparator`) to choose
-the best imputation between imputation by the mean or the median
+the best imputation between two of the simplest imputation methods: mean or median
 (:class:`~qolmat.imputations.imputers.ImputerSimple`).
 The dataset used is the the numerical `superconduct` dataset and
 contains information on 21263 superconductors.
@@ -15,11 +15,15 @@
 import matplotlib
 import matplotlib.pyplot as plt
 import numpy as np
+from sklearn import utils as sku
 
 from qolmat.benchmark import comparator, missing_patterns
 from qolmat.imputations import imputers
 from qolmat.utils import data, plot
 
+seed = 1234
+rng = sku.check_random_state(seed)
+
 # %%
 # 1. Data
 # ---------------------------------------------------------------
@@ -33,7 +37,7 @@
 # In this way, each column has missing values.
 
 df = data.add_holes(
-    data.get_data("Superconductor"), ratio_masked=0.2, mean_size=120
+    data.get_data("Superconductor"), ratio_masked=0.2, mean_size=120, random_state=rng
 )
 
 # %%
@@ -91,7 +95,7 @@
 # ``subset=cols_to_impute``:
 
 generator_holes = missing_patterns.UniformHoleGenerator(
-    n_splits=2, subset=cols_to_impute, ratio_masked=0.1
+    n_splits=2, subset=cols_to_impute, ratio_masked=0.1, random_state=rng
 )
 df_mask = generator_holes.generate_mask(df)
 df_mask = np.invert(df_mask).astype("int")

qolmat/analysis/holes_characterization.py
Lines changed: 4 additions & 5 deletions

@@ -14,6 +14,7 @@
 
 from qolmat.imputations.imputers import ImputerEM
 from qolmat.utils.input_check import check_pd_df_dtypes
+from qolmat.utils.utils import RandomSetting
 
 
 class McarTest(ABC):
@@ -32,9 +33,7 @@ class McarTest(ABC):
 
     """
 
-    def __init__(
-        self, random_state: Union[None, int, np.random.RandomState] = None
-    ):
+    def __init__(self, random_state: RandomSetting = None):
         """Initialize the McarTest class with a random state.
 
         Parameters
@@ -95,7 +94,7 @@ class LittleTest(McarTest):
     def __init__(
         self,
         imputer: Optional[ImputerEM] = None,
-        random_state: Union[None, int, np.random.RandomState] = None,
+        random_state: RandomSetting = None,
     ):
         super().__init__()
         if imputer and imputer.model != "multinormal":
@@ -203,7 +202,7 @@ def __init__(
         nb_trees_per_proj: int = 200,
         compute_partial_p_values: bool = False,
         encoder: Union[None, OneHotEncoder] = None,
-        random_state: Union[None, int, np.random.RandomState] = None,
+        random_state: RandomSetting = None,
     ):
         super().__init__(random_state=random_state)
         self.nb_projections = nb_projections
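The new `RandomSetting` annotation imported from `qolmat.utils.utils` replaces the repeated `Union[None, int, np.random.RandomState]` signatures. Its definition is not shown in this diff; judging from the annotations it replaces, it is presumably a type alias along these lines (a hypothetical reconstruction, not the library's verbatim code):

```python
# Hypothetical sketch of the RandomSetting alias, inferred from the Union
# annotations it replaces; the real definition lives in qolmat/utils/utils.py
# and may differ.
from typing import Union

import numpy as np

RandomSetting = Union[None, int, np.random.RandomState]


def init_test(random_state: RandomSetting = None) -> None:
    """Accepts None, an int seed, or a RandomState, like the McarTest API."""
    print(type(random_state).__name__)


init_test(1234)                       # int
init_test(None)                       # NoneType
init_test(np.random.RandomState(0))   # RandomState
```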
