|
1 | 1 | import numpy as np  # noqa: D100
2 | 2 | import pandas as pd
3 | 3 |
4 |  | -
5 |  | -def moy_p(V, weights):
6 |  | -    """Compute the weighted mean of a vector, ignoring NaNs.
7 |  | -
8 |  | -    Parameters
9 |  | -    ----------
10 |  | -    V : array-like
11 |  | -        Input vector with possible NaN values.
12 |  | -    weights : array-like
13 |  | -        Weights corresponding to each element in V.
14 |  | -
15 |  | -    Returns
16 |  | -    -------
17 |  | -    float
18 |  | -        Weighted mean of non-NaN elements.
19 |  | -
20 |  | -    """
21 |  | -    mask = ~np.isnan(V)
22 |  | -    total_weight = np.sum(weights[mask])
23 |  | -    if total_weight == 0:
24 |  | -        return 0.0  # or use np.finfo(float).eps for a small positive value
25 |  | -    return np.sum(V[mask] * weights[mask]) / total_weight
26 |  | -
27 |  | -
28 |  | -def tab_disjonctif_NA(df):
29 |  | -    """Create a disjunctive (one-hot encoded).
30 |  | -
31 |  | -    Parameters
32 |  | -    ----------
33 |  | -    df : DataFrame
34 |  | -        Input DataFrame with categorical and numeric variables.
35 |  | -
36 |  | -    Returns
37 |  | -    -------
38 |  | -    DataFrame
39 |  | -        Disjunctive table with one-hot encoding.
40 |  | -
41 |  | -    """  # noqa: E501
42 |  | -    df_encoded_list = []
43 |  | -    for col in df.columns:
44 |  | -        if df[col].dtype.name == "category" or df[col].dtype == object:
45 |  | -            df[col] = df[col].astype("category")
46 |  | -            # Include '__MISSING__' as a category if not already present
47 |  | -            if "__MISSING__" not in df[col].cat.categories:
48 |  | -                df[col] = df[col].cat.add_categories(["__MISSING__"])
49 |  | -            # Fill missing values with '__MISSING__'
50 |  | -            df[col] = df[col].fillna("__MISSING__")
51 |  | -            # One-hot encode the categorical variable
52 |  | -            encoded = pd.get_dummies(
53 |  | -                df[col],
54 |  | -                prefix=col,
55 |  | -                prefix_sep="_",
56 |  | -                dummy_na=False,
57 |  | -                dtype=float,
58 |  | -            )
59 |  | -            df_encoded_list.append(encoded)
60 |  | -        else:
61 |  | -            # Numeric column; keep as is
62 |  | -            df_encoded_list.append(df[[col]])
63 |  | -    # Concatenate all encoded columns
64 |  | -    df_encoded = pd.concat(df_encoded_list, axis=1)
65 |  | -    return df_encoded
66 |  | -
67 |  | -
68 |  | -def tab_disjonctif_prop(df, seed=None):
69 |  | -    """Perform probabilistic imputation for categorical columns using observed
70 |  | -    value distributions, without creating a separate missing category.
71 |  | -
72 |  | -    Parameters
73 |  | -    ----------
74 |  | -    df : DataFrame
75 |  | -        DataFrame with categorical columns to impute.
76 |  | -    seed : int, optional
77 |  | -        Random seed for reproducibility. Default is None.
78 |  | -
79 |  | -    Returns
80 |  | -    -------
81 |  | -    DataFrame
82 |  | -        Disjunctive coded DataFrame with missing values probabilistically
83 |  | -        imputed.
84 |  | -
85 |  | -    """  # noqa: D205
86 |  | -    if seed is not None:
87 |  | -        np.random.seed(seed)
88 |  | -    df = df.copy()
89 |  | -    df_encoded_list = []
90 |  | -    for col in df.columns:
91 |  | -        if df[col].dtype.name == "category" or df[col].dtype == object:
92 |  | -            # Ensure categories are strings
93 |  | -            df[col] = df[col].cat.rename_categories(
94 |  | -                df[col].cat.categories.astype(str)
95 |  | -            )
96 |  | -            observed = df[col][df[col].notna()]
97 |  | -            categories = df[col].cat.categories.tolist()
98 |  | -            # Get observed frequencies
99 |  | -            freqs = observed.value_counts(normalize=True)
100 |  | -            # Impute missing values based on observed frequencies
101 |  | -            missing_indices = df[col][df[col].isna()].index
102 |  | -            if len(missing_indices) > 0:
103 |  | -                imputed_values = np.random.choice(
104 |  | -                    freqs.index, size=len(missing_indices), p=freqs.values
105 |  | -                )
106 |  | -                df.loc[missing_indices, col] = imputed_values
107 |  | -            # One-hot encode without creating missing category
108 |  | -            encoded = pd.get_dummies(
109 |  | -                df[col],
110 |  | -                prefix=col,
111 |  | -                prefix_sep="_",
112 |  | -                dummy_na=False,
113 |  | -                dtype=float,
114 |  | -            )
115 |  | -            col_names = [f"{col}_{cat}" for cat in categories]
116 |  | -            encoded = encoded.reindex(columns=col_names, fill_value=0.0)
117 |  | -            df_encoded_list.append(encoded)
118 |  | -        else:
119 |  | -            df_encoded_list.append(df[[col]])
120 |  | -    df_encoded = pd.concat(df_encoded_list, axis=1)
121 |  | -    return df_encoded
122 |  | -
123 |  | -
124 |  | -def find_category(df_original, tab_disj):
125 |  | -    """Reconstruct the original categorical variables from the disjunctive.
126 |  | -
127 |  | -    Parameters
128 |  | -    ----------
129 |  | -    df_original : DataFrame
130 |  | -        Original DataFrame with categorical variables.
131 |  | -    tab_disj : DataFrame
132 |  | -        Disjunctive table after imputation.
133 |  | -
134 |  | -    Returns
135 |  | -    -------
136 |  | -    DataFrame
137 |  | -        Reconstructed DataFrame with imputed categorical variables.
138 |  | -
139 |  | -    """
140 |  | -    df_reconstructed = df_original.copy()
141 |  | -    start_idx = 0
142 |  | -    for col in df_original.columns:
143 |  | -        if (
144 |  | -            df_original[col].dtype.name == "category"
145 |  | -            or df_original[col].dtype == object
146 |  | -        ):  # noqa: E501
147 |  | -            categories = df_original[col].cat.categories.tolist()
148 |  | -            if "__MISSING__" in categories:
149 |  | -                missing_cat_index = categories.index("__MISSING__")
150 |  | -            else:
151 |  | -                missing_cat_index = None
152 |  | -            num_categories = len(categories)
153 |  | -            sub_tab = tab_disj.iloc[:, start_idx : start_idx + num_categories]
154 |  | -            if missing_cat_index is not None:
155 |  | -                sub_tab.iloc[:, missing_cat_index] = -np.inf
156 |  | -            # Find the category with the maximum value for each row
157 |  | -            max_indices = sub_tab.values.argmax(axis=1)
158 |  | -            df_reconstructed[col] = [categories[idx] for idx in max_indices]
159 |  | -            # Replace '__MISSING__' back to NaN
160 |  | -            df_reconstructed[col].replace("__MISSING__", np.nan, inplace=True)
161 |  | -            start_idx += num_categories
162 |  | -        else:
163 |  | -            # For numeric variables, keep as is
164 |  | -            start_idx += 1  # Increment start_idx by 1 for numeric columns
165 |  | -    return df_reconstructed
166 |  | -
167 |  | -
168 |  | -def svdtriplet(X, row_w=None, ncp=np.inf):
169 |  | -    """Perform weighted SVD on matrix X with row weights.
170 |  | -
171 |  | -    Parameters
172 |  | -    ----------
173 |  | -    X : ndarray
174 |  | -        Data matrix of shape (n_samples, n_features).
175 |  | -    row_w : array-like, optional
176 |  | -        Row weights. If None, uniform weights are assumed. Default is None.
177 |  | -    ncp : int
178 |  | -        Number of principal components to retain. Default is infinity.
179 |  | -
180 |  | -    Returns
181 |  | -    -------
182 |  | -    s : ndarray
183 |  | -        Singular values.
184 |  | -    U : ndarray
185 |  | -        Left singular vectors.
186 |  | -    V : ndarray
187 |  | -        Right singular vectors.
188 |  | -
189 |  | -    """
190 |  | -    if not isinstance(X, np.ndarray):
191 |  | -        X = np.array(X, dtype=float)
192 |  | -    else:
193 |  | -        X = X.astype(float)
194 |  | -    if row_w is None:
195 |  | -        row_w = np.ones(X.shape[0]) / X.shape[0]
196 |  | -    else:
197 |  | -        row_w = np.array(row_w, dtype=float)
198 |  | -    row_w /= row_w.sum()
199 |  | -    ncp = int(min(ncp, X.shape[0] - 1, X.shape[1]))
200 |  | -    # Apply weights to rows
201 |  | -    X_weighted = X * np.sqrt(row_w[:, None])
202 |  | -    # Perform SVD
203 |  | -    U, s, Vt = np.linalg.svd(X_weighted, full_matrices=False)
204 |  | -    V = Vt.T
205 |  | -    U = U[:, :ncp]
206 |  | -    V = V[:, :ncp]
207 |  | -    s = s[:ncp]
208 |  | -    # Adjust signs to ensure consistency
209 |  | -    mult = np.sign(np.sum(V, axis=0))
210 |  | -    mult[mult == 0] = 1
211 |  | -    U *= mult
212 |  | -    V *= mult
213 |  | -    # Rescale U by the square root of row weights
214 |  | -    U /= np.sqrt(row_w[:, None])
215 |  | -    return s, U, V
  | 4 | +from qolmat.utils.algebra import svdtriplet
  | 5 | +from qolmat.utils.utils import (
  | 6 | +    find_category,
  | 7 | +    moy_p,
  | 8 | +    tab_disjonctif_NA,
  | 9 | +    tab_disjonctif_prop,
  | 10 | +)
216 | 11 |
217 | 12 |
218 | 13 | def imputeMCA(
|
|
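The helpers removed above are not deleted from the project: per the new import block, `svdtriplet` now comes from `qolmat.utils.algebra`, and the disjunctive-table helpers (`find_category`, `moy_p`, `tab_disjonctif_NA`, `tab_disjonctif_prop`) from `qolmat.utils.utils`. Below is a minimal sketch of calling them through their new homes, assuming the relocated functions keep the signatures documented in the removed code; the toy DataFrame and values are purely illustrative.

import numpy as np
import pandas as pd

from qolmat.utils.algebra import svdtriplet
from qolmat.utils.utils import find_category, moy_p, tab_disjonctif_NA

# Toy data: two categorical columns with missing entries (illustrative only).
df = pd.DataFrame(
    {
        "color": pd.Categorical(["red", None, "blue", "red"]),
        "size": pd.Categorical(["S", "M", None, "M"]),
    }
)
df_work = df.copy()

# One-hot encode, mapping NaNs to an explicit "__MISSING__" level.
# Note: the helper also adds the "__MISSING__" category to df_work in place.
tab = tab_disjonctif_NA(df_work)

# Weighted SVD of the disjunctive table; uniform row weights by default.
s, U, V = svdtriplet(tab.to_numpy(), ncp=2)

# Map a (possibly imputed) disjunctive table back to categorical labels;
# columns that win only via "__MISSING__" are turned back into NaN.
df_back = find_category(df_work, tab)

# Weighted mean that skips NaNs.
m = moy_p(np.array([1.0, np.nan, 3.0]), np.array([0.5, 0.25, 0.25]))

Because the names are imported unchanged into this module, call sites such as `imputeMCA` below can keep using them exactly as before the move.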