From d4799ee2a38e0f475d67b43134965df7df9d626a Mon Sep 17 00:00:00 2001 From: Chiara Ghielmini Date: Wed, 25 Mar 2026 16:49:02 +0100 Subject: [PATCH 1/6] add reports again --- util/dataframe_ops.py | 1 + util/fof_utils.py | 23 ++++++++++++++--------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/util/dataframe_ops.py b/util/dataframe_ops.py index 7720ab8..e4f577f 100644 --- a/util/dataframe_ops.py +++ b/util/dataframe_ops.py @@ -480,6 +480,7 @@ def check_multiple_solutions_from_dict(dict_ref, dict_cur, rules, log_file_name) ref_df[list(cols_without_rules)].to_xarray(), cur_df[list(cols_without_rules)].to_xarray(), detailed_logger, + key ) if t != e: return True diff --git a/util/fof_utils.py b/util/fof_utils.py index f9e544d..fabd6a3 100644 --- a/util/fof_utils.py +++ b/util/fof_utils.py @@ -169,7 +169,7 @@ def write_different_size_log(var, size1, size2, detailed_logger): ) -def compare_var_and_attr_ds(ds1, ds2, detailed_logger): +def compare_var_and_attr_ds(ds1, ds2, detailed_logger, key): """ Variable by variable and attribute by attribute, comparison of the two datasets. @@ -178,23 +178,23 @@ def compare_var_and_attr_ds(ds1, ds2, detailed_logger): total_all, equal_all = 0, 0 list_to_skip = ["source", "i_body", "l_body", "veri_data"] - for var in sorted(set(ds1.data_vars).union(ds2.data_vars)): - if var in ds1.data_vars and var in ds2.data_vars and var not in list_to_skip: + for var in set(ds1.data_vars).intersection(ds2.data_vars): + if key == "reports" and var not in list_to_skip: - total, equal = process_var(ds1, ds2, var, detailed_logger) + total, equal = process_var(ds1, ds2, var, detailed_logger, prova="vars") total_all += total equal_all += equal - if var in ds1.attrs and var in ds2.attrs and var not in list_to_skip: + if key == "observations" and var not in list_to_skip: - total, equal = process_var(ds1, ds2, var, detailed_logger) + total, equal = process_var(ds1, ds2, var, detailed_logger, prova="attrs") total_all += total equal_all += equal return total_all, equal_all -def process_var(ds1, ds2, var, detailed_logger): +def process_var(ds1, ds2, var, detailed_logger, prova=None): """ This function first checks whether two arrays have the same size. If they do, their values are compared. @@ -203,8 +203,13 @@ def process_var(ds1, ds2, var, detailed_logger): number of matching elements. """ - arr1 = fill_nans_for_float32(ds1[var].values) - arr2 = fill_nans_for_float32(ds2[var].values) + if prova == "attrs": + arr1 = np.array(ds1[var], dtype=object) + arr2 = np.array(ds2[var], dtype=object) + if prova == "vars": + arr1 = fill_nans_for_float32(ds1[var].values) + arr2 = fill_nans_for_float32(ds2[var].values) + if arr1.size == arr2.size: t, e, diff = compare_arrays(arr1, arr2, var) if diff.size != 0: From 255dd54d631ac46d07a3444bd1694455782df985 Mon Sep 17 00:00:00 2001 From: Chiara Ghielmini Date: Mon, 30 Mar 2026 11:31:30 +0200 Subject: [PATCH 2/6] restore old version --- util/fof_utils.py | 62 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 6 deletions(-) diff --git a/util/fof_utils.py b/util/fof_utils.py index fabd6a3..706c7e6 100644 --- a/util/fof_utils.py +++ b/util/fof_utils.py @@ -181,19 +181,69 @@ def compare_var_and_attr_ds(ds1, ds2, detailed_logger, key): for var in set(ds1.data_vars).intersection(ds2.data_vars): if key == "reports" and var not in list_to_skip: - total, equal = process_var(ds1, ds2, var, detailed_logger, prova="vars") - total_all += total - equal_all += equal + arr1 = fill_nans_for_float32(ds1[var].values) + arr2 = fill_nans_for_float32(ds2[var].values) + + if arr1.size == arr2.size: + t, e, diff = compare_arrays(arr1, arr2, var) + + else: + t, e = max(arr1.size, arr2.size), 0 + write_different_size(var, arr1.size, arr2.size, detailed_logger) + + #total, equal = process_var(ds1, ds2, var, detailed_logger, prova="vars") + total_all += t + equal_all += e if key == "observations" and var not in list_to_skip: - total, equal = process_var(ds1, ds2, var, detailed_logger, prova="attrs") - total_all += total - equal_all += equal + arr1 = np.array(ds1.attrs[var], dtype=object) + arr2 = np.array(ds2.attrs[var], dtype=object) + if arr1.size == arr2.size: + t, e, diff = compare_arrays(arr1, arr2, var) + + else: + t, e = max(arr1.size, arr2.size), 0 + write_different_size_log(var, arr1.size, arr2.size, detailed_logger) + + total_all += t + equal_all += e return total_all, equal_all +def compare_arrays(arr1, arr2, var_name): + """ + Comparison of two arrays containing the values of the same variable. + If not the same, it tells you in percentage terms how different they are. + """ + total = arr1.size + + if np.array_equal(arr1, arr2): + equal = total + diff = np.array([]) + + elif ( + np.issubdtype(arr1.dtype, np.number) + and np.issubdtype(arr2.dtype, np.number) + and np.array_equal(arr1, arr2, equal_nan=True) + ): + equal = total + diff = np.array([]) + + else: + mask_equal = arr1 == arr2 + equal = mask_equal.sum() + percent = (equal / total) * 100 + print( + f"Differences in '{var_name}': {percent:.2f}% equal. " + f"{total} total entries for this variable" + ) + diff_idx = np.where(~mask_equal.ravel())[0] + diff = diff_idx + + return total, equal, diff + def process_var(ds1, ds2, var, detailed_logger, prova=None): """ This function first checks whether two arrays have the same size. From 9ebe8d1593921646a2c656ac5e2df51059c30559 Mon Sep 17 00:00:00 2001 From: Chiara Ghielmini Date: Mon, 30 Mar 2026 13:38:49 +0200 Subject: [PATCH 3/6] invert repo obs --- util/fof_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util/fof_utils.py b/util/fof_utils.py index 706c7e6..0f097fe 100644 --- a/util/fof_utils.py +++ b/util/fof_utils.py @@ -179,7 +179,7 @@ def compare_var_and_attr_ds(ds1, ds2, detailed_logger, key): list_to_skip = ["source", "i_body", "l_body", "veri_data"] for var in set(ds1.data_vars).intersection(ds2.data_vars): - if key == "reports" and var not in list_to_skip: + if key == "observations" and var not in list_to_skip: arr1 = fill_nans_for_float32(ds1[var].values) arr2 = fill_nans_for_float32(ds2[var].values) @@ -195,7 +195,7 @@ def compare_var_and_attr_ds(ds1, ds2, detailed_logger, key): total_all += t equal_all += e - if key == "observations" and var not in list_to_skip: + if key == "reports" and var not in list_to_skip: arr1 = np.array(ds1.attrs[var], dtype=object) arr2 = np.array(ds2.attrs[var], dtype=object) From 2037afab62023ee59990b92d4c8648351d00e44f Mon Sep 17 00:00:00 2001 From: Chiara Ghielmini Date: Mon, 30 Mar 2026 13:47:34 +0200 Subject: [PATCH 4/6] rr --- util/fof_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/util/fof_utils.py b/util/fof_utils.py index 0f097fe..faae563 100644 --- a/util/fof_utils.py +++ b/util/fof_utils.py @@ -179,7 +179,7 @@ def compare_var_and_attr_ds(ds1, ds2, detailed_logger, key): list_to_skip = ["source", "i_body", "l_body", "veri_data"] for var in set(ds1.data_vars).intersection(ds2.data_vars): - if key == "observations" and var not in list_to_skip: + if key == "reports" and var not in list_to_skip: arr1 = fill_nans_for_float32(ds1[var].values) arr2 = fill_nans_for_float32(ds2[var].values) @@ -195,10 +195,10 @@ def compare_var_and_attr_ds(ds1, ds2, detailed_logger, key): total_all += t equal_all += e - if key == "reports" and var not in list_to_skip: + if key == "observations" and var not in list_to_skip: - arr1 = np.array(ds1.attrs[var], dtype=object) - arr2 = np.array(ds2.attrs[var], dtype=object) + arr1 = np.array(ds1[var], dtype=object) + arr2 = np.array(ds2[var], dtype=object) if arr1.size == arr2.size: t, e, diff = compare_arrays(arr1, arr2, var) From d66f51838a4d0b57a5afe39b69e51499a574377c Mon Sep 17 00:00:00 2001 From: Chiara Ghielmini Date: Tue, 31 Mar 2026 10:30:32 +0200 Subject: [PATCH 5/6] f --- util/dataframe_ops.py | 8 +++++-- util/fof_utils.py | 49 ++++++++++--------------------------------- 2 files changed, 17 insertions(+), 40 deletions(-) diff --git a/util/dataframe_ops.py b/util/dataframe_ops.py index e4f577f..ec433b3 100644 --- a/util/dataframe_ops.py +++ b/util/dataframe_ops.py @@ -82,10 +82,13 @@ def parse_probtest_fof(path): and df_obs respectively. """ ds = xr.open_dataset(path) + ds_report, ds_obs = split_feedback_dataset(ds) + df_report, df_obs = ( pd.DataFrame(d.to_dataframe().reset_index()) for d in (ds_report, ds_obs) ) + return df_report, df_obs @@ -475,12 +478,13 @@ def check_multiple_solutions_from_dict(dict_ref, dict_cur, rules, log_file_name) cols_with_rules = [col for col in common_cols if col in rules_dict] cols_without_rules = [col for col in common_cols if col not in rules_dict] + + if cols_without_rules: t, e = compare_var_and_attr_ds( ref_df[list(cols_without_rules)].to_xarray(), cur_df[list(cols_without_rules)].to_xarray(), - detailed_logger, - key + detailed_logger ) if t != e: return True diff --git a/util/fof_utils.py b/util/fof_utils.py index faae563..db51c54 100644 --- a/util/fof_utils.py +++ b/util/fof_utils.py @@ -50,6 +50,7 @@ def split_feedback_dataset(ds): sort_keys_reports = ["lat", "lon", "statid", "time_nomi", "codetype"] ds_report_sorted = ds_reports.sortby(sort_keys_reports) + print(ds_report_sorted["r_check"].values) lbody = ds["l_body"].values @@ -119,7 +120,8 @@ def clean_value(x): alignment when printing the value. """ if isinstance(x, bytes): - return x.decode("utf-8", errors="replace").rstrip(" '") + return x.decode().rstrip(" '") + # return x.decode("utf-8", errors="replace").rstrip(" '") return str(x).rstrip(" '") @@ -169,7 +171,7 @@ def write_different_size_log(var, size1, size2, detailed_logger): ) -def compare_var_and_attr_ds(ds1, ds2, detailed_logger, key): +def compare_var_and_attr_ds(ds1, ds2, detailed_logger): """ Variable by variable and attribute by attribute, comparison of the two datasets. @@ -178,36 +180,12 @@ def compare_var_and_attr_ds(ds1, ds2, detailed_logger, key): total_all, equal_all = 0, 0 list_to_skip = ["source", "i_body", "l_body", "veri_data"] - for var in set(ds1.data_vars).intersection(ds2.data_vars): - if key == "reports" and var not in list_to_skip: + for var in set(ds1.data_vars).union(ds2.data_vars): + if var in ds1.data_vars and var in ds2.data_vars and var not in list_to_skip: - arr1 = fill_nans_for_float32(ds1[var].values) - arr2 = fill_nans_for_float32(ds2[var].values) - - if arr1.size == arr2.size: - t, e, diff = compare_arrays(arr1, arr2, var) - - else: - t, e = max(arr1.size, arr2.size), 0 - write_different_size(var, arr1.size, arr2.size, detailed_logger) - - #total, equal = process_var(ds1, ds2, var, detailed_logger, prova="vars") - total_all += t - equal_all += e - - if key == "observations" and var not in list_to_skip: - - arr1 = np.array(ds1[var], dtype=object) - arr2 = np.array(ds2[var], dtype=object) - if arr1.size == arr2.size: - t, e, diff = compare_arrays(arr1, arr2, var) - - else: - t, e = max(arr1.size, arr2.size), 0 - write_different_size_log(var, arr1.size, arr2.size, detailed_logger) - - total_all += t - equal_all += e + total, equal = process_var(ds1, ds2, var, detailed_logger) + total_all += total + equal_all += equal return total_all, equal_all @@ -252,13 +230,8 @@ def process_var(ds1, ds2, var, detailed_logger, prova=None): The function outputs the total number of elements and the number of matching elements. """ - - if prova == "attrs": - arr1 = np.array(ds1[var], dtype=object) - arr2 = np.array(ds2[var], dtype=object) - if prova == "vars": - arr1 = fill_nans_for_float32(ds1[var].values) - arr2 = fill_nans_for_float32(ds2[var].values) + arr1 = fill_nans_for_float32(ds1[var].values) + arr2 = fill_nans_for_float32(ds2[var].values) if arr1.size == arr2.size: t, e, diff = compare_arrays(arr1, arr2, var) From 5a24388e497b685e8e14b81fe9e6e3b61e647933 Mon Sep 17 00:00:00 2001 From: Chiara Ghielmini Date: Thu, 2 Apr 2026 10:32:34 +0200 Subject: [PATCH 6/6] allow 64 --- util/dataframe_ops.py | 13 +++++++++---- util/fof_utils.py | 12 +++++++----- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/util/dataframe_ops.py b/util/dataframe_ops.py index ec433b3..dc08dfb 100644 --- a/util/dataframe_ops.py +++ b/util/dataframe_ops.py @@ -47,12 +47,17 @@ def compute_rel_diff_dataframe(df1, df2): return out -def compute_division(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame: +def compute_division(df1: pd.DataFrame, df2) -> pd.DataFrame: # avoid division by 0 and put nan instead - out = df1 / df2.replace({0: np.nan}) - # put 0 if numerator is 0 as well + if np.isscalar(df2): + if df2 == 0: + return df1 * np.nan + out = df1 / df2 + else: + out = df1 / df2.replace({0: np.nan}) + out[df1 == 0] = 0 - return out + return outt def parse_probtest_stats(path, index_col=None): diff --git a/util/fof_utils.py b/util/fof_utils.py index db51c54..42c0e90 100644 --- a/util/fof_utils.py +++ b/util/fof_utils.py @@ -109,9 +109,12 @@ def fill_nans_for_float32(arr): """ To make sure nan values are recognised. """ - if arr.dtype == np.float32 and np.isnan(arr).any(): - return np.where(np.isnan(arr), -999999, arr) - return arr + if not np.issubdtype(arr.dtype, np.floating): + return arr + + arr = arr.astype(np.float64, copy=False) + + return np.where(np.isnan(arr), -999999.0, arr) def clean_value(x): @@ -182,7 +185,6 @@ def compare_var_and_attr_ds(ds1, ds2, detailed_logger): for var in set(ds1.data_vars).union(ds2.data_vars): if var in ds1.data_vars and var in ds2.data_vars and var not in list_to_skip: - total, equal = process_var(ds1, ds2, var, detailed_logger) total_all += total equal_all += equal @@ -222,7 +224,7 @@ def compare_arrays(arr1, arr2, var_name): return total, equal, diff -def process_var(ds1, ds2, var, detailed_logger, prova=None): +def process_var(ds1, ds2, var, detailed_logger): """ This function first checks whether two arrays have the same size. If they do, their values are compared.