From 8b28c6b8124067807c37fa5507628b942382c408 Mon Sep 17 00:00:00 2001
From: leolamien <90797794+leolamien@users.noreply.github.com>
Date: Wed, 17 Sep 2025 20:21:28 +0200
Subject: [PATCH 1/3] Add k-means++

---
 machine_learning/k_means_plus_plus.py | 438 ++++++++++++++++++++++++++
 1 file changed, 438 insertions(+)
 create mode 100644 machine_learning/k_means_plus_plus.py

diff --git a/machine_learning/k_means_plus_plus.py b/machine_learning/k_means_plus_plus.py
new file mode 100644
index 000000000000..5743ddd2a71c
--- /dev/null
+++ b/machine_learning/k_means_plus_plus.py
@@ -0,0 +1,438 @@
+"""README, Author -
+Original author: Anurag Kumar (mailto:anuragkumarak95@gmail.com)
+Modifications by: Leonce Lamien
+Date: 17 September 2025
+
+References:
+D. Arthur and S. Vassilvitskii, "k-means++: The Advantages of Careful Seeding",
+SODA 2007.
+
+Requirements:
+    - sklearn
+    - numpy
+    - matplotlib
+Python:
+    - 3.5
+Inputs:
+    - X, a 2D numpy array of features.
+    - k, the number of clusters to create.
+    - initial_centroids, initial centroid values generated by a utility function
+      (see usage below).
+    - maxiter, the maximum number of iterations to run.
+    - heterogeneity, an empty list that is filled with heterogeneity values when
+      passed to the kmeans function.
+Usage:
+    1. Define the 'k' value, the 'X' features array and an empty 'heterogeneity'
+       list.
+    2. Create initial_centroids:
+        initial_centroids = get_initial_centroids_kmeans_plus_plus(
+            X,
+            k,
+            seed=2  # seed value for initial centroid generation,
+                    # None for randomness (default=None)
+        )
+    3. Find centroids and clusters using the kmeans function:
+        centroids, cluster_assignment = kmeans(
+            X,
+            k,
+            initial_centroids,
+            maxiter=400,
+            record_heterogeneity=heterogeneity,
+            verbose=True  # whether to print logs to the console (default=False)
+        )
+    4. Plot the heterogeneity values recorded at every iteration; the plotting
+       function compares two runs (e.g. k-means++ vs. naive initialization):
+        plot_heterogeneity(
+            heterogeneity_kmeans_plus_plus,
+            heterogeneity_kmeans_naive,
+            k
+        )
+    5. Plot the labeled 3D data points with centroids:
+        plot_kmeans(
+            X,
+            centroids,
+            cluster_assignment,
+            "3D K-Means Clustering Visualization"
+        )
+    6. Export the labeled DataFrame to Excel; it must have a column called
+       'Cluster' holding the k-means cluster labels (see report_generator and
+       the sketch before it).
+"""
+
+import warnings
+
+import numpy as np
+import pandas as pd
+from matplotlib import pyplot as plt
+from sklearn.metrics import pairwise_distances
+
+warnings.filterwarnings("ignore")
+
+
+# =============================================================================
+# K-MEANS++ INITIALIZATION
+# =============================================================================
+
+
+def get_initial_centroids_kmeans_plus_plus(data, k, seed=None):
+    """K-means++ initialization for better cluster seeding.
+    This implementation follows the k-means++ algorithm: each subsequent
+    centroid is sampled with probability proportional to its squared distance
+    (D^2 weighting) from the nearest centroid chosen so far, which spreads the
+    centroids apart and improves convergence.
+    """
+    # A fixed seed gives reproducible results.
+    rng = np.random.default_rng(seed)
+    n = data.shape[0]  # number of data points
+
+    # Randomly pick the first centroid from the range [0, N).
+    first_centroid_idx = rng.integers(0, n)
+    centroids = np.zeros((k, data.shape[1]))
+    centroids[0] = data[first_centroid_idx]
+
+    for i in range(1, k):
+        # Compute each point's distance to its nearest existing centroid.
+        distances = centroid_pairwise_dist(data, centroids[:i])
+        min_distances = np.min(distances, axis=1)
+        squared_distances = min_distances**2
+
+        # Convert squared distances to sampling probabilities (D^2 weighting).
+        probabilities = squared_distances / np.sum(squared_distances)
+
+        # Choose the next centroid.
+        next_centroid_idx = rng.choice(n, p=probabilities)
+        centroids[i] = data[next_centroid_idx]
+
+    return centroids
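+
+
+# Illustrative sketch (set to True to run): on a made-up 1-D dataset, the D^2
+# weighting makes the far-away point at x=10 overwhelmingly likely to become
+# the second centroid, so the two seeds end up well separated.
+if False:
+    toy = np.array([[0.0], [0.1], [10.0]])
+    toy_centroids = get_initial_centroids_kmeans_plus_plus(toy, 2, seed=0)
+    print(toy_centroids)  # with high probability: one centroid near 0, one at 10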
+
+
+# =============================================================================
+# K-MEANS INITIALIZATION (Naive version)
+# =============================================================================
+
+
+def get_initial_centroids(data, k, seed=None):
+    """Randomly choose k distinct data points as initial centroids."""
+    # A fixed seed gives reproducible results.
+    rng = np.random.default_rng(seed)
+    n = data.shape[0]  # number of data points
+
+    # Pick k distinct indices from the range [0, N); sampling without
+    # replacement avoids picking the same point twice, which would leave
+    # some clusters empty.
+    rand_indices = rng.choice(n, size=k, replace=False)
+
+    # Keep the centroids in dense format, as averaging generally makes
+    # most entries nonzero.
+    centroids = data[rand_indices, :]
+
+    return centroids
+
+
+def centroid_pairwise_dist(x, centroids):
+    """Euclidean distance between every row of x and every centroid."""
+    return pairwise_distances(x, centroids, metric="euclidean")
+
+
+def assign_clusters(data, centroids):
+    # Compute distances between each data point and the set of centroids.
+    distances_from_centroids = centroid_pairwise_dist(data, centroids)
+
+    # Assign each data point to its nearest centroid.
+    cluster_assignment = np.argmin(distances_from_centroids, axis=1)
+
+    return cluster_assignment
+
+
+def revise_centroids(data, k, cluster_assignment):
+    new_centroids = []
+    for i in range(k):
+        # Select all data points that belong to cluster i.
+        member_data_points = data[cluster_assignment == i]
+        # The revised centroid is the mean of the member data points.
+        centroid = member_data_points.mean(axis=0)
+        new_centroids.append(centroid)
+    new_centroids = np.array(new_centroids)
+
+    return new_centroids
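+
+
+# Illustrative sketch (set to True to run): one assign/revise round on a
+# made-up 2-D dataset. assign_clusters labels each point with its nearest
+# centroid, and revise_centroids replaces each centroid with its cluster mean.
+if False:
+    toy_points = np.array([[0.0, 0.0], [0.0, 1.0], [5.0, 5.0], [5.0, 6.0]])
+    toy_centers = np.array([[0.0, 0.5], [5.0, 5.5]])
+    toy_labels = assign_clusters(toy_points, toy_centers)  # -> [0, 0, 1, 1]
+    print(revise_centroids(toy_points, 2, toy_labels))  # means of the two pairs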
+
+
+def compute_heterogeneity(data, k, centroids, cluster_assignment):
+    """Sum of squared distances from every data point to its assigned centroid."""
+    heterogeneity = 0.0
+    for i in range(k):
+        # Select all data points that belong to cluster i.
+        member_data_points = data[cluster_assignment == i, :]
+
+        if member_data_points.shape[0] > 0:  # check if the i-th cluster is non-empty
+            # Compute distances from the centroid to the member data points.
+            distances = pairwise_distances(
+                member_data_points, [centroids[i]], metric="euclidean"
+            )
+            squared_distances = distances**2
+            heterogeneity += np.sum(squared_distances)
+
+    return heterogeneity
+
+
+def plot_heterogeneity(heterogeneity_kmeans_plus_plus, heterogeneity_kmeans_naive, k):
+    plt.figure(figsize=(7, 4))
+    plt.plot(heterogeneity_kmeans_plus_plus, linewidth=4)
+    plt.plot(heterogeneity_kmeans_naive, linewidth=4)
+    plt.xlabel("# Iterations")
+    plt.ylabel("Heterogeneity")
+    plt.title(f"Heterogeneity of clustering over time for K-Means, K={k:d}")
+    plt.legend(["K-Means++", "Naive K-Means"], loc="upper right")
+    plt.show()
+
+
+def plot_kmeans(data, centroids, cluster_assignment, title):
+    ax = plt.axes(projection="3d")
+    ax.scatter(data[:, 0], data[:, 1], data[:, 2], c=cluster_assignment, cmap="viridis")
+    ax.scatter(
+        centroids[:, 0], centroids[:, 1], centroids[:, 2], c="red", s=100, marker="x"
+    )
+    ax.set_xlabel("X")
+    ax.set_ylabel("Y")
+    ax.set_zlabel("Z")
+    ax.set_title(title)
+    plt.show()
+
+
+def kmeans(
+    data, k, initial_centroids, maxiter=500, record_heterogeneity=None, verbose=False
+):
+    """Runs k-means on the given data and initial set of centroids.
+    maxiter: maximum number of iterations to run (default=500).
+    record_heterogeneity: (optional) a list to store the history of heterogeneity
+                          as a function of iterations; if None, the history is
+                          not stored.
+    verbose: if True, print how many data points changed their cluster labels in
+             each iteration."""
+    centroids = initial_centroids.copy()
+    prev_cluster_assignment = None
+
+    for itr in range(maxiter):
+        if verbose:
+            print(itr, end="")
+
+        # 1. Assign each data point to its nearest centroid.
+        cluster_assignment = assign_clusters(data, centroids)
+
+        # 2. Recompute each of the k centroids as the mean of all data points
+        #    assigned to that cluster.
+        centroids = revise_centroids(data, k, cluster_assignment)
+
+        # Check for convergence: if none of the assignments changed, stop.
+        if (
+            prev_cluster_assignment is not None
+            and (prev_cluster_assignment == cluster_assignment).all()
+        ):
+            break
+
+        # Print the number of changed assignments.
+        if prev_cluster_assignment is not None:
+            num_changed = np.sum(prev_cluster_assignment != cluster_assignment)
+            if verbose:
+                print(
+                    f"    {num_changed:5d} elements changed their cluster assignment."
+                )
+
+        # Record the heterogeneity convergence metric.
+        if record_heterogeneity is not None:
+            score = compute_heterogeneity(data, k, centroids, cluster_assignment)
+            record_heterogeneity.append(score)
+
+        prev_cluster_assignment = cluster_assignment.copy()
+
+    return centroids, cluster_assignment
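+
+
+# Sanity check (set to True to run): a minimal sketch on a made-up two-blob
+# dataset; k-means should place one centroid near each blob and stop early
+# once the assignments are stable.
+if False:
+    rng_demo = np.random.default_rng(0)
+    blob_a = rng_demo.normal(0.0, 0.5, size=(20, 2))
+    blob_b = rng_demo.normal(5.0, 0.5, size=(20, 2))
+    demo_data = np.vstack([blob_a, blob_b])
+    demo_init = get_initial_centroids_kmeans_plus_plus(demo_data, 2, seed=0)
+    demo_centroids, demo_labels = kmeans(demo_data, 2, demo_init, maxiter=100)
+    print(demo_centroids)  # expected: one centroid near (0, 0), one near (5, 5)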
+
+
+# Mock test below
+if True:  # change to True to run this test case.
+    from sklearn import datasets as ds
+
+    dataset = ds.load_iris()
+    k = 3
+
+    heterogeneity_kmeans_naive: list[float] = []
+    heterogeneity_kmeans_plus_plus: list[float] = []
+
+    initial_centroids_kmeans_plus_plus = get_initial_centroids_kmeans_plus_plus(
+        dataset["data"], k, seed=2
+    )
+    initial_centroids_kmeans_naive = get_initial_centroids(dataset["data"], k, seed=2)
+
+    centroids_kmeans_plus_plus, cluster_assignment_kmeans_plus_plus = kmeans(
+        dataset["data"],
+        k,
+        initial_centroids_kmeans_plus_plus,
+        maxiter=400,
+        record_heterogeneity=heterogeneity_kmeans_plus_plus,
+        verbose=True,
+    )
+
+    centroids_kmeans_naive, cluster_assignment_kmeans_naive = kmeans(
+        dataset["data"],
+        k,
+        initial_centroids_kmeans_naive,
+        maxiter=400,
+        record_heterogeneity=heterogeneity_kmeans_naive,
+        verbose=True,
+    )
+    plot_heterogeneity(heterogeneity_kmeans_plus_plus, heterogeneity_kmeans_naive, k)
+    plot_kmeans(
+        dataset["data"],
+        centroids_kmeans_naive,
+        cluster_assignment_kmeans_naive,
+        "3D naive K-Means Clustering Visualization",
+    )
+    plot_kmeans(
+        dataset["data"],
+        centroids_kmeans_plus_plus,
+        cluster_assignment_kmeans_plus_plus,
+        "3D K-Means++ Clustering Visualization",
+    )
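+
+
+# Usage step 6 in the module docstring mentions exporting the labeled data to
+# Excel. A minimal sketch (set to True to run), assuming the mock test above
+# has run and that openpyxl is installed; the file name "clusters.xlsx" is
+# only an example:
+if False:
+    frame = pd.DataFrame(dataset["data"], columns=dataset["feature_names"])
+    frame["Cluster"] = cluster_assignment_kmeans_plus_plus
+    frame.to_excel("clusters.xlsx", index=False)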
+
+
+def report_generator(
+    predicted: pd.DataFrame, clustering_variables: np.ndarray, fill_missing_report=None
+) -> pd.DataFrame:
+    """
+    Generate a clustering report from a dataframe that contains a predicted
+    'Cluster' column.
+    predicted - dataframe with the predicted cluster column
+    clustering_variables - names of the features used for clustering; these are
+        flagged in the report's 'Mark' column
+    fill_missing_report - dictionary of rules for filling in missing values in
+        the final generated report (not included in modelling);
+    >>> predicted = pd.DataFrame()
+    >>> predicted['numbers'] = [1, 2, 3]
+    >>> predicted['col1'] = [0.5, 2.5, 4.5]
+    >>> predicted['col2'] = [100, 200, 300]
+    >>> predicted['col3'] = [10, 20, 30]
+    >>> predicted['Cluster'] = [1, 1, 2]
+    >>> report_generator(predicted, ['col1', 'col2'], 0)
+               Features               Type   Mark           1           2
+    0    # of Customers        ClusterSize  False    2.000000    1.000000
+    1    % of Customers  ClusterProportion  False    0.666667    0.333333
+    2              col1    mean_with_zeros   True    1.500000    4.500000
+    3              col2    mean_with_zeros   True  150.000000  300.000000
+    4           numbers    mean_with_zeros  False    1.500000    3.000000
+    ..              ...                ...    ...         ...         ...
+    99            dummy                 5%  False    1.000000    1.000000
+    100           dummy                95%  False    1.000000    1.000000
+    101           dummy              stdev  False    0.000000         NaN
+    102           dummy               mode  False    1.000000    1.000000
+    103           dummy             median  False    1.000000    1.000000
+    <BLANKLINE>
+    [104 rows x 5 columns]
+    """
+    # Fill missing values with the given rules.
+    if fill_missing_report:
+        predicted = predicted.fillna(value=fill_missing_report)
+    predicted["dummy"] = 1
+    numeric_cols = predicted.select_dtypes(np.number).columns
+    report = (
+        predicted.groupby(["Cluster"])[  # construct report dataframe
+            numeric_cols
+        ]  # group by cluster number
+        .agg(
+            [
+                ("sum", "sum"),
+                ("mean_with_zeros", lambda x: np.mean(np.nan_to_num(x))),
+                ("mean_without_zeros", lambda x: x.replace(0, np.nan).mean()),
+                (
+                    "mean_25-75",
+                    lambda x: np.mean(
+                        np.nan_to_num(
+                            sorted(x)[
+                                round(len(x) * 25 / 100) : round(len(x) * 75 / 100)
+                            ]
+                        )
+                    ),
+                ),
+                ("mean_with_na", "mean"),
+                ("min", lambda x: x.min()),
+                ("5%", lambda x: x.quantile(0.05)),
+                ("25%", lambda x: x.quantile(0.25)),
+                ("50%", lambda x: x.quantile(0.50)),
+                ("75%", lambda x: x.quantile(0.75)),
+                ("95%", lambda x: x.quantile(0.95)),
+                ("max", lambda x: x.max()),
+                ("count", lambda x: x.count()),
+                ("stdev", lambda x: x.std()),
+                ("mode", lambda x: x.mode()[0]),
+                ("median", lambda x: x.median()),
+                ("# > 0", lambda x: (x > 0).sum()),
+            ]
+        )
+        .T.reset_index()
+        .rename(index=str, columns={"level_0": "Features", "level_1": "Type"})
+    )  # rename columns
+    # Calculate the size of each cluster (count of client IDs).
+    # Copy to avoid SettingWithCopyWarning.
+    clustersize = report[
+        (report["Features"] == "dummy") & (report["Type"] == "count")
+    ].copy()
+    # Rename the rows to match the report column names.
+    clustersize.Type = "ClusterSize"
+    clustersize.Features = "# of Customers"
+    # Calculate the proportion of each cluster.
+    clusterproportion = pd.DataFrame(
+        clustersize.iloc[:, 2:].to_numpy() / clustersize.iloc[:, 2:].to_numpy().sum()
+    )
+    # Rename the rows to match the report column names.
+    clusterproportion["Type"] = "% of Customers"
+    clusterproportion["Features"] = "ClusterProportion"
+    cols = clusterproportion.columns.tolist()
+    cols = cols[-2:] + cols[:-2]
+    clusterproportion = clusterproportion[cols]  # rearrange columns to match report
+    clusterproportion.columns = report.columns
+    # Generate a dataframe with the count of NaN values.
+    a = pd.DataFrame(
+        abs(
+            report[report["Type"] == "count"].iloc[:, 2:].to_numpy()
+            - clustersize.iloc[:, 2:].to_numpy()
+        )
+    )
+    a["Features"] = 0
+    a["Type"] = "# of nan"
+    # Fill in the feature names to match the report.
+    a.Features = report[report["Type"] == "count"].Features.tolist()
+    cols = a.columns.tolist()
+    cols = cols[-2:] + cols[:-2]
+    a = a[cols]  # rearrange columns to match report
+    a.columns = report.columns  # rename columns to match report
+    # Drop count values except for the cluster size.
+    report = report.drop(report[report.Type == "count"].index)
+    # Concatenate the report with the cluster sizes and NaN counts.
+    report = pd.concat([report, a, clustersize, clusterproportion], axis=0)
+    report["Mark"] = report["Features"].isin(clustering_variables)
+    cols = report.columns.tolist()
+    cols = cols[0:2] + cols[-1:] + cols[2:-1]
+    report = report[cols]
+    sorter1 = {
+        "ClusterSize": 9,
+        "ClusterProportion": 8,
+        "mean_with_zeros": 7,
+        "mean_with_na": 6,
+        "max": 5,
+        "50%": 4,
+        "min": 3,
+        "25%": 2,
+        "75%": 1,
+        "# of nan": 0,
+        "# > 0": -1,
+        "sum_with_na": -2,
+    }
+    report = (
+        report.assign(
+            Sorter1=lambda x: x.Type.map(sorter1),
+            Sorter2=lambda x: list(reversed(range(len(x)))),
+        )
+        .sort_values(["Sorter1", "Mark", "Sorter2"], ascending=False)
+        .drop(["Sorter1", "Sorter2"], axis=1)
+    )
+    report.columns.name = ""
+    report = report.reset_index()
+    report = report.drop(columns=["index"])
+    return report
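+
+
+# Example (set to True to run): a minimal report_generator call mirroring the
+# doctest above; the DataFrame only needs numeric feature columns plus a
+# "Cluster" column with the k-means labels.
+if False:
+    labelled = pd.DataFrame({"col1": [0.5, 2.5, 4.5], "col2": [100, 200, 300]})
+    labelled["Cluster"] = [1, 1, 2]
+    print(report_generator(labelled, ["col1", "col2"], 0))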
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod()

From 8709ed8b41c8efa669251f8423af92e547949ed1 Mon Sep 17 00:00:00 2001
From: leolamien
Date: Wed, 17 Sep 2025 18:22:18 +0000
Subject: [PATCH 2/3] updating DIRECTORY.md

---
 DIRECTORY.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/DIRECTORY.md b/DIRECTORY.md
index 36acb3b97f1e..035c43411809 100644
--- a/DIRECTORY.md
+++ b/DIRECTORY.md
@@ -605,6 +605,7 @@
   * [Gradient Boosting Classifier](machine_learning/gradient_boosting_classifier.py)
   * [Gradient Descent](machine_learning/gradient_descent.py)
   * [K Means Clust](machine_learning/k_means_clust.py)
+  * [K Means Plus Plus](machine_learning/k_means_plus_plus.py)
   * [K Nearest Neighbours](machine_learning/k_nearest_neighbours.py)
   * [Linear Discriminant Analysis](machine_learning/linear_discriminant_analysis.py)
   * [Linear Regression](machine_learning/linear_regression.py)

From 5eb32c79e514cf0cf53a3dcdd531e5b0e6d00c7d Mon Sep 17 00:00:00 2001
From: leolamien <90797794+leolamien@users.noreply.github.com>
Date: Wed, 17 Sep 2025 20:56:39 +0200
Subject: [PATCH 3/3] Set mock test to false

---
 machine_learning/k_means_plus_plus.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/machine_learning/k_means_plus_plus.py b/machine_learning/k_means_plus_plus.py
index 5743ddd2a71c..e50a26950569 100644
--- a/machine_learning/k_means_plus_plus.py
+++ b/machine_learning/k_means_plus_plus.py
@@ -244,7 +244,7 @@ def kmeans(
 
 
 # Mock test below
-if True:  # change to True to run this test case.
+if False:  # change to True to run this test case.
     from sklearn import datasets as ds
 
     dataset = ds.load_iris()