From 8b28c6b8124067807c37fa5507628b942382c408 Mon Sep 17 00:00:00 2001
From: leolamien <90797794+leolamien@users.noreply.github.com>
Date: Wed, 17 Sep 2025 20:21:28 +0200
Subject: [PATCH 1/3] Add k-means++

---
 machine_learning/k_means_plus_plus.py | 438 ++++++++++++++++++++++++++
 1 file changed, 438 insertions(+)
 create mode 100644 machine_learning/k_means_plus_plus.py

diff --git a/machine_learning/k_means_plus_plus.py b/machine_learning/k_means_plus_plus.py
new file mode 100644
index 000000000000..5743ddd2a71c
--- /dev/null
+++ b/machine_learning/k_means_plus_plus.py
@@ -0,0 +1,438 @@
+"""README, Author -
+Original author: Anurag Kumar (mailto:anuragkumarak95@gmail.com)
+Modifications by: Leonce Lamien
+Date: 17 September 2025
+
+References:
+D. Arthur and S. Vassilvitskii, "k-means++: The Advantages of Careful Seeding",
+SODA 2007.
+
+Requirements:
+    - sklearn
+    - numpy
+    - matplotlib
+Python:
+    - 3.5
+Inputs:
+    - X, a 2D numpy array of features.
+    - k, the number of clusters to create.
+    - initial_centroids, initial centroid values generated by a utility function
+      (see usage below).
+    - maxiter, the maximum number of iterations to run.
+    - heterogeneity, an empty list that is filled with heterogeneity values when
+      passed to the kmeans function.
+Usage:
+    1. Define the 'k' value, the 'X' features array and an empty 'heterogeneity'
+       list.
+    2. Create initial_centroids:
+        initial_centroids = get_initial_centroids_kmeans_plus_plus(
+            X,
+            k,
+            seed=2  # seed value for initial centroid generation,
+                    # None for randomness (default=None)
+        )
+    3. Find centroids and clusters using the kmeans function:
+        centroids, cluster_assignment = kmeans(
+            X,
+            k,
+            initial_centroids,
+            maxiter=400,
+            record_heterogeneity=heterogeneity,
+            verbose=True  # whether to print logs to the console (default=False)
+        )
+    4. Plot the heterogeneity values recorded at every iteration; the plotting
+       function compares two runs (e.g. k-means++ vs. naive initialization):
+        plot_heterogeneity(
+            heterogeneity_kmeans_plus_plus,
+            heterogeneity_kmeans_naive,
+            k
+        )
+    5. Plot the labeled 3D data points with centroids:
+        plot_kmeans(
+            X,
+            centroids,
+            cluster_assignment,
+            "3D K-Means Clustering Visualization"
+        )
+    6. Export the labeled DataFrame to Excel; it must have a column called
+       'Cluster' holding the k-means cluster labels (see report_generator and
+       the sketch before it).
+"""
+
+import warnings
+
+import numpy as np
+import pandas as pd
+from matplotlib import pyplot as plt
+from sklearn.metrics import pairwise_distances
+
+warnings.filterwarnings("ignore")
+
+
+# =============================================================================
+# K-MEANS++ INITIALIZATION
+# =============================================================================
+
+
+def get_initial_centroids_kmeans_plus_plus(data, k, seed=None):
+    """K-means++ initialization for better cluster seeding.
+    This implementation follows the k-means++ algorithm: each subsequent
+    centroid is sampled with probability proportional to its squared distance
+    (D^2 weighting) from the nearest centroid chosen so far, which spreads the
+    centroids apart and improves convergence.
+    """
+    # A fixed seed gives reproducible results.
+    rng = np.random.default_rng(seed)
+    n = data.shape[0]  # number of data points
+
+    # Randomly pick the first centroid from the range [0, N).
+    first_centroid_idx = rng.integers(0, n)
+    centroids = np.zeros((k, data.shape[1]))
+    centroids[0] = data[first_centroid_idx]
+
+    for i in range(1, k):
+        # Compute each point's distance to its nearest existing centroid.
+        distances = centroid_pairwise_dist(data, centroids[:i])
+        min_distances = np.min(distances, axis=1)
+        squared_distances = min_distances**2
+
+        # Convert squared distances to sampling probabilities (D^2 weighting).
+        probabilities = squared_distances / np.sum(squared_distances)
+
+        # Choose the next centroid.
+        next_centroid_idx = rng.choice(n, p=probabilities)
+        centroids[i] = data[next_centroid_idx]
+
+    return centroids
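+
+
+# Illustrative sketch (set to True to run): on a made-up 1-D dataset, the D^2
+# weighting makes the far-away point at x=10 overwhelmingly likely to become
+# the second centroid, so the two seeds end up well separated.
+if False:
+    toy = np.array([[0.0], [0.1], [10.0]])
+    toy_centroids = get_initial_centroids_kmeans_plus_plus(toy, 2, seed=0)
+    print(toy_centroids)  # with high probability: one centroid near 0, one at 10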
+
+
+# =============================================================================
+# K-MEANS INITIALIZATION (Naive version)
+# =============================================================================
+
+
+def get_initial_centroids(data, k, seed=None):
+    """Randomly choose k distinct data points as initial centroids."""
+    # A fixed seed gives reproducible results.
+    rng = np.random.default_rng(seed)
+    n = data.shape[0]  # number of data points
+
+    # Pick k distinct indices from the range [0, N); sampling without
+    # replacement avoids picking the same point twice, which would leave
+    # some clusters empty.
+    rand_indices = rng.choice(n, size=k, replace=False)
+
+    # Keep the centroids in dense format, as averaging generally makes
+    # most entries nonzero.
+    centroids = data[rand_indices, :]
+
+    return centroids
+
+
+def centroid_pairwise_dist(x, centroids):
+    """Euclidean distance between every row of x and every centroid."""
+    return pairwise_distances(x, centroids, metric="euclidean")
+
+
+def assign_clusters(data, centroids):
+    # Compute distances between each data point and the set of centroids.
+    distances_from_centroids = centroid_pairwise_dist(data, centroids)
+
+    # Assign each data point to its nearest centroid.
+    cluster_assignment = np.argmin(distances_from_centroids, axis=1)
+
+    return cluster_assignment
+
+
+def revise_centroids(data, k, cluster_assignment):
+    new_centroids = []
+    for i in range(k):
+        # Select all data points that belong to cluster i.
+        member_data_points = data[cluster_assignment == i]
+        # The revised centroid is the mean of the member data points.
+        centroid = member_data_points.mean(axis=0)
+        new_centroids.append(centroid)
+    new_centroids = np.array(new_centroids)
+
+    return new_centroids
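+
+
+# Illustrative sketch (set to True to run): one assign/revise round on a
+# made-up 2-D dataset. assign_clusters labels each point with its nearest
+# centroid, and revise_centroids replaces each centroid with its cluster mean.
+if False:
+    toy_points = np.array([[0.0, 0.0], [0.0, 1.0], [5.0, 5.0], [5.0, 6.0]])
+    toy_centers = np.array([[0.0, 0.5], [5.0, 5.5]])
+    toy_labels = assign_clusters(toy_points, toy_centers)  # -> [0, 0, 1, 1]
+    print(revise_centroids(toy_points, 2, toy_labels))  # means of the two pairs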
+
+
+def compute_heterogeneity(data, k, centroids, cluster_assignment):
+    """Sum of squared distances from every data point to its assigned centroid."""
+    heterogeneity = 0.0
+    for i in range(k):
+        # Select all data points that belong to cluster i.
+        member_data_points = data[cluster_assignment == i, :]
+
+        if member_data_points.shape[0] > 0:  # check if the i-th cluster is non-empty
+            # Compute distances from the centroid to the member data points.
+            distances = pairwise_distances(
+                member_data_points, [centroids[i]], metric="euclidean"
+            )
+            squared_distances = distances**2
+            heterogeneity += np.sum(squared_distances)
+
+    return heterogeneity
+
+
+def plot_heterogeneity(heterogeneity_kmeans_plus_plus, heterogeneity_kmeans_naive, k):
+    plt.figure(figsize=(7, 4))
+    plt.plot(heterogeneity_kmeans_plus_plus, linewidth=4)
+    plt.plot(heterogeneity_kmeans_naive, linewidth=4)
+    plt.xlabel("# Iterations")
+    plt.ylabel("Heterogeneity")
+    plt.title(f"Heterogeneity of clustering over time for K-Means, K={k:d}")
+    plt.legend(["K-Means++", "Naive K-Means"], loc="upper right")
+    plt.show()
+
+
+def plot_kmeans(data, centroids, cluster_assignment, title):
+    ax = plt.axes(projection="3d")
+    ax.scatter(data[:, 0], data[:, 1], data[:, 2], c=cluster_assignment, cmap="viridis")
+    ax.scatter(
+        centroids[:, 0], centroids[:, 1], centroids[:, 2], c="red", s=100, marker="x"
+    )
+    ax.set_xlabel("X")
+    ax.set_ylabel("Y")
+    ax.set_zlabel("Z")
+    ax.set_title(title)
+    plt.show()
+
+
+def kmeans(
+    data, k, initial_centroids, maxiter=500, record_heterogeneity=None, verbose=False
+):
+    """Runs k-means on the given data and initial set of centroids.
+    maxiter: maximum number of iterations to run (default=500).
+    record_heterogeneity: (optional) a list to store the history of heterogeneity
+                          as a function of iterations; if None, the history is
+                          not stored.
+    verbose: if True, print how many data points changed their cluster labels in
+             each iteration."""
+    centroids = initial_centroids.copy()
+    prev_cluster_assignment = None
+
+    for itr in range(maxiter):
+        if verbose:
+            print(itr, end="")
+
+        # 1. Assign each data point to its nearest centroid.
+        cluster_assignment = assign_clusters(data, centroids)
+
+        # 2. Recompute each of the k centroids as the mean of all data points
+        #    assigned to that cluster.
+        centroids = revise_centroids(data, k, cluster_assignment)
+
+        # Check for convergence: if none of the assignments changed, stop.
+        if (
+            prev_cluster_assignment is not None
+            and (prev_cluster_assignment == cluster_assignment).all()
+        ):
+            break
+
+        # Print the number of changed assignments.
+        if prev_cluster_assignment is not None:
+            num_changed = np.sum(prev_cluster_assignment != cluster_assignment)
+            if verbose:
+                print(
+                    f"    {num_changed:5d} elements changed their cluster assignment."
+                )
+
+        # Record the heterogeneity convergence metric.
+        if record_heterogeneity is not None:
+            score = compute_heterogeneity(data, k, centroids, cluster_assignment)
+            record_heterogeneity.append(score)
+
+        prev_cluster_assignment = cluster_assignment.copy()
+
+    return centroids, cluster_assignment
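+
+
+# Sanity check (set to True to run): a minimal sketch on a made-up two-blob
+# dataset; k-means should place one centroid near each blob and stop early
+# once the assignments are stable.
+if False:
+    rng_demo = np.random.default_rng(0)
+    blob_a = rng_demo.normal(0.0, 0.5, size=(20, 2))
+    blob_b = rng_demo.normal(5.0, 0.5, size=(20, 2))
+    demo_data = np.vstack([blob_a, blob_b])
+    demo_init = get_initial_centroids_kmeans_plus_plus(demo_data, 2, seed=0)
+    demo_centroids, demo_labels = kmeans(demo_data, 2, demo_init, maxiter=100)
+    print(demo_centroids)  # expected: one centroid near (0, 0), one near (5, 5)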
+
+
+# Mock test below
+if True:  # change to True to run this test case.
+    from sklearn import datasets as ds
+
+    dataset = ds.load_iris()
+    k = 3
+
+    heterogeneity_kmeans_naive: list[float] = []
+    heterogeneity_kmeans_plus_plus: list[float] = []
+
+    initial_centroids_kmeans_plus_plus = get_initial_centroids_kmeans_plus_plus(
+        dataset["data"], k, seed=2
+    )
+    initial_centroids_kmeans_naive = get_initial_centroids(dataset["data"], k, seed=2)
+
+    centroids_kmeans_plus_plus, cluster_assignment_kmeans_plus_plus = kmeans(
+        dataset["data"],
+        k,
+        initial_centroids_kmeans_plus_plus,
+        maxiter=400,
+        record_heterogeneity=heterogeneity_kmeans_plus_plus,
+        verbose=True,
+    )
+
+    centroids_kmeans_naive, cluster_assignment_kmeans_naive = kmeans(
+        dataset["data"],
+        k,
+        initial_centroids_kmeans_naive,
+        maxiter=400,
+        record_heterogeneity=heterogeneity_kmeans_naive,
+        verbose=True,
+    )
+    plot_heterogeneity(heterogeneity_kmeans_plus_plus, heterogeneity_kmeans_naive, k)
+    plot_kmeans(
+        dataset["data"],
+        centroids_kmeans_naive,
+        cluster_assignment_kmeans_naive,
+        "3D naive K-Means Clustering Visualization",
+    )
+    plot_kmeans(
+        dataset["data"],
+        centroids_kmeans_plus_plus,
+        cluster_assignment_kmeans_plus_plus,
+        "3D K-Means++ Clustering Visualization",
+    )
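+
+
+# Usage step 6 in the module docstring mentions exporting the labeled data to
+# Excel. A minimal sketch (set to True to run), assuming the mock test above
+# has run and that openpyxl is installed; the file name "clusters.xlsx" is
+# only an example:
+if False:
+    frame = pd.DataFrame(dataset["data"], columns=dataset["feature_names"])
+    frame["Cluster"] = cluster_assignment_kmeans_plus_plus
+    frame.to_excel("clusters.xlsx", index=False)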
+
+
+def report_generator(
+    predicted: pd.DataFrame, clustering_variables: np.ndarray, fill_missing_report=None
+) -> pd.DataFrame:
+    """
+    Generate a clustering report from a dataframe that contains a predicted
+    'Cluster' column.
+    predicted - dataframe with the predicted cluster column
+    clustering_variables - names of the features used for clustering; these are
+        flagged in the report's 'Mark' column
+    fill_missing_report - dictionary of rules for filling in missing values in
+        the final generated report (not included in modelling);
+    >>> predicted = pd.DataFrame()
+    >>> predicted['numbers'] = [1, 2, 3]
+    >>> predicted['col1'] = [0.5, 2.5, 4.5]
+    >>> predicted['col2'] = [100, 200, 300]
+    >>> predicted['col3'] = [10, 20, 30]
+    >>> predicted['Cluster'] = [1, 1, 2]
+    >>> report_generator(predicted, ['col1', 'col2'], 0)
+               Features               Type   Mark           1           2
+    0    # of Customers        ClusterSize  False    2.000000    1.000000
+    1    % of Customers  ClusterProportion  False    0.666667    0.333333
+    2              col1    mean_with_zeros   True    1.500000    4.500000
+    3              col2    mean_with_zeros   True  150.000000  300.000000
+    4           numbers    mean_with_zeros  False    1.500000    3.000000
+    ..              ...                ...    ...         ...         ...
+    99            dummy                 5%  False    1.000000    1.000000
+    100           dummy                95%  False    1.000000    1.000000
+    101           dummy              stdev  False    0.000000         NaN
+    102           dummy               mode  False    1.000000    1.000000
+    103           dummy             median  False    1.000000    1.000000
+    <BLANKLINE>
+    [104 rows x 5 columns]
+    """
+    # Fill missing values with the given rules.
+    if fill_missing_report:
+        predicted = predicted.fillna(value=fill_missing_report)
+    predicted["dummy"] = 1
+    numeric_cols = predicted.select_dtypes(np.number).columns
+    report = (
+        predicted.groupby(["Cluster"])[  # construct report dataframe
+            numeric_cols
+        ]  # group by cluster number
+        .agg(
+            [
+                ("sum", "sum"),
+                ("mean_with_zeros", lambda x: np.mean(np.nan_to_num(x))),
+                ("mean_without_zeros", lambda x: x.replace(0, np.nan).mean()),
+                (
+                    "mean_25-75",
+                    lambda x: np.mean(
+                        np.nan_to_num(
+                            sorted(x)[
+                                round(len(x) * 25 / 100) : round(len(x) * 75 / 100)
+                            ]
+                        )
+                    ),
+                ),
+                ("mean_with_na", "mean"),
+                ("min", lambda x: x.min()),
+                ("5%", lambda x: x.quantile(0.05)),
+                ("25%", lambda x: x.quantile(0.25)),
+                ("50%", lambda x: x.quantile(0.50)),
+                ("75%", lambda x: x.quantile(0.75)),
+                ("95%", lambda x: x.quantile(0.95)),
+                ("max", lambda x: x.max()),
+                ("count", lambda x: x.count()),
+                ("stdev", lambda x: x.std()),
+                ("mode", lambda x: x.mode()[0]),
+                ("median", lambda x: x.median()),
+                ("# > 0", lambda x: (x > 0).sum()),
+            ]
+        )
+        .T.reset_index()
+        .rename(index=str, columns={"level_0": "Features", "level_1": "Type"})
+    )  # rename columns
+    # Calculate the size of each cluster (count of client IDs).
+    # Copy to avoid SettingWithCopyWarning.
+    clustersize = report[
+        (report["Features"] == "dummy") & (report["Type"] == "count")
+    ].copy()
+    # Rename the rows to match the report column names.
+    clustersize.Type = "ClusterSize"
+    clustersize.Features = "# of Customers"
+    # Calculate the proportion of each cluster.
+    clusterproportion = pd.DataFrame(
+        clustersize.iloc[:, 2:].to_numpy() / clustersize.iloc[:, 2:].to_numpy().sum()
+    )
+    # Rename the rows to match the report column names.
+    clusterproportion["Type"] = "% of Customers"
+    clusterproportion["Features"] = "ClusterProportion"
+    cols = clusterproportion.columns.tolist()
+    cols = cols[-2:] + cols[:-2]
+    clusterproportion = clusterproportion[cols]  # rearrange columns to match report
+    clusterproportion.columns = report.columns
+    # Generate a dataframe with the count of NaN values.
+    a = pd.DataFrame(
+        abs(
+            report[report["Type"] == "count"].iloc[:, 2:].to_numpy()
+            - clustersize.iloc[:, 2:].to_numpy()
+        )
+    )
+    a["Features"] = 0
+    a["Type"] = "# of nan"
+    # Fill in the feature names to match the report.
+    a.Features = report[report["Type"] == "count"].Features.tolist()
+    cols = a.columns.tolist()
+    cols = cols[-2:] + cols[:-2]
+    a = a[cols]  # rearrange columns to match report
+    a.columns = report.columns  # rename columns to match report
+    # Drop count values except for the cluster size.
+    report = report.drop(report[report.Type == "count"].index)
+    # Concatenate the report with the cluster sizes and NaN counts.
+    report = pd.concat([report, a, clustersize, clusterproportion], axis=0)
+    report["Mark"] = report["Features"].isin(clustering_variables)
+    cols = report.columns.tolist()
+    cols = cols[0:2] + cols[-1:] + cols[2:-1]
+    report = report[cols]
+    sorter1 = {
+        "ClusterSize": 9,
+        "ClusterProportion": 8,
+        "mean_with_zeros": 7,
+        "mean_with_na": 6,
+        "max": 5,
+        "50%": 4,
+        "min": 3,
+        "25%": 2,
+        "75%": 1,
+        "# of nan": 0,
+        "# > 0": -1,
+        "sum_with_na": -2,
+    }
+    report = (
+        report.assign(
+            Sorter1=lambda x: x.Type.map(sorter1),
+            Sorter2=lambda x: list(reversed(range(len(x)))),
+        )
+        .sort_values(["Sorter1", "Mark", "Sorter2"], ascending=False)
+        .drop(["Sorter1", "Sorter2"], axis=1)
+    )
+    report.columns.name = ""
+    report = report.reset_index()
+    report = report.drop(columns=["index"])
+    return report
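+
+
+# Example (set to True to run): a minimal report_generator call mirroring the
+# doctest above; the DataFrame only needs numeric feature columns plus a
+# "Cluster" column with the k-means labels.
+if False:
+    labelled = pd.DataFrame({"col1": [0.5, 2.5, 4.5], "col2": [100, 200, 300]})
+    labelled["Cluster"] = [1, 1, 2]
+    print(report_generator(labelled, ["col1", "col2"], 0))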
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod()

From 8709ed8b41c8efa669251f8423af92e547949ed1 Mon Sep 17 00:00:00 2001
From: leolamien
Date: Wed, 17 Sep 2025 18:22:18 +0000
Subject: [PATCH 2/3] updating DIRECTORY.md

---
 DIRECTORY.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/DIRECTORY.md b/DIRECTORY.md
index 36acb3b97f1e..035c43411809 100644
--- a/DIRECTORY.md
+++ b/DIRECTORY.md
@@ -605,6 +605,7 @@
   * [Gradient Boosting Classifier](machine_learning/gradient_boosting_classifier.py)
   * [Gradient Descent](machine_learning/gradient_descent.py)
   * [K Means Clust](machine_learning/k_means_clust.py)
+  * [K Means Plus Plus](machine_learning/k_means_plus_plus.py)
   * [K Nearest Neighbours](machine_learning/k_nearest_neighbours.py)
   * [Linear Discriminant Analysis](machine_learning/linear_discriminant_analysis.py)
   * [Linear Regression](machine_learning/linear_regression.py)

From 5eb32c79e514cf0cf53a3dcdd531e5b0e6d00c7d Mon Sep 17 00:00:00 2001
From: leolamien <90797794+leolamien@users.noreply.github.com>
Date: Wed, 17 Sep 2025 20:56:39 +0200
Subject: [PATCH 3/3] Set mock test to false

---
 machine_learning/k_means_plus_plus.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/machine_learning/k_means_plus_plus.py b/machine_learning/k_means_plus_plus.py
index 5743ddd2a71c..e50a26950569 100644
--- a/machine_learning/k_means_plus_plus.py
+++ b/machine_learning/k_means_plus_plus.py
@@ -244,7 +244,7 @@ def kmeans(
 
 
 # Mock test below
-if True:  # change to True to run this test case.
+if False:  # change to True to run this test case.
     from sklearn import datasets as ds
 
     dataset = ds.load_iris()