
Commit 59090fe

Improved code error handling, robustness and documentation
1 parent 05fa083 commit 59090fe


6 files changed (+1350, −1333 lines)


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -4,3 +4,4 @@ mapperplayground.ipynb
 data/.ipynb_checkpoints/*
 temporal_grapher/__pycache__/*
 temporal_grapher/.ipynb_checkpoints/*
+tests.ipynb

DemoV0.2.1.ipynb

Lines changed: 1233 additions & 0 deletions
Large diffs are not rendered by default.

DemoV0.2.ipynb

Lines changed: 0 additions & 1243 deletions
This file was deleted.

temporal_grapher/temporal_mapper.py

Lines changed: 82 additions & 64 deletions
@@ -5,14 +5,16 @@
 from weighted_clustering import *
 from tqdm import tqdm, trange
 from sklearn.metrics import pairwise_distances
+from sklearn.preprocessing import StandardScaler
+from scipy.sparse import issparse
 
 '''TemporalGraph class
 minimal usage example:
 
 # load from your data file:
-data : (n_dim, N_data) array
-time : (N_data,) array
-semantic_dist : (N_data,) array
+data : (n_dim, N_data) array-like
+time : (N_data,) array-like
+semantic_dist : (N_data,) array-like
 # choose an sklearn clusterer:
 clusterer = HDBSCAN()
 
@@ -34,58 +36,70 @@ class TemporalGraph():
 
     Attributes
     ----------
-    time : ndarray
-        time array (1 dim)
-    data : ndarray
-        data array (n dim)
-    clusterer : sklearn clusterer
-        the clusterer to use for the slice-wise clustering, must accept sample_weights
-    N_checkpoints : int
-        number of time-points at which to cluster
-    checkpoints : arraylike
-        array of time-points at which to cluster
-    show_outliers : bool
-        If true, include unclustered points in the graph
-    slice_method : str
-        One of 'time' or 'data'. If time, generates N_checkpoints evenly spaced in time. If data,
-        generates N_checkpoints such that there are equal amounts of data between the points.
-    rate_sensitivity : float
-        A positive float, or -1. The rate parameter is raised to this parameter, so higher numbers
-        means that the algorithm is more sensitive to changes in rate. If rate_sensivity == -1,
-        then the rate parameter is taken log2.
-    kernel : function
-        A function with signiture f(t0, t, density, binwidth, epsilon=0.01, params=None).
-        Two options are included in weighted_clustering.py, `weighted_clustering.square` and
-        `weighted_clustering.gaussian`.
-    kernel_parameters : tuple or None,
-        Passed to `kernel` as params kwarg.
-    precomputed_distances : ndarray
-        an (n_data, n_data) array of pairwise distances between points. If None then it will
-        be computed using `sklearn.metrics.pairwise_distances`.
-    verbose : bool
-        Does what you expect.
-
-
     G : networkx.classes.Digraph(Graph)
         The temporal graph itself.
+    density : ndarray
+        The f-density \rho for each data point.
+
     Methods
     -------
     build(ydata=None):
         Perform all operations necessary to construct the graph.
     """
+
     def __init__(
         self, time, data, clusterer,
-        N_checkpoints=None,
-        clusters=None,
-        checkpoints=None,
-        show_outliers=False,
+        N_checkpoints = None,
+        resolution = 10,
+        overlap = 0.5,
+        clusters = None,
+        checkpoints = None,
+        show_outliers = False,
         slice_method = 'time',
         rate_sensitivity = 1,
         kernel = gaussian,
         kernel_params = None,
-        precomputed_distances = None,
-        verbose=False,
+        verbose = False,
     ):
+        """
+        Parameters
+        ----------
+        time : ndarray
+            time array (1 dim)
+        data : ndarray
+            data array (n dim)
+        clusterer : sklearn clusterer
+            the clusterer to use for the slice-wise clustering, must accept sample_weights
+        N_checkpoints : int
+            number of time-points at which to cluster
+        checkpoints : arraylike
+            array of time-points at which to cluster
+        overlap : float
+            A float in (0,1) which specifies the `g` parameter (see README)
+        resolution: float
+            Determines the distance around each point which we use as a neighbourhood for
+            determining the f-rate. If you get a warning about isolated points, you should
+            increase this parameter. If you plot the density and it is not very smooth
+            you can increase this parameter.
+        show_outliers : bool
+            If true, include unclustered points in the graph
+        slice_method : str
+            One of 'time' or 'data'. If time, generates N_checkpoints evenly spaced in time. If data,
+            generates N_checkpoints such that there are equal amounts of data between the points.
+        rate_sensitivity : float
+            A positive float, or -1. The rate parameter is raised to this parameter, so higher numbers
+            means that the algorithm is more sensitive to changes in rate. If rate_sensivity == -1,
+            then the rate parameter is taken log2.
+        kernel : function
+            A function with signature f(t0, t, density, binwidth, epsilon=0.01, params=None).
+            Two options are included in weighted_clustering.py, `weighted_clustering.square` and
+            `weighted_clustering.gaussian`.
+        kernel_parameters : tuple or None,
+            Passed to `kernel` as params kwarg.
+        verbose : bool
+            Does what you expect.
+
+        """
         if np.size(time) != np.shape(data)[0]:
             raise AttributeError("Number of datapoints",
                                  np.shape(data)[0],
@@ -98,18 +112,20 @@ def __init__(
         if len(data.shape) == 1:
             data=data.reshape(-1,1)
         self.n_components = data.shape[1]
-        self.data = data
+        if issparse(data):
+            self.scaler = StandardScaler(copy=False, with_mean=False)
+        else:
+            self.scaler = StandardScaler(copy=False)
+        self.data = self.scaler.fit_transform(data)
         self.checkpoints = checkpoints
         if slice_method in ['time','data']:
             self.slice_method = slice_method
         else:
             raise AttributeError("Accepted slice_method is 'time' or 'data'.")
         if checkpoints is not None:
             self.N_checkpoints = np.size(checkpoints)
-            if (np.size(slices) == N_slices):
-                self.N_checkpoints = N_checkpoints
-            else:
-                raise AttributeError("If you pass checkpoints and N_checkpoints, then len(checkpoints) must equal N_checkpoints.")
+            if not (self.N_checkpoints == N_checkpoints):
+                raise AttributeError("Given checkpoints and N_checkpoints, len(checkpoints) must equal N_checkpoints.")
         else:
             if N_checkpoints is not None:
                 self.N_checkpoints = N_checkpoints
@@ -118,7 +134,8 @@ def __init__(
 
         self.clusterer = clusterer
         self.clusters = clusters
-        self.densities = None
+        self.g = overlap
+        self.density = None
         self.sensitivity = rate_sensitivity
         self.kernel = kernel
         self.kernel_params = kernel_params
@@ -128,9 +145,10 @@ def __init__(
         self.verbose=verbose
         self.disable = not verbose # tqdm
         self.show_outliers = False
-        self.distances = precomputed_distances
-        if precomputed_distances is None:
-            self.distances = pairwise_distances(data)
+        self.resolution = resolution
+        if self.verbose:
+            print("Computing pairwise distances...")
+        self.distance = pairwise_distances(data)
 
     def _compute_checkpoints(self):
         if self.slice_method == 'data':
@@ -142,7 +160,7 @@ def _compute_checkpoints(self):
         self.checkpoints = checkpoints
         return checkpoints
 
-    def _compute_densities(self):
+    def _compute_density(self):
         if self.checkpoints is None:
             self._compute_checkpoints()
         if self.verbose:
@@ -154,33 +172,33 @@ def _compute_densities(self):
         rates = compute_point_rates(
             self.data,
             self.time,
-            self.distances,
+            self.distance,
+            self.resolution*data_width,
             sensitivity=self.sensitivity,
-            width=data_width/10,
         )
         iso_idx = (rates==np.inf)
         nisolated = np.size((iso_idx).nonzero())
         if nisolated != 0:
-            print(f'Warning: You have {nisolated} isolated points. If this is a small number, its probably fine.')
-        densities = 1/rates
-        densities = sigmoid(densities, np.median(densities))
+            print(f'Warning: You have {nisolated} isolated points. If this is a small number, its probably fine. Otherwise, increase the resolution parameter.')
+        density = 1/rates
+        density = std_sigmoid(density)
         if self.sensitivity == -1:
-            self.densities = 1/(1-np.log2(densities))
+            self.density = 1/(1-np.log2(density))
         else:
-            self.densities = densities**self.sensitivity
-        self.densities[iso_idx] = 0
-        return self.densities
+            self.density = density**self.sensitivity
+        self.density[iso_idx] = np.amin(density[~iso_idx])
+        return self.density
 
     def _cluster(self):
-        if self.densities is None:
-            self._compute_densities()
+        if self.density is None:
+            self._compute_density()
         if self.verbose:
             print("Clusting at each time slice...")
         clusters, weights = weighted_clusters(
             self.data,
             self.time,
             self.checkpoints,
-            self.densities,
+            self.density,
             self.clusterer,
             self.kernel,
             self.kernel_params,
@@ -279,7 +297,7 @@ def build_adj_matrix(self):
                     adj_mat[l][k] += self.kernel(
                         time_centers[i],
                         self.time[j],
-                        self.densities[j],
+                        self.density[j],
                         bin_width[i],
                         params=self.kernel_params
                     )
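
With this commit the constructor standardizes the data itself (sparse input gets StandardScaler(with_mean=False)), always computes its own pairwise distances, and exposes resolution and overlap in place of precomputed_distances. A minimal sketch of the updated call, assembled from the docstring above; the flat import path, the HDBSCAN choice, and the bare build() call are taken from the docstring's usage example and Methods entry, not from code shown in this diff:

import numpy as np
from sklearn.cluster import HDBSCAN        # any sklearn clusterer that accepts sample weights
from temporal_mapper import TemporalGraph  # assumed flat import, matching this repo's own imports

rng = np.random.default_rng(0)
time = np.sort(rng.uniform(0.0, 10.0, size=500))         # (N_data,)
data = rng.normal(size=(500, 2)) + 0.1 * time[:, None]   # (N_data, n_dim), drifting over time

TG = TemporalGraph(
    time, data, HDBSCAN(),
    N_checkpoints=8,     # evenly spaced in time, since slice_method defaults to 'time'
    resolution=10,       # widen the f-rate neighbourhood if the isolated-points warning appears
    overlap=0.5,         # the `g` parameter described in the README
    verbose=True,
)
TG.build()               # populates TG.G (a networkx DiGraph) and TG.density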

temporal_grapher/utilities_.py

Lines changed: 26 additions & 16 deletions
@@ -4,12 +4,15 @@
 from vectorizers.transformers import InformationWeightTransformer
 from vectorizers import NgramVectorizer
 
-def sigmoid(x, mu):
-    return 1/(1+np.exp(-(x-mu)))
+def std_sigmoid(x):
+    mu = np.mean(x)
+    std = np.std(x)
+    transform=(x-mu)/(std)
+    return 1/(1+np.exp(-1*transform))
 
 def cluster_avg_1D(cluster_data, y_data):
-    # Average out the y_data in each cluster,
-    # to use as y-axis positions for the graph visualization
+    ''' Average out the y_data in each cluster,
+    to use as y-axis positions for the graph visualization '''
     clusters = np.unique(cluster_data)
     avg_arr = np.zeros(np.shape(clusters))
     i = 0
@@ -27,7 +30,7 @@ def cluster_avg_1D(cluster_data, y_data):
     return avg_arr
 
 def cluster_most_common(cluster_data, y_data):
-    # Get the most common y_data val in each cluster
+    ''' Get the most common y_data val in each cluster '''
     clusters = np.unique(cluster_data)
     most_arr = np.zeros(np.shape(clusters), dtype=int)
     i = 0
@@ -44,6 +47,7 @@ def cluster_most_common(cluster_data, y_data):
     return most_arr
 
 def graph_to_holoviews(G,dataset_func=None):
+    ''' Take TemporalGraph.G and output the required HoloViews objects for a modified Sankey diagram.'''
     nxNodes = G.nodes()
     nodes = nxNodes # lol
     cnt = 0
@@ -98,7 +102,9 @@ def compute_cluster_yaxis(clusters, semantic_dist, func=cluster_avg_1D):
 
     return y_data
 
-def generate_keyword_labels(word_bags, TG, newline=True):
+def generate_keyword_labels(word_bags, TG, n_words=3, sep=' '):
+    """ Using a bag of words corresponding to each data point, get highly informative
+    keywords for each cluster """
     ngram_vectorizer = NgramVectorizer()
     ngram_vectors = ngram_vectorizer.fit_transform(word_bags)
     ## Building cluster labels (crudely)
@@ -123,11 +129,13 @@ def generate_keyword_labels(word_bags, TG, newline=True):
         cluster_keywords = []
         for cl_vector in weighted_vectors:
             cl_vector = np.squeeze(cl_vector)
-            first_, second_, third_ = np.argsort(cl_vector)[-3:]
-            w1 = ngram_vectorizer._inverse_token_dictionary_[first_]
-            w2 = ngram_vectorizer._inverse_token_dictionary_[second_]
-            w3 = ngram_vectorizer._inverse_token_dictionary_[third_]
-            row = np.array([w1,w2,w3])
+            highest = np.argsort(cl_vector)[-n_words:]
+            row = []
+            for k in highest:
+                word = ngram_vectorizer._inverse_token_dictionary_[k]
+                row.append(word)
+            #w2 = ngram_vectorizer._inverse_token_dictionary_[second_]
+            row = np.array(row)
             cluster_keywords.append(row)
         keywords.append(cluster_keywords)
     t_attrs = nx.get_node_attributes(TG.G, 'slice_no')
@@ -137,11 +145,13 @@ def generate_keyword_labels(word_bags, TG, newline=True):
         t_idx = t_attrs[node]
         cl_idx = cl_attrs[node]
         words = keywords[t_idx][cl_idx]
-        if newline:
-            label_attrs[node] = words[0]+'\n'+words[1]+'\n'+words[2]
-        else:
-            label_attrs[node] = words[0]+' '+words[1]+' '+words[2]
-
+        s = ''
+        for word in words[:-1]:
+            s += word+sep
+        s += word[-1]
+        label_attrs[node] = s
+
+    print("Complete. ")
     nx.set_node_attributes(TG.G, label_attrs, 'label')
     return TG
 
temporal_grapher/weighted_clustering.py

Lines changed: 8 additions & 10 deletions
@@ -20,18 +20,13 @@ def square(t0, t, density, binwidth, epsilon = 0.1, params=(1,)):
     return out
 
 def window(distance, width=1):
-    # default to 10 because UMAP
     if np.abs(distance) < width:
         return (1/2)*(1+np.cos(np.pi*distance/width))
     else:
         return 0
 
-def compute_point_rates(data, time, distances, width=1, sensitivity=1):
-    data_width = np.mean(
-        [np.amax(data[:,k])-np.amin(data[:,k])
-         for k in range(data.shape[1])]
-    )
-    d_max = 100*data_width/np.size(time)
+def compute_point_rates(data, time, distances, width, sensitivity=1):
+    d_max = width/np.size(time)
     lambdas = np.zeros(np.size(time))
     for i,d in enumerate(distances):
         t0 = time[i]
@@ -52,16 +47,19 @@ def compute_point_rates(data, time, distances, width=1, sensitivity=1):
             lambdas[i] = 0
         else:
             lambdas[i] = np.average(deltas, weights=time_weights)
+    iso_idx = (lambdas == 0).nonzero()
     # apply the window:
     smoothed_lambdas = np.zeros(np.size(time))
     for j, d in enumerate(distances):
         val = 0
         norm = 0
-        idx=(d<=10*d_max).nonzero()[0]
+        idx=(d<=25*d_max).nonzero()[0]
         for i in idx:
-            val += window(d[i], width)*lambdas[i]
-            norm += window(d[i], width)
+            val += window(d[i], 5*d_max)*lambdas[i]
+            norm += window(d[i], 5*d_max)
         smoothed_lambdas[j] = val/norm
+
+    #smoothed_lambdas = lambdas
     iso_idx = (smoothed_lambdas == 0)
     rates = smoothed_lambdas
     rates[iso_idx] = np.inf
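
compute_point_rates now takes its neighbourhood width from the caller (TemporalGraph passes resolution*data_width) and smooths the raw per-point rates with the raised-cosine window at radius 5*d_max, only considering neighbours within 25*d_max. A condensed sketch of just that smoothing step on toy inputs; the rate estimation that produces lambdas is omitted, and the d_max value here is arbitrary:

import numpy as np

def window(distance, width=1):
    # Raised-cosine weight: 1 at distance 0, falling to 0 at `width` (as in the diff).
    if np.abs(distance) < width:
        return 0.5 * (1 + np.cos(np.pi * distance / width))
    return 0.0

lambdas = np.array([1.0, 1.2, 5.0, 0.9])                                # toy raw per-point rates
distances = np.abs(np.subtract.outer(np.arange(4.0), np.arange(4.0)))   # toy pairwise distances
d_max = 1.5                                          # the committed code uses width/np.size(time)

smoothed = np.zeros_like(lambdas)
for j, d in enumerate(distances):
    idx = (d <= 25 * d_max).nonzero()[0]             # candidate neighbours
    w = np.array([window(d[i], 5 * d_max) for i in idx])
    smoothed[j] = np.sum(w * lambdas[idx]) / np.sum(w)
print(smoothed.round(3))

Each point's rate becomes a cosine-weighted average of its neighbours' rates, which is what smooths the density that _compute_density later derives from these rates.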
