
Commit 59090fe

Improved code error handling, robustness and documentation
1 parent 05fa083 commit 59090fe


6 files changed (+1350, −1333 lines)


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -4,3 +4,4 @@ mapperplayground.ipynb
 data/.ipynb_checkpoints/*
 temporal_grapher/__pycache__/*
 temporal_grapher/.ipynb_checkpoints/*
+tests.ipynb

DemoV0.2.1.ipynb

Lines changed: 1233 additions & 0 deletions
Large diffs are not rendered by default.

DemoV0.2.ipynb

Lines changed: 0 additions & 1243 deletions
This file was deleted.

temporal_grapher/temporal_mapper.py

Lines changed: 82 additions & 64 deletions
@@ -5,14 +5,16 @@
 from weighted_clustering import *
 from tqdm import tqdm, trange
 from sklearn.metrics import pairwise_distances
+from sklearn.preprocessing import StandardScaler
+from scipy.sparse import issparse
 
 '''TemporalGraph class
 minimal usage example:
 
 # load from your data file:
-data : (n_dim, N_data) array
-time : (N_data,) array
-semantic_dist : (N_data,) array
+data : (n_dim, N_data) array-like
+time : (N_data,) array-like
+semantic_dist : (N_data,) array-like
 # choose an sklearn clusterer:
 clusterer = HDBSCAN()
 
@@ -34,58 +36,70 @@ class TemporalGraph():
 
     Attributes
     ----------
-    time : ndarray
-        time array (1 dim)
-    data : ndarray
-        data array (n dim)
-    clusterer : sklearn clusterer
-        the clusterer to use for the slice-wise clustering, must accept sample_weights
-    N_checkpoints : int
-        number of time-points at which to cluster
-    checkpoints : arraylike
-        array of time-points at which to cluster
-    show_outliers : bool
-        If true, include unclustered points in the graph
-    slice_method : str
-        One of 'time' or 'data'. If time, generates N_checkpoints evenly spaced in time. If data,
-        generates N_checkpoints such that there are equal amounts of data between the points.
-    rate_sensitivity : float
-        A positive float, or -1. The rate parameter is raised to this parameter, so higher numbers
-        means that the algorithm is more sensitive to changes in rate. If rate_sensivity == -1,
-        then the rate parameter is taken log2.
-    kernel : function
-        A function with signiture f(t0, t, density, binwidth, epsilon=0.01, params=None).
-        Two options are included in weighted_clustering.py, `weighted_clustering.square` and
-        `weighted_clustering.gaussian`.
-    kernel_parameters : tuple or None,
-        Passed to `kernel` as params kwarg.
-    precomputed_distances : ndarray
-        an (n_data, n_data) array of pairwise distances between points. If None then it will
-        be computed using `sklearn.metrics.pairwise_distances`.
-    verbose : bool
-        Does what you expect.
-
-
     G : networkx.classes.Digraph(Graph)
         The temporal graph itself.
+    density : ndarray
+        The f-density \rho for each data point.
+
     Methods
     -------
     build(ydata=None):
         Perform all operations necessary to construct the graph.
     """
+
     def __init__(
         self, time, data, clusterer,
-        N_checkpoints=None,
-        clusters=None,
-        checkpoints=None,
-        show_outliers=False,
+        N_checkpoints = None,
+        resolution = 10,
+        overlap = 0.5,
+        clusters = None,
+        checkpoints = None,
+        show_outliers = False,
         slice_method = 'time',
         rate_sensitivity = 1,
         kernel = gaussian,
         kernel_params = None,
-        precomputed_distances = None,
-        verbose=False,
+        verbose = False,
     ):
+        """
+        Parameters
+        ----------
+        time : ndarray
+            time array (1 dim)
+        data : ndarray
+            data array (n dim)
+        clusterer : sklearn clusterer
+            the clusterer to use for the slice-wise clustering, must accept sample_weights
+        N_checkpoints : int
+            number of time-points at which to cluster
+        checkpoints : arraylike
+            array of time-points at which to cluster
+        overlap : float
+            A float in (0,1) which specifies the `g` parameter (see README)
+        resolution: float
+            Determines the distance around each point which we use as a neighbourhood for
+            determining the f-rate. If you get a warning about isolated points, you should
+            increase this parameter. If you plot the density and it is not very smooth
+            you can increase this parameter.
+        show_outliers : bool
+            If true, include unclustered points in the graph
+        slice_method : str
+            One of 'time' or 'data'. If time, generates N_checkpoints evenly spaced in time. If data,
+            generates N_checkpoints such that there are equal amounts of data between the points.
+        rate_sensitivity : float
+            A positive float, or -1. The rate parameter is raised to this parameter, so higher numbers
+            means that the algorithm is more sensitive to changes in rate. If rate_sensivity == -1,
+            then the rate parameter is taken log2.
+        kernel : function
+            A function with signature f(t0, t, density, binwidth, epsilon=0.01, params=None).
+            Two options are included in weighted_clustering.py, `weighted_clustering.square` and
+            `weighted_clustering.gaussian`.
+        kernel_parameters : tuple or None,
+            Passed to `kernel` as params kwarg.
+        verbose : bool
+            Does what you expect.
+
+        """
         if np.size(time) != np.shape(data)[0]:
             raise AttributeError("Number of datapoints",
                                  np.shape(data)[0],
@@ -98,18 +112,20 @@ def __init__(
         if len(data.shape) == 1:
             data=data.reshape(-1,1)
         self.n_components = data.shape[1]
-        self.data = data
+        if issparse(data):
+            self.scaler = StandardScaler(copy=False, with_mean=False)
+        else:
+            self.scaler = StandardScaler(copy=False)
+        self.data = self.scaler.fit_transform(data)
         self.checkpoints = checkpoints
         if slice_method in ['time','data']:
             self.slice_method = slice_method
         else:
             raise AttributeError("Accepted slice_method is 'time' or 'data'.")
         if checkpoints is not None:
             self.N_checkpoints = np.size(checkpoints)
-            if (np.size(slices) == N_slices):
-                self.N_checkpoints = N_checkpoints
-            else:
-                raise AttributeError("If you pass checkpoints and N_checkpoints, then len(checkpoints) must equal N_checkpoints.")
+            if not (self.N_checkpoints == N_checkpoints):
+                raise AttributeError("Given checkpoints and N_checkpoints, len(checkpoints) must equal N_checkpoints.")
         else:
             if N_checkpoints is not None:
                 self.N_checkpoints = N_checkpoints
@@ -118,7 +134,8 @@ def __init__(
 
         self.clusterer = clusterer
         self.clusters = clusters
-        self.densities = None
+        self.g = overlap
+        self.density = None
         self.sensitivity = rate_sensitivity
         self.kernel = kernel
         self.kernel_params = kernel_params
@@ -128,9 +145,10 @@ def __init__(
         self.verbose=verbose
         self.disable = not verbose # tqdm
         self.show_outliers = False
-        self.distances = precomputed_distances
-        if precomputed_distances is None:
-            self.distances = pairwise_distances(data)
+        self.resolution = resolution
+        if self.verbose:
+            print("Computing pairwise distances...")
+        self.distance = pairwise_distances(data)
 
     def _compute_checkpoints(self):
         if self.slice_method == 'data':
@@ -142,7 +160,7 @@ def _compute_checkpoints(self):
         self.checkpoints = checkpoints
         return checkpoints
 
-    def _compute_densities(self):
+    def _compute_density(self):
         if self.checkpoints is None:
             self._compute_checkpoints()
         if self.verbose:
@@ -154,33 +172,33 @@ def _compute_densities(self):
         rates = compute_point_rates(
             self.data,
             self.time,
-            self.distances,
+            self.distance,
+            self.resolution*data_width,
             sensitivity=self.sensitivity,
-            width=data_width/10,
         )
         iso_idx = (rates==np.inf)
         nisolated = np.size((iso_idx).nonzero())
         if nisolated != 0:
-            print(f'Warning: You have {nisolated} isolated points. If this is a small number, its probably fine.')
-        densities = 1/rates
-        densities = sigmoid(densities, np.median(densities))
+            print(f'Warning: You have {nisolated} isolated points. If this is a small number, its probably fine. Otherwise, increase the resolution parameter.')
+        density = 1/rates
+        density = std_sigmoid(density)
         if self.sensitivity == -1:
-            self.densities = 1/(1-np.log2(densities))
+            self.density = 1/(1-np.log2(density))
         else:
-            self.densities = densities**self.sensitivity
-        self.densities[iso_idx] = 0
-        return self.densities
+            self.density = density**self.sensitivity
+        self.density[iso_idx] = np.amin(density[~iso_idx])
+        return self.density
 
     def _cluster(self):
-        if self.densities is None:
-            self._compute_densities()
+        if self.density is None:
+            self._compute_density()
         if self.verbose:
             print("Clusting at each time slice...")
         clusters, weights = weighted_clusters(
             self.data,
             self.time,
             self.checkpoints,
-            self.densities,
+            self.density,
             self.clusterer,
             self.kernel,
             self.kernel_params,
@@ -279,7 +297,7 @@ def build_adj_matrix(self):
                     adj_mat[l][k] += self.kernel(
                         time_centers[i],
                         self.time[j],
-                        self.densities[j],
+                        self.density[j],
                         bin_width[i],
                         params=self.kernel_params
                     )
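
With this commit the constructor standardizes the data itself (sparse input gets StandardScaler(with_mean=False)), always computes its own pairwise distances, and exposes resolution and overlap in place of precomputed_distances. A minimal sketch of the updated call, assembled from the docstring above; the flat import path, the HDBSCAN choice, and the bare build() call are taken from the docstring's usage example and Methods entry, not from code shown in this diff:

import numpy as np
from sklearn.cluster import HDBSCAN        # any sklearn clusterer that accepts sample weights
from temporal_mapper import TemporalGraph  # assumed flat import, matching this repo's own imports

rng = np.random.default_rng(0)
time = np.sort(rng.uniform(0.0, 10.0, size=500))         # (N_data,)
data = rng.normal(size=(500, 2)) + 0.1 * time[:, None]   # (N_data, n_dim), drifting over time

TG = TemporalGraph(
    time, data, HDBSCAN(),
    N_checkpoints=8,     # evenly spaced in time, since slice_method defaults to 'time'
    resolution=10,       # widen the f-rate neighbourhood if the isolated-points warning appears
    overlap=0.5,         # the `g` parameter described in the README
    verbose=True,
)
TG.build()               # populates TG.G (a networkx DiGraph) and TG.density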

temporal_grapher/utilities_.py

Lines changed: 26 additions & 16 deletions
@@ -4,12 +4,15 @@
 from vectorizers.transformers import InformationWeightTransformer
 from vectorizers import NgramVectorizer
 
-def sigmoid(x, mu):
-    return 1/(1+np.exp(-(x-mu)))
+def std_sigmoid(x):
+    mu = np.mean(x)
+    std = np.std(x)
+    transform=(x-mu)/(std)
+    return 1/(1+np.exp(-1*transform))
 
 def cluster_avg_1D(cluster_data, y_data):
-    # Average out the y_data in each cluster,
-    # to use as y-axis positions for the graph visualization
+    ''' Average out the y_data in each cluster,
+    to use as y-axis positions for the graph visualization '''
     clusters = np.unique(cluster_data)
     avg_arr = np.zeros(np.shape(clusters))
     i = 0
@@ -27,7 +30,7 @@ def cluster_avg_1D(cluster_data, y_data):
     return avg_arr
 
 def cluster_most_common(cluster_data, y_data):
-    # Get the most common y_data val in each cluster
+    ''' Get the most common y_data val in each cluster '''
     clusters = np.unique(cluster_data)
     most_arr = np.zeros(np.shape(clusters), dtype=int)
     i = 0
@@ -44,6 +47,7 @@ def cluster_most_common(cluster_data, y_data):
     return most_arr
 
 def graph_to_holoviews(G,dataset_func=None):
+    ''' Take TemporalGraph.G and output the required HoloViews objects for a modified Sankey diagram.'''
     nxNodes = G.nodes()
     nodes = nxNodes # lol
     cnt = 0
@@ -98,7 +102,9 @@ def compute_cluster_yaxis(clusters, semantic_dist, func=cluster_avg_1D):
 
     return y_data
 
-def generate_keyword_labels(word_bags, TG, newline=True):
+def generate_keyword_labels(word_bags, TG, n_words=3, sep=' '):
+    """ Using a bag of words corresponding to each data point, get highly informative
+    keywords for each cluster """
     ngram_vectorizer = NgramVectorizer()
     ngram_vectors = ngram_vectorizer.fit_transform(word_bags)
     ## Building cluster labels (crudely)
@@ -123,11 +129,13 @@ def generate_keyword_labels(word_bags, TG, newline=True):
         cluster_keywords = []
         for cl_vector in weighted_vectors:
             cl_vector = np.squeeze(cl_vector)
-            first_, second_, third_ = np.argsort(cl_vector)[-3:]
-            w1 = ngram_vectorizer._inverse_token_dictionary_[first_]
-            w2 = ngram_vectorizer._inverse_token_dictionary_[second_]
-            w3 = ngram_vectorizer._inverse_token_dictionary_[third_]
-            row = np.array([w1,w2,w3])
+            highest = np.argsort(cl_vector)[-n_words:]
+            row = []
+            for k in highest:
+                word = ngram_vectorizer._inverse_token_dictionary_[k]
+                row.append(word)
+            #w2 = ngram_vectorizer._inverse_token_dictionary_[second_]
+            row = np.array(row)
             cluster_keywords.append(row)
         keywords.append(cluster_keywords)
     t_attrs = nx.get_node_attributes(TG.G, 'slice_no')
@@ -137,11 +145,13 @@ def generate_keyword_labels(word_bags, TG, newline=True):
         t_idx = t_attrs[node]
         cl_idx = cl_attrs[node]
         words = keywords[t_idx][cl_idx]
-        if newline:
-            label_attrs[node] = words[0]+'\n'+words[1]+'\n'+words[2]
-        else:
-            label_attrs[node] = words[0]+' '+words[1]+' '+words[2]
-
+        s = ''
+        for word in words[:-1]:
+            s += word+sep
+        s += word[-1]
+        label_attrs[node] = s
+
+    print("Complete. ")
     nx.set_node_attributes(TG.G, label_attrs, 'label')
     return TG
 
temporal_grapher/weighted_clustering.py

Lines changed: 8 additions & 10 deletions
@@ -20,18 +20,13 @@ def square(t0, t, density, binwidth, epsilon = 0.1, params=(1,)):
     return out
 
 def window(distance, width=1):
-    # default to 10 because UMAP
     if np.abs(distance) < width:
         return (1/2)*(1+np.cos(np.pi*distance/width))
     else:
         return 0
 
-def compute_point_rates(data, time, distances, width=1, sensitivity=1):
-    data_width = np.mean(
-        [np.amax(data[:,k])-np.amin(data[:,k])
-         for k in range(data.shape[1])]
-    )
-    d_max = 100*data_width/np.size(time)
+def compute_point_rates(data, time, distances, width, sensitivity=1):
+    d_max = width/np.size(time)
     lambdas = np.zeros(np.size(time))
     for i,d in enumerate(distances):
         t0 = time[i]
@@ -52,16 +47,19 @@ def compute_point_rates(data, time, distances, width=1, sensitivity=1):
             lambdas[i] = 0
         else:
             lambdas[i] = np.average(deltas, weights=time_weights)
+    iso_idx = (lambdas == 0).nonzero()
     # apply the window:
     smoothed_lambdas = np.zeros(np.size(time))
     for j, d in enumerate(distances):
         val = 0
         norm = 0
-        idx=(d<=10*d_max).nonzero()[0]
+        idx=(d<=25*d_max).nonzero()[0]
         for i in idx:
-            val += window(d[i], width)*lambdas[i]
-            norm += window(d[i], width)
+            val += window(d[i], 5*d_max)*lambdas[i]
+            norm += window(d[i], 5*d_max)
         smoothed_lambdas[j] = val/norm
+
+    #smoothed_lambdas = lambdas
     iso_idx = (smoothed_lambdas == 0)
     rates = smoothed_lambdas
     rates[iso_idx] = np.inf
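
compute_point_rates now takes its neighbourhood width from the caller (TemporalGraph passes resolution*data_width) and smooths the raw per-point rates with the raised-cosine window at radius 5*d_max, only considering neighbours within 25*d_max. A condensed sketch of just that smoothing step on toy inputs; the rate estimation that produces lambdas is omitted, and the d_max value here is arbitrary:

import numpy as np

def window(distance, width=1):
    # Raised-cosine weight: 1 at distance 0, falling to 0 at `width` (as in the diff).
    if np.abs(distance) < width:
        return 0.5 * (1 + np.cos(np.pi * distance / width))
    return 0.0

lambdas = np.array([1.0, 1.2, 5.0, 0.9])                                # toy raw per-point rates
distances = np.abs(np.subtract.outer(np.arange(4.0), np.arange(4.0)))   # toy pairwise distances
d_max = 1.5                                          # the committed code uses width/np.size(time)

smoothed = np.zeros_like(lambdas)
for j, d in enumerate(distances):
    idx = (d <= 25 * d_max).nonzero()[0]             # candidate neighbours
    w = np.array([window(d[i], 5 * d_max) for i in idx])
    smoothed[j] = np.sum(w * lambdas[idx]) / np.sum(w)
print(smoothed.round(3))

Each point's rate becomes a cosine-weighted average of its neighbours' rates, which is what smooths the density that _compute_density later derives from these rates.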
