55from weighted_clustering import *
66from tqdm import tqdm , trange
77from sklearn .metrics import pairwise_distances
8+ from sklearn .preprocessing import StandardScaler
9+ from scipy .sparse import issparse
810
911'''TemporalGraph class
1012minimal usage example:
1113
1214 # load from your data file:
13- data : (n_dim, N_data) array
14- time : (N_data,) array
15- semantic_dist : (N_data,) array
15+ data : (n_dim, N_data) array-like
16+ time : (N_data,) array-like
17+ semantic_dist : (N_data,) array-like
1618 # choose an sklearn clusterer:
1719 clusterer = HDBSCAN()
1820
@@ -34,58 +36,70 @@ class TemporalGraph():
3436
3537 Attributes
3638 ----------
37- time : ndarray
38- time array (1 dim)
39- data : ndarray
40- data array (n dim)
41- clusterer : sklearn clusterer
42- the clusterer to use for the slice-wise clustering, must accept sample_weights
43- N_checkpoints : int
44- number of time-points at which to cluster
45- checkpoints : arraylike
46- array of time-points at which to cluster
47- show_outliers : bool
48- If true, include unclustered points in the graph
49- slice_method : str
50- One of 'time' or 'data'. If time, generates N_checkpoints evenly spaced in time. If data,
51- generates N_checkpoints such that there are equal amounts of data between the points.
52- rate_sensitivity : float
53- A positive float, or -1. The rate parameter is raised to this parameter, so higher numbers
54- means that the algorithm is more sensitive to changes in rate. If rate_sensivity == -1,
55- then the rate parameter is taken log2.
56- kernel : function
57- A function with signiture f(t0, t, density, binwidth, epsilon=0.01, params=None).
58- Two options are included in weighted_clustering.py, `weighted_clustering.square` and
59- `weighted_clustering.gaussian`.
60- kernel_parameters : tuple or None,
61- Passed to `kernel` as params kwarg.
62- precomputed_distances : ndarray
63- an (n_data, n_data) array of pairwise distances between points. If None then it will
64- be computed using `sklearn.metrics.pairwise_distances`.
65- verbose : bool
66- Does what you expect.
67-
68-
6939 G : networkx.classes.Digraph(Graph)
7040 The temporal graph itself.
41+ density : ndarray
42+ The f-density \rho for each data point.
43+
7144 Methods
7245 -------
7346 build(ydata=None):
7447 Perform all operations necessary to construct the graph.
7548 """
49+
7650 def __init__ (
7751 self , time , data , clusterer ,
78- N_checkpoints = None ,
79- clusters = None ,
80- checkpoints = None ,
81- show_outliers = False ,
52+ N_checkpoints = None ,
53+ resolution = 10 ,
54+ overlap = 0.5 ,
55+ clusters = None ,
56+ checkpoints = None ,
57+ show_outliers = False ,
8258 slice_method = 'time' ,
8359 rate_sensitivity = 1 ,
8460 kernel = gaussian ,
8561 kernel_params = None ,
86- precomputed_distances = None ,
87- verbose = False ,
62+ verbose = False ,
8863 ):
64+ """
65+ Parameters
66+ ----------
67+ time : ndarray
68+ time array (1 dim)
69+ data : ndarray
70+ data array (n dim)
71+ clusterer : sklearn clusterer
72+ the clusterer to use for the slice-wise clustering, must accept sample_weights
73+ N_checkpoints : int
74+ number of time-points at which to cluster
75+ checkpoints : arraylike
76+ array of time-points at which to cluster
77+ overlap : float
78+ A float in (0,1) which specifies the `g` parameter (see README)
79+ resolution : float
80+ Determines the distance around each point which we use as a neighbourhood for
81+ determining the f-rate. If you get a warning about isolated points, you should
82+ increase this parameter. If you plot the density and it is not very smooth
83+ you can increase this parameter.
84+ show_outliers : bool
85+ If true, include unclustered points in the graph
86+ slice_method : str
87+ One of 'time' or 'data'. If time, generates N_checkpoints evenly spaced in time. If data,
88+ generates N_checkpoints such that there are equal amounts of data between the points.
89+ rate_sensitivity : float
90+ A positive float, or -1. The rate parameter is raised to this power, so higher numbers
91+ mean that the algorithm is more sensitive to changes in rate. If rate_sensitivity == -1,
92+ then the log2 of the rate parameter is taken instead.
93+ kernel : function
94+ A function with signature f(t0, t, density, binwidth, epsilon=0.01, params=None).
95+ Two options are included in weighted_clustering.py, `weighted_clustering.square` and
96+ `weighted_clustering.gaussian`.
97+ kernel_params : tuple or None
98+ Passed to `kernel` as params kwarg.
99+ verbose : bool
100+ If true, print progress messages and enable the tqdm progress bars.
101+
102+ """
89103 if np .size (time ) != np .shape (data )[0 ]:
90104 raise AttributeError ("Number of datapoints" ,
91105 np .shape (data )[0 ],
@@ -98,18 +112,20 @@ def __init__(
98112 if len (data .shape ) == 1 :
99113 data = data .reshape (- 1 ,1 )
100114 self .n_components = data .shape [1 ]
101- self .data = data
115+ if issparse (data ):
116+ self .scaler = StandardScaler (copy = False , with_mean = False )
117+ else :
118+ self .scaler = StandardScaler (copy = False )
119+ self .data = self .scaler .fit_transform (data )
102120 self .checkpoints = checkpoints
103121 if slice_method in ['time' ,'data' ]:
104122 self .slice_method = slice_method
105123 else :
106124 raise AttributeError ("Accepted slice_method is 'time' or 'data'." )
107125 if checkpoints is not None :
108126 self .N_checkpoints = np .size (checkpoints )
109- if (np .size (slices ) == N_slices ):
110- self .N_checkpoints = N_checkpoints
111- else :
112- raise AttributeError ("If you pass checkpoints and N_checkpoints, then len(checkpoints) must equal N_checkpoints." )
127+ if not (self .N_checkpoints == N_checkpoints ):
128+ raise AttributeError ("Given checkpoints and N_checkpoints, len(checkpoints) must equal N_checkpoints." )
113129 else :
114130 if N_checkpoints is not None :
115131 self .N_checkpoints = N_checkpoints
@@ -118,7 +134,8 @@ def __init__(
118134
119135 self .clusterer = clusterer
120136 self .clusters = clusters
121- self .densities = None
137+ self .g = overlap
138+ self .density = None
122139 self .sensitivity = rate_sensitivity
123140 self .kernel = kernel
124141 self .kernel_params = kernel_params
@@ -128,9 +145,10 @@ def __init__(
128145 self .verbose = verbose
129146 self .disable = not verbose # tqdm
130147 self .show_outliers = False
131- self .distances = precomputed_distances
132- if precomputed_distances is None :
133- self .distances = pairwise_distances (data )
148+ self .resolution = resolution
149+ if self .verbose :
150+ print ("Computing pairwise distances..." )
151+ self .distance = pairwise_distances (data )
134152
135153 def _compute_checkpoints (self ):
136154 if self .slice_method == 'data' :
@@ -142,7 +160,7 @@ def _compute_checkpoints(self):
142160 self .checkpoints = checkpoints
143161 return checkpoints
144162
145- def _compute_densities (self ):
163+ def _compute_density (self ):
146164 if self .checkpoints is None :
147165 self ._compute_checkpoints ()
148166 if self .verbose :
@@ -154,33 +172,33 @@ def _compute_densities(self):
154172 rates = compute_point_rates (
155173 self .data ,
156174 self .time ,
157- self .distances ,
175+ self .distance ,
176+ self .resolution * data_width ,
158177 sensitivity = self .sensitivity ,
159- width = data_width / 10 ,
160178 )
161179 iso_idx = (rates == np .inf )
162180 nisolated = np .size ((iso_idx ).nonzero ())
163181 if nisolated != 0 :
164- print (f'Warning: You have { nisolated } isolated points. If this is a small number, its probably fine.' )
165- densities = 1 / rates
166- densities = sigmoid ( densities , np . median ( densities ) )
182+ print (f'Warning: You have { nisolated } isolated points. If this is a small number, its probably fine. Otherwise, increase the resolution parameter. ' )
183+ density = 1 / rates
184+ density = std_sigmoid ( density )
167185 if self .sensitivity == - 1 :
168- self .densities = 1 / (1 - np .log2 (densities ))
186+ self .density = 1 / (1 - np .log2 (density ))
169187 else :
170- self .densities = densities ** self .sensitivity
171- self .densities [iso_idx ] = 0
172- return self .densities
188+ self .density = density ** self .sensitivity
189+ self .density [iso_idx ] = np . amin ( density [ ~ iso_idx ])
190+ return self .density
173191
174192 def _cluster (self ):
175- if self .densities is None :
176- self ._compute_densities ()
193+ if self .density is None :
194+ self ._compute_density ()
177195 if self .verbose :
178196 print ("Clusting at each time slice..." )
179197 clusters , weights = weighted_clusters (
180198 self .data ,
181199 self .time ,
182200 self .checkpoints ,
183- self .densities ,
201+ self .density ,
184202 self .clusterer ,
185203 self .kernel ,
186204 self .kernel_params ,
@@ -279,7 +297,7 @@ def build_adj_matrix(self):
279297 adj_mat [l ][k ] += self .kernel (
280298 time_centers [i ],
281299 self .time [j ],
282- self .densities [j ],
300+ self .density [j ],
283301 bin_width [i ],
284302 params = self .kernel_params
285303 )
0 commit comments