Skip to content

Commit 5dab8e3

Browse files
authored
Merge pull request #529 from prodrigues-tdx/master
Add new argument for limiting the maximum epsilon
2 parents aef934c + c101732 commit 5dab8e3

File tree

3 files changed

+92
-8
lines changed

3 files changed

+92
-8
lines changed

hdbscan/_hdbscan_tree.pyx

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -515,9 +515,11 @@ cdef np.ndarray[np.intp_t, ndim=1] do_labelling(
515515
if cluster < root_cluster:
516516
result[n] = -1
517517
elif cluster == root_cluster:
518-
if len(clusters) == 1 and allow_single_cluster:
518+
if len(clusters) == 1 and allow_single_cluster and cluster in cluster_label_map:
519+
# check that `cluster` still exists in `cluster_label_map`, i.e. that it was not pruned
520+
# by `max_cluster_size` or `cluster_selection_epsilon_max` before executing this
519521
if cluster_selection_epsilon != 0.0:
520-
if tree['lambda_val'][tree['child'] == n] >= 1 / cluster_selection_epsilon :
522+
if tree['lambda_val'][tree['child'] == n] >= 1 / cluster_selection_epsilon:
521523
result[n] = cluster_label_map[cluster]
522524
else:
523525
result[n] = -1
@@ -792,7 +794,8 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability,
792794
allow_single_cluster=False,
793795
match_reference_implementation=False,
794796
cluster_selection_epsilon=0.0,
795-
max_cluster_size=0):
797+
max_cluster_size=0,
798+
cluster_selection_epsilon_max=float('inf')):
796799
"""Given a tree and stability dict, produce the cluster labels
797800
(and probabilities) for a flat clustering based on the chosen
798801
cluster selection method.
@@ -819,13 +822,18 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability,
819822
certain edge cases.
820823
821824
cluster_selection_epsilon: float, optional (default 0.0)
822-
A distance threshold for cluster splits.
823-
825+
A distance threshold for cluster splits. This is the minimum
826+
epsilon allowed.
827+
824828
max_cluster_size: int, optional (default 0)
825829
The maximum size for clusters located by the EOM clusterer. Can
826830
be overridden by the cluster_selection_epsilon parameter in
827831
rare cases.
828832
833+
cluster_selection_epsilon_max: float, optional (default inf)
834+
A distance threshold for cluster splits. This is the maximum
835+
epsilon allowed.
836+
829837
Returns
830838
-------
831839
labels : ndarray (n_samples,)
@@ -842,6 +850,7 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability,
842850
cdef np.ndarray child_selection
843851
cdef dict is_cluster
844852
cdef dict cluster_sizes
853+
cdef dict node_eps
845854
cdef float subtree_stability
846855
cdef np.intp_t node
847856
cdef np.intp_t sub_node
@@ -872,18 +881,21 @@ cpdef tuple get_clusters(np.ndarray tree, dict stability,
872881
max_cluster_size = num_points + 1 # Set to a value that will never be triggered
873882
cluster_sizes = {child: child_size for child, child_size
874883
in zip(cluster_tree['child'], cluster_tree['child_size'])}
884+
node_eps = {child: 1/l for child, l
885+
in zip(cluster_tree['child'], cluster_tree['lambda_val'])}
875886
if allow_single_cluster:
876887
# Compute cluster size for the root node
877888
cluster_sizes[node_list[-1]] = np.sum(
878889
cluster_tree[cluster_tree['parent'] == node_list[-1]]['child_size'])
890+
node_eps[node_list[-1]] = np.max(1.0 / tree['lambda_val'])
879891

880892
if cluster_selection_method == 'eom':
881893
for node in node_list:
882894
child_selection = (cluster_tree['parent'] == node)
883895
subtree_stability = np.sum([
884896
stability[child] for
885897
child in cluster_tree['child'][child_selection]])
886-
if subtree_stability > stability[node] or cluster_sizes[node] > max_cluster_size:
898+
if subtree_stability > stability[node] or cluster_sizes[node] > max_cluster_size or node_eps[node] > cluster_selection_epsilon_max:
887899
is_cluster[node] = False
888900
stability[node] = subtree_stability
889901
else:

hdbscan/hdbscan_.py

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ def _tree_to_labels(
7272
match_reference_implementation=False,
7373
cluster_selection_epsilon=0.0,
7474
max_cluster_size=0,
75+
cluster_selection_epsilon_max=float('inf'),
7576
):
7677
"""Converts a pretrained tree and cluster size into a
7778
set of labels and probabilities.
@@ -86,6 +87,7 @@ def _tree_to_labels(
8687
match_reference_implementation,
8788
cluster_selection_epsilon,
8889
max_cluster_size,
90+
cluster_selection_epsilon_max,
8991
)
9092

9193
return (labels, probabilities, stabilities, condensed_tree, single_linkage_tree)
@@ -529,6 +531,7 @@ def hdbscan(
529531
cluster_selection_method="eom",
530532
allow_single_cluster=False,
531533
match_reference_implementation=False,
534+
cluster_selection_epsilon_max=float('inf'),
532535
**kwargs
533536
):
534537
"""Perform HDBSCAN clustering from a vector array or distance matrix.
@@ -555,7 +558,7 @@ def hdbscan(
555558
See [3]_ for more information. Note that this should not be used
556559
if we want to predict the cluster labels for new points in future
557560
(e.g. using approximate_predict), as the approximate_predict function
558-
is not aware of this argument.
561+
is not aware of this argument. This is the minimum epsilon allowed.
559562
560563
alpha : float, optional (default=1.0)
561564
A distance scaling parameter as used in robust single linkage.
@@ -641,6 +644,16 @@ def hdbscan(
641644
performance cost, ensure that the clustering results match the
642645
reference implementation.
643646
647+
cluster_selection_epsilon_max: float, optional (default=inf)
648+
A distance threshold. Clusters above this value will be split.
649+
Has no effect when using leaf clustering (where clusters are
650+
usually small regardless) and can also be overridden in rare
651+
cases by a high value for cluster_selection_epsilon. Note that
652+
this should not be used if we want to predict the cluster labels
653+
for new points in future (e.g. using approximate_predict), as
654+
the approximate_predict function is not aware of this argument.
655+
This is the maximum epsilon allowed.
656+
644657
**kwargs : optional
645658
Arguments passed to the distance metric
646659
@@ -722,6 +735,9 @@ def hdbscan(
722735
"Minkowski metric with negative p value is not" " defined!"
723736
)
724737

738+
if cluster_selection_epsilon_max < cluster_selection_epsilon:
739+
raise ValueError("Cluster selection epsilon max must be greater than epsilon!")
740+
725741
if match_reference_implementation:
726742
min_samples = min_samples - 1
727743
min_cluster_size = min_cluster_size + 1
@@ -891,6 +907,7 @@ def hdbscan(
891907
match_reference_implementation,
892908
cluster_selection_epsilon,
893909
max_cluster_size,
910+
cluster_selection_epsilon_max,
894911
)
895912
+ (result_min_span_tree,)
896913
)
@@ -934,6 +951,7 @@ class HDBSCAN(BaseEstimator, ClusterMixin):
934951
935952
cluster_selection_epsilon: float, optional (default=0.0)
936953
A distance threshold. Clusters below this value will be merged.
954+
This is the minimum epsilon allowed.
937955
See [5]_ for more information.
938956
939957
algorithm : string, optional (default='best')
@@ -1010,6 +1028,16 @@ class HDBSCAN(BaseEstimator, ClusterMixin):
10101028
performance cost, ensure that the clustering results match the
10111029
reference implementation.
10121030
1031+
cluster_selection_epsilon_max: float, optional (default=inf)
1032+
A distance threshold. Clusters above this value will be split.
1033+
Has no effect when using leaf clustering (where clusters are
1034+
usually small regardless) and can also be overridden in rare
1035+
cases by a high value for cluster_selection_epsilon. Note that
1036+
this should not be used if we want to predict the cluster labels
1037+
for new points in future (e.g. using approximate_predict), as
1038+
the approximate_predict function is not aware of this argument.
1039+
This is the maximum epsilon allowed.
1040+
10131041
**kwargs : optional
10141042
Arguments passed to the distance metric
10151043
@@ -1127,6 +1155,7 @@ def __init__(
11271155
prediction_data=False,
11281156
branch_detection_data=False,
11291157
match_reference_implementation=False,
1158+
cluster_selection_epsilon_max=float('inf'),
11301159
**kwargs
11311160
):
11321161
self.min_cluster_size = min_cluster_size
@@ -1147,6 +1176,7 @@ def __init__(
11471176
self.match_reference_implementation = match_reference_implementation
11481177
self.prediction_data = prediction_data
11491178
self.branch_detection_data = branch_detection_data
1179+
self.cluster_selection_epsilon_max = cluster_selection_epsilon_max
11501180

11511181
self._metric_kwargs = kwargs
11521182

@@ -1296,7 +1326,7 @@ def generate_prediction_data(self):
12961326
def generate_branch_detection_data(self):
12971327
"""
12981328
Create data that caches intermediate results used for detecting
1299-
branches within clusters. This data is only useful if you are
1329+
branches within clusters. This data is only useful if you are
13001330
intending to use functions from ``hdbscan.branches``.
13011331
"""
13021332
if self.metric in FAST_METRICS:

hdbscan/tests/test_hdbscan.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -581,6 +581,7 @@ def test_hdbscan_badargs():
581581
assert_raises(Exception, hdbscan, X, algorithm="something_else")
582582
assert_raises(TypeError, hdbscan, X, metric="minkowski", p=None)
583583
assert_raises(ValueError, hdbscan, X, leaf_size=0)
584+
assert_raises(ValueError, hdbscan, X, cluster_selection_epsilon_max=-1)
584585

585586

586587
def test_hdbscan_sparse():
@@ -648,6 +649,47 @@ def test_hdbscan_allow_single_cluster_with_epsilon():
648649
assert counts[unique_labels == -1] == 2
649650

650651

652+
def test_hdbscan_cluster_selection_epsilon_max():
653+
"""Test that reducing the cluster_selection_epsilon_max parameter
654+
results in more clusters with smaller sizes being found."""
655+
blobs, _ = make_blobs(n_samples=50,
656+
centers=[(1, 0), (-1, 0), (-1, 1), (1, 1)],
657+
cluster_std=0.2,
658+
random_state=42)
659+
660+
clusterer = HDBSCAN(cluster_selection_epsilon_max=2.0,
661+
allow_single_cluster=True)
662+
clusterer.fit(blobs)
663+
664+
assert_array_equal(np.unique(clusterer.labels_), np.array([0, 1]))
665+
666+
clusterer = HDBSCAN(cluster_selection_epsilon_max=1.0,
667+
allow_single_cluster=True)
668+
clusterer.fit(blobs)
669+
670+
assert_array_equal(np.unique(clusterer.labels_), np.array([-1, 0, 1, 2, 3]))
671+
672+
673+
def test_hdbscan_parameters_do_not_trigger_errors():
674+
blobs, _ = make_blobs(n_samples=50,
675+
centers=[(1, 0), (-1, 0), (-1, 1), (1, 1)],
676+
cluster_std=0.2,
677+
random_state=42)
678+
clusterer = HDBSCAN(max_cluster_size=10,
679+
allow_single_cluster=True)
680+
681+
# If the following line does not raise an error, the test passes
682+
clusterer.fit(blobs)
683+
assert True
684+
685+
clusterer = HDBSCAN(cluster_selection_epsilon_max=0.41,
686+
cluster_selection_epsilon=0.4,
687+
allow_single_cluster=True)
688+
689+
# If the following line does not raise an error, the test passes
690+
clusterer.fit(blobs)
691+
assert True
692+
651693
# Disable for now -- need to refactor to meet newer standards
652694
@pytest.mark.skip(reason="need to refactor to meet newer standards")
653695
def test_hdbscan_is_sklearn_estimator():

0 commit comments

Comments
 (0)