@@ -72,6 +72,7 @@ def _tree_to_labels(
72
72
match_reference_implementation = False ,
73
73
cluster_selection_epsilon = 0.0 ,
74
74
max_cluster_size = 0 ,
75
+ cluster_selection_epsilon_max = float ('inf' ),
75
76
):
76
77
"""Converts a pretrained tree and cluster size into a
77
78
set of labels and probabilities.
@@ -86,6 +87,7 @@ def _tree_to_labels(
86
87
match_reference_implementation ,
87
88
cluster_selection_epsilon ,
88
89
max_cluster_size ,
90
+ cluster_selection_epsilon_max ,
89
91
)
90
92
91
93
return (labels , probabilities , stabilities , condensed_tree , single_linkage_tree )
@@ -529,6 +531,7 @@ def hdbscan(
529
531
cluster_selection_method = "eom" ,
530
532
allow_single_cluster = False ,
531
533
match_reference_implementation = False ,
534
+ cluster_selection_epsilon_max = float ('inf' ),
532
535
** kwargs
533
536
):
534
537
"""Perform HDBSCAN clustering from a vector array or distance matrix.
@@ -555,7 +558,7 @@ def hdbscan(
555
558
See [3]_ for more information. Note that this should not be used
556
559
if we want to predict the cluster labels for new points in future
557
560
(e.g. using approximate_predict), as the approximate_predict function
558
- is not aware of this argument.
561
+ is not aware of this argument. This is the minimum epsilon allowed.
559
562
560
563
alpha : float, optional (default=1.0)
561
564
A distance scaling parameter as used in robust single linkage.
@@ -641,6 +644,16 @@ def hdbscan(
641
644
performance cost, ensure that the clustering results match the
642
645
reference implementation.
643
646
647
+ cluster_selection_epsilon_max : float, optional (default=inf)
648
+ A distance threshold. Clusters above this value will be split.
649
+ Has no effect when using leaf clustering (where clusters are
650
+ usually small regardless) and can also be overridden in rare
651
+ cases by a high value for cluster_selection_epsilon. Note that
652
+ this should not be used if we want to predict the cluster labels
653
+ for new points in future (e.g. using approximate_predict), as
654
+ the approximate_predict function is not aware of this argument.
655
+ This is the maximum epsilon allowed.
656
+
644
657
**kwargs : optional
645
658
Arguments passed to the distance metric
646
659
@@ -722,6 +735,9 @@ def hdbscan(
722
735
"Minkowski metric with negative p value is not" " defined!"
723
736
)
724
737
738
+ if cluster_selection_epsilon_max < cluster_selection_epsilon :
739
+ raise ValueError ("Cluster selection epsilon max must be greater than epsilon!" )
740
+
725
741
if match_reference_implementation :
726
742
min_samples = min_samples - 1
727
743
min_cluster_size = min_cluster_size + 1
@@ -891,6 +907,7 @@ def hdbscan(
891
907
match_reference_implementation ,
892
908
cluster_selection_epsilon ,
893
909
max_cluster_size ,
910
+ cluster_selection_epsilon_max ,
894
911
)
895
912
+ (result_min_span_tree ,)
896
913
)
@@ -934,6 +951,7 @@ class HDBSCAN(BaseEstimator, ClusterMixin):
934
951
935
952
cluster_selection_epsilon: float, optional (default=0.0)
936
953
A distance threshold. Clusters below this value will be merged.
954
+ This is the minimum epsilon allowed.
937
955
See [5]_ for more information.
938
956
939
957
algorithm : string, optional (default='best')
@@ -1010,6 +1028,16 @@ class HDBSCAN(BaseEstimator, ClusterMixin):
1010
1028
performance cost, ensure that the clustering results match the
1011
1029
reference implementation.
1012
1030
1031
+ cluster_selection_epsilon_max : float, optional (default=inf)
1032
+ A distance threshold. Clusters above this value will be split.
1033
+ Has no effect when using leaf clustering (where clusters are
1034
+ usually small regardless) and can also be overridden in rare
1035
+ cases by a high value for cluster_selection_epsilon. Note that
1036
+ this should not be used if we want to predict the cluster labels
1037
+ for new points in future (e.g. using approximate_predict), as
1038
+ the approximate_predict function is not aware of this argument.
1039
+ This is the maximum epsilon allowed.
1040
+
1013
1041
**kwargs : optional
1014
1042
Arguments passed to the distance metric
1015
1043
@@ -1127,6 +1155,7 @@ def __init__(
1127
1155
prediction_data = False ,
1128
1156
branch_detection_data = False ,
1129
1157
match_reference_implementation = False ,
1158
+ cluster_selection_epsilon_max = float ('inf' ),
1130
1159
** kwargs
1131
1160
):
1132
1161
self .min_cluster_size = min_cluster_size
@@ -1147,6 +1176,7 @@ def __init__(
1147
1176
self .match_reference_implementation = match_reference_implementation
1148
1177
self .prediction_data = prediction_data
1149
1178
self .branch_detection_data = branch_detection_data
1179
+ self .cluster_selection_epsilon_max = cluster_selection_epsilon_max
1150
1180
1151
1181
self ._metric_kwargs = kwargs
1152
1182
@@ -1296,7 +1326,7 @@ def generate_prediction_data(self):
1296
1326
def generate_branch_detection_data (self ):
1297
1327
"""
1298
1328
Create data that caches intermediate results used for detecting
1299
- branches within clusters. This data is only useful if you are
1329
+ branches within clusters. This data is only useful if you are
1300
1330
intending to use functions from ``hdbscan.branches``.
1301
1331
"""
1302
1332
if self .metric in FAST_METRICS :
0 commit comments