 # Thierry Guillemot <[email protected]>
 # Gregory Stupp <[email protected]>
 # Joel Nothman <[email protected]>
+# Arya McCarthy <[email protected]>
 # License: BSD 3 clause

 from __future__ import division

 from math import log
+import warnings

 import numpy as np
 from scipy import sparse as sp
@@ -59,6 +61,21 @@ def check_clusterings(labels_true, labels_pred):
     return labels_true, labels_pred


+def _generalized_average(U, V, average_method):
+    """Return a particular mean of two numbers."""
+    if average_method == "min":
+        return min(U, V)
+    elif average_method == "geometric":
+        return np.sqrt(U * V)
+    elif average_method == "arithmetic":
+        return np.mean([U, V])
+    elif average_method == "max":
+        return max(U, V)
+    else:
+        raise ValueError("'average_method' must be 'min', 'geometric', "
+                         "'arithmetic', or 'max'")
+
+
 def contingency_matrix(labels_true, labels_pred, eps=None, sparse=False):
     """Build a contingency matrix describing the relationship between labels.

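As a quick illustration of the new helper's four averaging modes (the import path below is an assumption; `_generalized_average` is a module-private helper, not public API):

>>> from sklearn.metrics.cluster.supervised import _generalized_average  # assumed location
>>> _generalized_average(1.0, 4.0, 'min')         # -> 1.0
>>> _generalized_average(1.0, 4.0, 'geometric')   # -> 2.0, i.e. sqrt(1.0 * 4.0)
>>> _generalized_average(1.0, 4.0, 'arithmetic')  # -> 2.5
>>> _generalized_average(1.0, 4.0, 'max')         # -> 4.0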
@@ -245,7 +262,9 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred):

     V-Measure is furthermore symmetric: swapping ``labels_true`` and
     ``labels_pred`` will give the same score. This does not hold for
-    homogeneity and completeness.
+    homogeneity and completeness. V-Measure is identical to
+    :func:`normalized_mutual_info_score` with the arithmetic averaging
+    method.

     Read more in the :ref:`User Guide <homogeneity_completeness>`.

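A small check of the identity claimed above, on a toy labelling (the pairing is the standard example used elsewhere in this module's docstrings; by symmetry the argument order does not matter):

>>> from sklearn.metrics import v_measure_score, normalized_mutual_info_score
>>> a, b = [0, 0, 1, 1], [0, 0, 1, 2]
>>> round(v_measure_score(a, b), 6)
0.8
>>> round(normalized_mutual_info_score(a, b, average_method='arithmetic'), 6)
0.8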
@@ -444,7 +463,8 @@ def completeness_score(labels_true, labels_pred):
 def v_measure_score(labels_true, labels_pred):
     """V-measure cluster labeling given a ground truth.

-    This score is identical to :func:`normalized_mutual_info_score`.
+    This score is identical to :func:`normalized_mutual_info_score` with
+    the ``'arithmetic'`` option for averaging.

     The V-measure is the harmonic mean between homogeneity and completeness::

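For reference, the harmonic-mean relation can be reproduced from the two component scores directly (a sketch reusing the toy labelling above):

>>> from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score
>>> h = homogeneity_score([0, 0, 1, 2], [0, 0, 1, 1])
>>> c = completeness_score([0, 0, 1, 2], [0, 0, 1, 1])
>>> round(2 * h * c / (h + c), 6) == round(v_measure_score([0, 0, 1, 2], [0, 0, 1, 1]), 6)
True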
@@ -459,6 +479,7 @@ def v_measure_score(labels_true, labels_pred):
     measure the agreement of two independent label assignment strategies
     on the same dataset when the real ground truth is not known.

+
     Read more in the :ref:`User Guide <homogeneity_completeness>`.

     Parameters
@@ -485,6 +506,7 @@ def v_measure_score(labels_true, labels_pred):
     --------
     homogeneity_score
     completeness_score
+    normalized_mutual_info_score

     Examples
     --------
@@ -617,7 +639,8 @@ def mutual_info_score(labels_true, labels_pred, contingency=None):
     return mi.sum()


-def adjusted_mutual_info_score(labels_true, labels_pred):
+def adjusted_mutual_info_score(labels_true, labels_pred,
+                               average_method='warn'):
     """Adjusted Mutual Information between two clusterings.

     Adjusted Mutual Information (AMI) is an adjustment of the Mutual
@@ -626,7 +649,7 @@ def adjusted_mutual_info_score(labels_true, labels_pred):
     clusters, regardless of whether there is actually more information shared.
     For two clusterings :math:`U` and :math:`V`, the AMI is given as::

-        AMI(U, V) = [MI(U, V) - E(MI(U, V))] / [max(H(U), H(V)) - E(MI(U, V))]
+        AMI(U, V) = [MI(U, V) - E(MI(U, V))] / [avg(H(U), H(V)) - E(MI(U, V))]

     This metric is independent of the absolute values of the labels:
     a permutation of the class or cluster label values won't change the
@@ -650,9 +673,17 @@ def adjusted_mutual_info_score(labels_true, labels_pred):
     labels_pred : array, shape = [n_samples]
         A clustering of the data into disjoint subsets.

+    average_method : string, optional (default: 'warn')
+        How to compute the normalizer in the denominator. Possible options
+        are 'min', 'geometric', 'arithmetic', and 'max'.
+        If 'warn', 'max' will be used. The default will change to
+        'arithmetic' in version 0.22.
+
+        .. versionadded:: 0.20
+
     Returns
     -------
-    ami: float(upperlimited by 1.0)
+    ami: float (upper-limited by 1.0)
        The AMI returns a value of 1 when the two partitions are identical
        (i.e. perfectly matched). Random partitions (independent labellings)
        have an expected AMI around 0 on average, hence can be negative.
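A usage note: passing ``average_method`` explicitly also bypasses the 'warn' sentinel handled below, so no FutureWarning is emitted. The values in the comments are approximate:

>>> from sklearn.metrics import adjusted_mutual_info_score
>>> adjusted_mutual_info_score([0, 0, 1, 1], [0, 0, 1, 1], average_method='arithmetic')  # -> 1.0, perfect match
>>> adjusted_mutual_info_score([0, 0, 1, 1], [0, 1, 2, 3], average_method='arithmetic')  # -> ~0.0, chance level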
@@ -691,6 +722,12 @@ def adjusted_mutual_info_score(labels_true, labels_pred):
        <https://en.wikipedia.org/wiki/Adjusted_Mutual_Information>`_

     """
+    if average_method == 'warn':
+        warnings.warn("The behavior of AMI will change in version 0.22. "
+                      "To match the behavior of 'v_measure_score', AMI will "
+                      "use average_method='arithmetic' by default.",
+                      FutureWarning)
+        average_method = 'max'
     labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
     n_samples = labels_true.shape[0]
     classes = np.unique(labels_true)
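Because the default is now the 'warn' sentinel, calling AMI without an explicit ``average_method`` emits a FutureWarning. A small sketch to observe it (only the warning class is checked, not the exact message):

>>> import warnings
>>> from sklearn.metrics import adjusted_mutual_info_score
>>> with warnings.catch_warnings(record=True) as caught:
...     warnings.simplefilter('always')
...     _ = adjusted_mutual_info_score([0, 0, 1, 1], [0, 0, 1, 1])
...     print(any(issubclass(w.category, FutureWarning) for w in caught))
True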
@@ -709,17 +746,29 @@ def adjusted_mutual_info_score(labels_true, labels_pred):
     emi = expected_mutual_information(contingency, n_samples)
     # Calculate entropy for each labeling
     h_true, h_pred = entropy(labels_true), entropy(labels_pred)
-    ami = (mi - emi) / (max(h_true, h_pred) - emi)
+    normalizer = _generalized_average(h_true, h_pred, average_method)
+    denominator = normalizer - emi
+    # Avoid 0.0 / 0.0 when expectation equals maximum, i.e. a perfect match.
+    # normalizer should always be >= emi, but because of floating-point
+    # representation, sometimes emi is slightly larger. Correct this
+    # by preserving the sign.
+    if denominator < 0:
+        denominator = min(denominator, -np.finfo('float64').eps)
+    else:
+        denominator = max(denominator, np.finfo('float64').eps)
+    ami = (mi - emi) / denominator
     return ami


-def normalized_mutual_info_score(labels_true, labels_pred):
+def normalized_mutual_info_score(labels_true, labels_pred,
+                                 average_method='warn'):
     """Normalized Mutual Information between two clusterings.

     Normalized Mutual Information (NMI) is a normalization of the Mutual
     Information (MI) score to scale the results between 0 (no mutual
     information) and 1 (perfect correlation). In this function, mutual
-    information is normalized by ``sqrt(H(labels_true) * H(labels_pred))``.
+    information is normalized by some generalized mean of ``H(labels_true)``
+    and ``H(labels_pred)``, defined by the ``average_method``.

     This measure is not adjusted for chance. Therefore
     :func:`adjusted_mutual_info_score` might be preferred.
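To make the sign-preserving clamp in adjusted_mutual_info_score above concrete, here is a standalone sketch of the same guard (the name _clamp is hypothetical and not part of the patch):

>>> import numpy as np
>>> def _clamp(denominator, eps=np.finfo('float64').eps):
...     # keep the denominator's sign, but bound its magnitude away from zero
...     return min(denominator, -eps) if denominator < 0 else max(denominator, eps)
>>> _clamp(0.0) == np.finfo('float64').eps       # an exact tie becomes a tiny positive number
True
>>> _clamp(-1e-300) == -np.finfo('float64').eps  # tiny negative float noise keeps its sign
True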
@@ -743,13 +792,22 @@ def normalized_mutual_info_score(labels_true, labels_pred):
     labels_pred : array, shape = [n_samples]
         A clustering of the data into disjoint subsets.

+    average_method : string, optional (default: 'warn')
+        How to compute the normalizer in the denominator. Possible options
+        are 'min', 'geometric', 'arithmetic', and 'max'.
+        If 'warn', 'geometric' will be used. The default will change to
+        'arithmetic' in version 0.22.
+
+        .. versionadded:: 0.20
+
     Returns
     -------
     nmi : float
        Score between 0.0 and 1.0. 1.0 stands for perfectly complete labeling

     See also
     --------
+    v_measure_score: V-Measure (NMI with arithmetic mean option)
     adjusted_rand_score: Adjusted Rand Index
     adjusted_mutual_info_score: Adjusted Mutual Information (adjusted
                                 against chance)
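A hedged illustration of how the averaging choice moves the score (values in comments are approximate; the 'geometric' call reproduces the pre-0.22 default):

>>> from sklearn.metrics import normalized_mutual_info_score
>>> a, b = [0, 0, 1, 1], [0, 0, 1, 2]
>>> normalized_mutual_info_score(a, b, average_method='geometric')   # ~0.8165, MI / sqrt(H(a) * H(b))
>>> normalized_mutual_info_score(a, b, average_method='arithmetic')  # ~0.8, same as v_measure_score(a, b)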
@@ -773,6 +831,12 @@ def normalized_mutual_info_score(labels_true, labels_pred):
     0.0

     """
+    if average_method == 'warn':
+        warnings.warn("The behavior of NMI will change in version 0.22. "
+                      "To match the behavior of 'v_measure_score', NMI will "
+                      "use average_method='arithmetic' by default.",
+                      FutureWarning)
+        average_method = 'geometric'
     labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
     classes = np.unique(labels_true)
     clusters = np.unique(labels_pred)
@@ -789,7 +853,10 @@ def normalized_mutual_info_score(labels_true, labels_pred):
     # Calculate the expected value for the mutual information
     # Calculate entropy for each labeling
     h_true, h_pred = entropy(labels_true), entropy(labels_pred)
-    nmi = mi / max(np.sqrt(h_true * h_pred), 1e-10)
+    normalizer = _generalized_average(h_true, h_pred, average_method)
+    # Avoid 0.0 / 0.0 when either entropy is zero.
+    normalizer = max(normalizer, np.finfo('float64').eps)
+    nmi = mi / normalizer
     return nmi

