Skip to content

Commit 880aa65

Browse files
committed
Cleaning things up and adding more tests
1 parent febab6d commit 880aa65

File tree

2 files changed

+120
-52
lines changed

2 files changed

+120
-52
lines changed

src/skmatter/decomposition/_pcovc.py

Lines changed: 58 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
# did a search of all classifiers that inherit from MultiOutputMixin - none of them implement
2929
# decision function, so I don't think we need to inherit
3030

31+
3132
class PCovC(LinearClassifierMixin, _BasePCov):
3233
r"""Principal Covariates Classification (PCovC).
3334
@@ -178,16 +179,11 @@ class PCovC(LinearClassifierMixin, _BasePCov):
178179
179180
pxz_ : ndarray of size :math:`({n_{features}, })`, :math:`({n_{features}, n_{classes}})`
180181
the projector, or weights, from the input space :math:`\mathbf{X}`
181-
to the class confidence scores :math:`\mathbf{Z}`. In the multioutput case,
182-
has shape , :math:`({n_{features}, n_{classes}*n_{outputs}})`, a flattened form
183-
of a 3D tensor.
182+
to the class confidence scores :math:`\mathbf{Z}`.
184183
185-
ptz_ : ndarray of size :math:`({n_{components}, })`, :math:`({n_{components}, n_{classes}})` \
186-
or :math:`({n_{components}, n_{classes}*n_{outputs}})`
187-
the projector, or weights, from the latent-space projection
188-
:math:`\mathbf{T}` to the class confidence scores :math:`\mathbf{Z}`.
189-
In the multioutput case, has shape , :math:`({n_{components}, n_{classes}*n_{outputs}})`,
190-
a flattened form of a 3D tensor.
184+
ptz_ : ndarray of size :math:`({n_{components}, })`, :math:`({n_{components}, n_{classes}})`
185+
the projector, or weights, from the latent-space projection
186+
:math:`\mathbf{T}` to the class confidence scores :math:`\mathbf{Z}`.
191187
192188
explained_variance_ : numpy.ndarray of shape (n_components,)
193189
The amount of variance explained by each of the selected components.
@@ -279,7 +275,7 @@ def fit(self, X, Y, W=None):
279275
`` W = np.hstack([est_.coef_.T for est_ in classifier.estimators_])``.
280276
"""
281277
X, Y = validate_data(self, X, Y, multi_output=True, y_numeric=False)
282-
278+
283279
check_classification_targets(Y)
284280
self.classes_ = np.unique(Y)
285281
self.n_outputs = 1 if Y.ndim == 1 else Y.shape[1]
@@ -305,33 +301,51 @@ def fit(self, X, Y, W=None):
305301
"Classifier must be an instance of `"
306302
f"{'`, `'.join(c.__name__ for c in compatible_classifiers)}`"
307303
", or `precomputed`"
308-
)
304+
)
309305

310-
# if self.n_outputs == 1:
311-
# classifier = LogisticRegression()
312-
# else:
313-
# classifier = MultiOutputClassifier(estimator=LogisticRegression())
306+
if self.n_outputs == 1 and isinstance(self.classifier, MultiOutputClassifier):
307+
raise ValueError(
308+
"Classifier cannot be an instance of `MultiOutputClassifier` when Y is 1D"
309+
)
310+
311+
if (
312+
self.n_outputs != 1
313+
and self.classifier not in ["precomputed", None]
314+
and not (
315+
isinstance(self.classifier, MultiOutputClassifier)
316+
or self.classifier == "precomputed"
317+
)
318+
):
319+
raise ValueError(
320+
"Classifier must be an instance of `MultiOutputClassifier` when Y is 2D"
321+
)
314322

315-
# if self.classifier == "precomputed":
316-
323+
if self.n_outputs == 1:
324+
if self.classifier != "precomputed":
325+
classifier = self.classifier or LogisticRegression()
326+
self.z_classifier_ = check_cl_fit(classifier, X, Y)
327+
W = self.z_classifier_.coef_.T
317328

318-
if self.classifier != "precomputed":
319-
if self.classifier is None:
320-
classifier = LogisticRegression()
321329
else:
322-
classifier = self.classifier
323-
324-
self.z_classifier_ = check_cl_fit(classifier, X, Y)
325-
W = self.z_classifier_.coef_.T
330+
# to be used later on as the classifier fit between T and Y
331+
classifier = LogisticRegression()
332+
if W is None:
333+
W = clone(classifier).fit(X, Y).coef_.T
326334

327335
else:
328-
# If precomputed, use default classifier to predict Y from T
329-
classifier = LogisticRegression()
330-
if W is None:
331-
W = LogisticRegression().fit(X, Y).coef_.T
336+
if self.classifier != "precomputed":
337+
classifier = self.classifier or MultiOutputClassifier(
338+
estimator=LogisticRegression()
339+
)
340+
self.z_classifier_ = check_cl_fit(classifier, X, Y)
341+
W = np.hstack([est_.coef_.T for est_ in self.z_classifier_.estimators_])
332342

333-
print(f"X: {X.shape}")
334-
print(f"W: {W.shape}")
343+
else:
344+
# to be used later on as the classifier fit between T and Y
345+
classifier = MultiOutputClassifier(estimator=LogisticRegression())
346+
if W is None:
347+
_ = clone(classifier).fit(X, Y)
348+
W = np.hstack([_.coef_.T for _ in _.estimators_])
335349

336350
Z = X @ W
337351

@@ -344,7 +358,11 @@ def fit(self, X, Y, W=None):
344358
# classifier and steal weights to get pxz and ptz
345359
self.classifier_ = clone(classifier).fit(X @ self.pxt_, Y)
346360

347-
if isinstance(self.classifier_, MultiOutputClassifier):
361+
if self.n_outputs == 1:
362+
self.ptz_ = self.classifier_.coef_.T
363+
# print(self.ptz_.shape)
364+
self.pxz_ = self.pxt_ @ self.ptz_
365+
else:
348366
self.ptz_ = np.hstack(
349367
[est_.coef_.T for est_ in self.classifier_.estimators_]
350368
)
@@ -353,12 +371,7 @@ def fit(self, X, Y, W=None):
353371
self.pxz_ = self.pxt_ @ self.ptz_
354372
# print(f"pxz {self.pxz_.shape}")
355373

356-
else:
357-
self.ptz_ = self.classifier_.coef_.T
358-
# print(self.ptz_.shape)
359-
self.pxz_ = self.pxt_ @ self.ptz_
360-
361-
print(self.ptz_.shape)
374+
# print(self.ptz_.shape)
362375
if len(Y.shape) == 1 and type_of_target(Y) == "binary":
363376
self.pxz_ = self.pxz_.reshape(
364377
X.shape[1],
@@ -460,7 +473,7 @@ def decision_function(self, X=None, T=None):
460473
n_outputs such arrays if n_outputs > 1
461474
Confidence scores. For binary classification, has shape `(n_samples,)`,
462475
for multiclass classification, has shape `(n_samples, n_classes)`. If n_outputs > 1,
463-
the list returned can contain arrays with differing shapes depending on the
476+
the list can contain arrays with differing shapes depending on the
464477
number of classes in each output of Y.
465478
"""
466479
check_is_fitted(self, attributes=["pxz_", "ptz_"])
@@ -471,25 +484,24 @@ def decision_function(self, X=None, T=None):
471484
if X is not None:
472485
X = validate_data(self, X, reset=False)
473486

474-
# this is similar to how MultiOutputClassifier handles predict_proba() if n_outputs > 1
475-
if isinstance(self.classifier_, MultiOutputClassifier):
487+
if self.n_outputs == 1:
488+
# Or self.classifier_.decision_function(X @ self.pxt_)
489+
return X @ self.pxz_ + self.classifier_.intercept_
490+
else:
476491
return [
477492
est_.decision_function(X @ self.pxt_)
478493
for est_ in self.classifier_.estimators_
479494
]
480-
481-
# Or self.classifier_.decision_function(X @ self.pxt_)
482-
return X @ self.pxz_ + self.classifier_.intercept_
483495
else:
484496
T = check_array(T)
485497

486-
if isinstance(self.classifier_, MultiOutputClassifier):
498+
if self.n_outputs == 1:
499+
return T @ self.ptz_ + self.classifier_.intercept_
500+
else:
487501
return [
488502
est_.decision_function(T) for est_ in self.classifier_.estimators_
489503
]
490504

491-
return T @ self.ptz_ + self.classifier_.intercept_
492-
493505
def predict(self, X=None, T=None):
494506
"""Predicts the property labels using classification on T."""
495507
check_is_fitted(self, attributes=["pxz_", "ptz_"])

tests/test_pcovc.py

Lines changed: 62 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33

44
import numpy as np
55
from sklearn import exceptions
6-
from sklearn.datasets import load_breast_cancer as get_dataset
6+
from sklearn.calibration import LinearSVC
7+
from sklearn.datasets import load_iris as get_dataset
78
from sklearn.decomposition import PCA
89
from sklearn.linear_model import LogisticRegression, RidgeClassifier
910
from sklearn.svm import LinearSVC
@@ -98,7 +99,7 @@ def test_simple_prediction(self):
9899
Yp = pcovc.predict(self.X)
99100

100101
self.assertLessEqual(
101-
np.linalg.norm(Yp - Yhat) ** 2.0 / np.linalg.norm(Yp) ** 2.0,
102+
np.linalg.norm(Yp - Yhat) ** 2.0 / np.linalg.norm(Yhat) ** 2.0,
102103
self.error_tol,
103104
)
104105

@@ -580,15 +581,56 @@ def test_incompatible_coef_shape(self):
580581

581582
class PCovCMultiOutputTest(PCovCBaseTest):
582583

583-
def test_projector_shapes(self):
584-
pass
584+
def test_prefit_multioutput(self):
585+
"""Check that PCovC works if a prefit classifier is passed when `n_outputs > 1`."""
586+
classifier = MultiOutputClassifier(estimator=LogisticRegression())
587+
Y_double = np.column_stack((self.Y, self.Y))
585588

586-
def test_decision_function(self):
589+
classifier.fit(self.X, Y_double)
590+
pcovc = self.model(mixing=0.25, classifier=classifier)
591+
pcovc.fit(self.X, Y_double)
592+
593+
W_classifier = np.hstack([est_.coef_.T for est_ in classifier.estimators_])
594+
Z_classifier = self.X @ W_classifier
595+
596+
W_pcovc = np.hstack([est_.coef_.T for est_ in pcovc.z_classifier_.estimators_])
597+
Z_pcovc = self.X @ W_pcovc
598+
599+
self.assertTrue(np.allclose(Z_classifier, Z_pcovc))
600+
self.assertTrue(np.allclose(W_classifier, W_pcovc))
601+
602+
def test_precomputed_multioutput(self):
603+
"""Check that PCovC works if classifier=`precomputed` and `n_outputs > 1`."""
604+
classifier = MultiOutputClassifier(estimator=LogisticRegression())
605+
Y_double = np.column_stack((self.Y, self.Y))
606+
607+
classifier.fit(self.X, Y_double)
608+
W = np.hstack([est_.coef_.T for est_ in classifier.estimators_])
609+
pcovc1 = self.model(mixing=0.5, classifier="precomputed", n_components=1)
610+
pcovc1.fit(self.X, Y_double, W)
611+
t1 = pcovc1.transform(self.X)
612+
613+
pcovc2 = self.model(mixing=0.5, classifier=classifier, n_components=1)
614+
pcovc2.fit(self.X, Y_double)
615+
t2 = pcovc2.transform(self.X)
616+
617+
self.assertTrue(np.linalg.norm(t1 - t2) < self.error_tol)
618+
619+
# Now check for match when W is not passed:
620+
pcovc3 = self.model(mixing=0.5, classifier="precomputed", n_components=1)
621+
pcovc3.fit(self.X, Y_double)
622+
t3 = pcovc3.transform(self.X)
623+
624+
self.assertTrue(np.linalg.norm(t3 - t2) < self.error_tol)
625+
self.assertTrue(np.linalg.norm(t3 - t1) < self.error_tol)
626+
627+
def test_Z_shape_multioutput(self):
628+
"""Check that PCovC returns the evidence Z in the desired form when `n_outputs > 1`."""
587629
pcovc = PCovC(
588630
classifier=MultiOutputClassifier(LogisticRegression()), n_components=2
589631
)
590632

591-
Y_double = np.column_stack((self.Y, self.Y[::-1]))
633+
Y_double = np.column_stack((self.Y, self.Y))
592634
pcovc.fit(self.X, Y_double)
593635

594636
Z = pcovc.decision_function(self.X)
@@ -602,6 +644,20 @@ def test_decision_function(self):
602644
self.assertEqual(self.X.shape[0], z_slice.shape[0])
603645
self.assertEqual(est.coef_.shape[0], z_slice.shape[1])
604646

647+
def test_decision_function_multioutput(self):
648+
"""Check that PCovC's decision_function works in edge cases when `n_outputs > 1`."""
649+
pcovc = self.model(classifier=MultiOutputClassifier(estimator=LinearSVC()))
650+
pcovc.fit(self.X, np.column_stack((self.Y, self.Y)))
651+
with self.assertRaises(ValueError) as cm:
652+
_ = pcovc.decision_function()
653+
self.assertEqual(
654+
str(cm.exception),
655+
"Either X or T must be supplied.",
656+
)
657+
658+
T = pcovc.transform(self.X)
659+
_ = pcovc.decision_function(T=T)
660+
605661

606662
if __name__ == "__main__":
607663
unittest.main(verbosity=2)

0 commit comments

Comments
 (0)