#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: oesteban
# @Date: 2015-11-19 16:44:27
"""
Cross-validation helper
^^^^^^^^^^^^^^^^^^^^^^^
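
A minimal usage sketch (the CSV paths are placeholders; ``CVHelper``
is defined below)::

    cvhelper = CVHelper(X='x_train.csv', Y='y_train.csv', model='rfc')
    cvhelper.fit()
    cvhelper.setXtest('x_test.csv', 'y_test.csv')
    scores = cvhelper.evaluate(scoring=['accuracy', 'roc_auc'])
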
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import os
from datetime import datetime
import numpy as np
import pandas as pd
import re
from pkg_resources import resource_filename as pkgrf
# sklearn overrides
from .sklearn import preprocessing as mcsp
from .sklearn._split import (RobustLeavePGroupsOut as LeavePGroupsOut,
RepeatedBalancedKFold, RepeatedPartiallyHeldOutKFold)
from .sklearn._validation import cross_val_score, permutation_test_score
# sklearn module
from sklearn import metrics as slm
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics.scorer import check_scoring
from sklearn.model_selection import (RepeatedStratifiedKFold, GridSearchCV, RandomizedSearchCV,
PredefinedSplit)
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.svm import SVC, LinearSVC
from sklearn.multiclass import OneVsRestClassifier
# xgboost
from xgboost import XGBClassifier
from .. import __version__, logging
from .data import read_dataset, get_bids_cols
from ..viz.misc import plot_roc_curve
from builtins import object
LOG = logging.getLogger('mriqc.classifier')
LOG.setLevel(logging.INFO)
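
# IQMs whose values are normalized site-wise by the BatchRobustScaler
# step of the pipeline built in CVHelper.fit()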
FEATURE_NORM = [
'cjv', 'cnr', 'efc', 'fber', 'fwhm_avg', 'fwhm_x', 'fwhm_y', 'fwhm_z',
'snr_csf', 'snr_gm', 'snr_total', 'snr_wm', 'snrd_csf', 'snrd_gm', 'snrd_total', 'snrd_wm',
'summary_csf_mad', 'summary_csf_mean', 'summary_csf_median',
'summary_csf_p05', 'summary_csf_p95', 'summary_csf_stdv',
'summary_gm_k', 'summary_gm_mad', 'summary_gm_mean', 'summary_gm_median',
'summary_gm_p05', 'summary_gm_p95', 'summary_gm_stdv',
'summary_wm_k', 'summary_wm_mad', 'summary_wm_mean', 'summary_wm_median',
'summary_wm_p05', 'summary_wm_p95', 'summary_wm_stdv'
]
class CVHelperBase(object):
"""
A base helper to build cross-validation schemes
"""
def __init__(self, X, Y, param_file=None, n_jobs=-1, site_label='site',
rate_label=None, rate_selection='random',
scorer='roc_auc', multiclass=False, verbosity=0, debug=False):
# Initialize some values
self._param_file = param_file
self.n_jobs = n_jobs
self._rate_column = rate_label
self._site_column = site_label
self._multiclass = multiclass
self._debug = debug
if rate_label is None:
rate_label = ['rater_1', 'rater_2']
self._rate_column = rate_label[0]
self._Xtrain, self._ftnames = read_dataset(
X, Y, rate_label=rate_label, rate_selection=rate_selection,
binarize=not self._multiclass)
self.sites = list(set(self._Xtrain[site_label].values.ravel()))
self._scorer = scorer
self._balanced_leaveout = True
self._verbosity = verbosity
@property
def ftnames(self):
return self._ftnames
@property
def rate_column(self):
return self._rate_column
    def fit(self):
        raise NotImplementedError

    def predict_dataset(self, data, thres=0.5, save_pred=False, site=None):
        raise NotImplementedError

    def predict(self, X, thres=0.5, return_proba=True):
        raise NotImplementedError
class CVHelper(CVHelperBase):
def __init__(self, X=None, Y=None, load_clf=None, param_file=None, n_jobs=-1,
site_label='site', rate_label=None, scorer='roc_auc',
b_leaveout=False, multiclass=False, verbosity=0, split='kfold',
debug=False, model='rfc', basename=None, nested_cv=False,
nested_cv_kfold=False, permutation_test=0):
if (X is None or Y is None) and load_clf is None:
            raise RuntimeError('Either load_clf or both X and Y must be supplied')
self._estimator = None
self._Xtest = None
self._pickled = False
self._batch_effect = None
self._split = split
self._leaveout = b_leaveout
self._model = model
self._base_name = basename
self._nestedcv = nested_cv
self._nestedcv_kfold = nested_cv_kfold
self._permutation_test = permutation_test
if load_clf is not None:
self.n_jobs = n_jobs
self.load(load_clf)
self._rate_column = rate_label[0]
self._multiclass = multiclass
self._base_name = basename[:24]
else:
super(CVHelper, self).__init__(
X, Y, param_file=param_file, n_jobs=n_jobs,
site_label=site_label, rate_label=rate_label, scorer=scorer,
multiclass=multiclass, verbosity=verbosity, debug=debug)
@property
def estimator(self):
return self._estimator
@property
def Xtest(self):
return self._Xtest
    def setXtest(self, X, Y):
self._Xtest, _ = read_dataset(X, Y, rate_label=self._rate_column,
binarize=not self._multiclass)
if 'site' not in self._Xtest.columns.ravel().tolist():
self._Xtest['site'] = ['TestSite'] * len(self._Xtest)
def _gen_fname(self, suffix=None, ext=None):
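        # Builds an output file name from the base name; e.g., with
        # basename 'mclf', _gen_fname(suffix='cvres', ext='npz') returns
        # 'mclf_cvres.npz'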
if ext is None:
ext = ''
if suffix is None:
suffix = ''
if not ext.startswith('.'):
ext = '.' + ext
if not suffix.startswith('_'):
suffix = '_' + suffix
return self._base_name + suffix + ext
def _get_model(self):
if self._model == 'xgb':
return XGBClassifier()
if self._model == 'svc_rbf':
return SVC()
if self._model == 'svc_lin':
return LinearSVC()
return RFC()
    def fit(self):
"""
Fits the cross-validation helper
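
        The pipeline chains site-wise robust scaling, column selection,
        two feature-selection steps, and the classifier. Hyperparameters
        are searched with ``RandomizedSearchCV`` when nested CV is
        requested, and with ``GridSearchCV`` otherwise.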
"""
if self._pickled:
            LOG.info('Classifier was loaded from file; skipping fitting.')
return
if self._leaveout:
raise NotImplementedError
LOG.info('CV [Setting up pipeline] - scorer: %s', self._scorer)
feat_sel = self._ftnames + ['site']
steps = [
('std', mcsp.BatchRobustScaler(
by='site', columns=[ft for ft in self._ftnames if ft in FEATURE_NORM])),
('sel_cols', mcsp.PandasAdaptor(columns=self._ftnames + ['site'])),
('ft_sites', mcsp.SiteCorrelationSelector()),
('ft_noise', mcsp.CustFsNoiseWinnow()),
(self._model, self._get_model())
]
if self._multiclass:
# If multiclass: binarize labels and wrap classifier
steps.insert(3, ('bin', LabelBinarizer()))
steps[-1] = (steps[-1][0], OneVsRestClassifier(steps[-1][1]))
pipe = Pipeline(steps)
# Prepare data splits for CV
fit_args = {}
if self._split == 'kfold':
kf_params = {} if not self._debug else {'n_splits': 2, 'n_repeats': 1}
splits = RepeatedStratifiedKFold(**kf_params)
elif self._split == 'loso':
splits = LeavePGroupsOut(n_groups=1)
elif self._split == 'balanced-kfold':
kf_params = {'n_splits': 10, 'n_repeats': 3}
if self._debug:
kf_params = {'n_splits': 3, 'n_repeats': 1}
splits = RepeatedBalancedKFold(**kf_params)
elif self._split == 'batch':
# Get test label
test_site = list(set(self._Xtest.site.values.ravel().tolist()))[0]
# Merge test and train
self._Xtrain = pd.concat((self._Xtrain, self._Xtest), axis=0)
test_mask = self._Xtrain.site.values.ravel() == test_site
kf_params = {'n_splits': 5, 'n_repeats': 1}
if self._debug:
kf_params = {'n_splits': 3, 'n_repeats': 1}
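            # rows belonging to the held-out site get group label 1,
            # all the other rows get 0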
kf_params['groups'] = test_mask.astype(int).tolist()
splits = RepeatedPartiallyHeldOutKFold(**kf_params)
train_y = self._Xtrain[[self._rate_column]].values.ravel().tolist()
        # The search object is assigned in both branches below:
        # RandomizedSearchCV on the nested-CV path, GridSearchCV otherwise.
if self._nestedcv or self._nestedcv_kfold:
outer_cv = LeavePGroupsOut(n_groups=1)
if self._nestedcv_kfold:
outer_cv = RepeatedStratifiedKFold(n_repeats=1, n_splits=10)
n_iter = 32 if self._model in ['svc_lin', 'xgb'] else 50
grid = RandomizedSearchCV(
pipe, self._get_params_dist(),
n_iter=n_iter if not self._debug else 1,
error_score=0.5,
refit=True,
scoring=check_scoring(pipe, scoring=self._scorer),
n_jobs=self.n_jobs,
cv=splits,
verbose=self._verbosity)
nested_score, group_order = cross_val_score(
grid,
X=self._Xtrain,
y=train_y,
cv=outer_cv,
scoring=['roc_auc', 'accuracy'],
)
nested_means = np.average(nested_score, axis=0)
nested_std = np.std(nested_score, axis=0)
LOG.info('Nested CV [avg] %s=%.3f (+/-%.3f), accuracy=%.3f (+/-%.3f)', self._scorer,
nested_means[0], nested_std[0], nested_means[1], nested_std[1])
LOG.info('Nested CV %s=%s.', self._scorer,
', '.join('%.3f' % v for v in nested_score[:, 0].tolist()))
LOG.info('Nested CV accuracy=%s.',
', '.join('%.3f' % v for v in nested_score[:, 1].tolist()))
LOG.info('Nested CV groups=%s', group_order)
else:
grid = GridSearchCV(
pipe, self._get_params(),
error_score=0.5,
refit=True,
scoring=check_scoring(pipe, scoring=self._scorer),
n_jobs=self.n_jobs,
cv=splits,
verbose=self._verbosity)
grid.fit(self._Xtrain, train_y, **fit_args)
np.savez(os.path.abspath(self._gen_fname(suffix='cvres', ext='npz')),
cv_results=grid.cv_results_)
best_pos = np.argmin(grid.cv_results_['rank_test_score'])
# Save estimator and get its parameters
self._estimator = grid.best_estimator_
cvparams = self._estimator.get_params()
LOG.info('CV [Best model] %s=%s, mean=%.3f, std=%.3f.',
self._scorer, grid.best_score_,
grid.cv_results_['mean_test_score'][best_pos],
grid.cv_results_['std_test_score'][best_pos],
)
LOG.log(18, 'CV [Best model] parameters\n%s', cvparams)
if cvparams.get(self._model + '__oob_score', False):
LOG.info('CV [Best model] OOB %s=%.3f', self._scorer,
self._estimator.named_steps[self._model].oob_score_)
# Report preprocessing selections
prep_msg = ' * Robust scaling (centering): %s.\n' % (
'enabled' if cvparams['std__with_centering'] else 'disabled')
prep_msg += ' * Robust scaling (scaling): %s.\n' % (
'enabled' if cvparams['std__with_scaling'] else 'disabled')
prep_msg += ' * SiteCorrelation feature selection: %s.\n' % (
'disabled' if cvparams['ft_sites__disable'] else 'enabled')
prep_msg += ' * Winnow feature selection: %s.\n' % (
'disabled' if cvparams['ft_noise__disable'] else 'enabled')
selected = np.array(feat_sel).copy()
if not cvparams['ft_sites__disable']:
sitesmask = self._estimator.named_steps['ft_sites'].mask_
selected = self._Xtrain[feat_sel].columns.ravel()[sitesmask]
if not cvparams['ft_noise__disable']:
winnowmask = self._estimator.named_steps['ft_noise'].mask_
selected = selected[winnowmask]
selected = selected.tolist()
if 'site' in selected:
selected.remove('site')
LOG.info('CV [Preprocessing]:\n%s * Features selected: %s.',
prep_msg, ', '.join(['"%s"' % f for f in selected]))
        # If leaveout, test and refit. Note this branch is currently
        # unreachable: when self._leaveout is set, the NotImplementedError
        # raised at the top of this method is hit first.
        if self._leaveout:
            self._fit_leaveout(leaveout_x, leaveout_y)
return self
def _fit_leaveout(self, leaveout_x, leaveout_y):
target_names = ['accept', 'exclude']
if self._multiclass:
target_names = ['exclude', 'doubtful', 'accept']
LOG.info('Testing on left-out, balanced subset ...')
# Predict
_, pred_y = self.predict(leaveout_x)
LOG.info('Classification report:\n%s',
slm.classification_report(leaveout_y, pred_y,
target_names=target_names))
score = self._score(leaveout_x, leaveout_y)
LOG.info('Performance on balanced left-out (%s=%f)', self._scorer, score)
# Rewrite clf
LOG.info('Fitting full model (train + balanced left-out) ...')
        # Refitting on the extended sample may change the site-wise robust
        # normalization, so the full pipeline is refit
# self._estimator.rfc__warm_start = True
test_yall = self._Xtrain[[self._rate_column]].values.ravel().tolist()
if self._multiclass:
test_yall = LabelBinarizer().fit_transform(test_yall)
self._estimator = self._estimator.fit(self._Xtrain, test_yall)
LOG.info('Testing on left-out with full model, balanced subset ...')
_, pred_y = self.predict(leaveout_x)
LOG.info('Classification report:\n%s',
slm.classification_report(leaveout_y, pred_y,
target_names=target_names))
score = self._score(leaveout_x, leaveout_y)
LOG.info('Performance on balanced left-out (%s=%f)', self._scorer, score)
    def fit_full(self):
"""
Completes the training of the model with the examples
from the left-out dataset
"""
if self._estimator is None:
raise RuntimeError('Model should be fit first')
target_names = ["accept", "exclude"]
X = pd.concat([self._Xtrain, self._Xtest], axis=0)
labels_y = X[[self._rate_column]].values.ravel().tolist()
if self._multiclass:
labels_y = LabelBinarizer().fit_transform(labels_y)
target_names = ["exclude", "doubtful", "accept"]
LOG.info('Fitting full model ...')
self._estimator = self._estimator.fit(X, labels_y)
LOG.info('Testing on left-out with full model')
pred_y = self._estimator.predict(X)
LOG.info('Classification report:\n%s',
slm.classification_report(labels_y, pred_y,
target_names=target_names))
score = self._score(X, labels_y)
LOG.info('Full model performance on left-out (%s=%f)', self._scorer, score)
    def evaluate(self, scoring=None, matrix=False, save_roc=False,
                 save_pred=False):
"""
Evaluate the internal estimator on the test data
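
        A usage sketch (assumes ``setXtest`` has been called first)::

            scores = cvhelper.evaluate(scoring=['accuracy', 'roc_auc'],
                                       matrix=True)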
"""
if scoring is None:
scoring = ['accuracy']
LOG.info('Testing on evaluation (left-out) dataset ...')
test_y = self._Xtest[[self._rate_column]].values.ravel()
target_names = ["accept", "exclude"]
if self._multiclass:
target_names = ["exclude", "doubtful", "accept"]
test_y = LabelBinarizer().fit_transform(test_y)
prob_y, pred_y = self.predict(self._Xtest)
scores = [self._score(self._Xtest, test_y, scoring=s) for s in scoring]
LOG.info('Performance on evaluation set (%s)',
', '.join(['%s=%.3f' % (n, s) for n, s in zip(scoring, scores)]))
pred_totals = np.sum(pred_y, 0).tolist()
if prob_y.shape[1] <= 2:
pred_totals = [len(pred_y) - pred_totals, pred_totals]
LOG.info('Predictions: %s', ' / '.join((
'%d (%s)' % (n, c) for n, c in zip(pred_totals, target_names))))
if matrix:
LOG.info(
'Confusion matrix:\n%s', slm.confusion_matrix(
test_y, pred_y))
LOG.info(
'Classification report:\n%s', slm.classification_report(
test_y, pred_y, target_names=target_names))
if save_pred:
self._save_pred_table(self._Xtest, prob_y, pred_y,
suffix='data-test_pred')
if save_roc:
plot_roc_curve(self._Xtest[[self._rate_column]].values.ravel(), prob_y,
self._gen_fname(suffix='data-test_roc', ext='png'))
# Run a permutation test
if self._permutation_test:
# Merge test and train
concatenated_x = pd.concat((self._Xtrain, self._Xtest), axis=0)
concatenated_y = concatenated_x[[self._rate_column]].values.ravel().tolist()
test_fold = [-1] * len(self._Xtrain) + [0] * len(self._Xtest)
permutation_scores = permutation_test_score(
self._estimator, concatenated_x, concatenated_y,
scoring='accuracy', cv=PredefinedSplit(test_fold),
n_permutations=self._permutation_test, n_jobs=1)
score = scores[scoring.index('accuracy')]
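            # Permutation p-value: fraction of permuted scores at least as
            # good as the observed one, with +1 smoothing so that p > 0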
pvalue = (np.sum(permutation_scores >=
score) + 1.0) / (self._permutation_test + 1)
LOG.info('Permutation test (N=%d) for accuracy score %f (pvalue=%f)',
self._permutation_test, score, pvalue)
return scores
    def predict(self, X, thres=0.5, return_proba=True):
"""
Predict class for X.
        With the default random-forest model, the predicted class of an
        input sample is a vote by the trees in the forest, weighted by
        their probability estimates: the class with the highest mean
        probability estimate across the trees is selected. For any model,
        the positive class is assigned when its probability exceeds
        ``thres``.
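
        For the binary case, the decision reduces to thresholding the
        positive-class probability; a sketch of the operation performed
        below::

            pred = (proba[:, 1] > thres).astype(int)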
"""
if self._model == 'svc_lin':
from sklearn.base import clone
from sklearn.calibration import CalibratedClassifierCV
            clf = CalibratedClassifierCV(clone(self._estimator).set_params(
                **self._estimator.get_params()))
train_y = self._Xtrain[[self._rate_column]].values.ravel().tolist()
self._estimator = clf.fit(self._Xtrain, train_y)
proba = np.array(self._estimator.predict_proba(X))
if proba.shape[1] > 2:
pred = (proba > thres).astype(int)
else:
pred = (proba[:, 1] > thres).astype(int)
if return_proba:
return proba, pred
return pred
    def predict_dataset(self, data, thres=0.5, save_pred=False, site=None):
from .data import read_iqms
_xeval, _, _ = read_iqms(data)
if site is None:
site = 'unseen'
columns = _xeval.columns.ravel().tolist()
if 'site' not in columns:
_xeval['site'] = [site] * len(_xeval)
columns.append('site')
# Classifier is trained with rate_1 as last column
if 'rate_1' not in columns:
_xeval['rate_1'] = [np.nan] * len(_xeval)
columns.append('rate_1')
prob_y, pred_y = self.predict(_xeval[columns])
if save_pred:
self._save_pred_table(_xeval, prob_y, pred_y,
suffix='data-%s_pred' % site)
return pred_y
def _save_pred_table(self, sample, prob_y, pred_y, suffix):
bidts = get_bids_cols(sample)
predf = sample[bidts].copy()
if self._multiclass:
probs = ['proba_%d' % i
for i in list(range(prob_y.shape[1]))]
predf['pred_y'] = (np.argmax(pred_y, axis=1) - 1).astype(int)
for i, col in enumerate(probs):
predf[col] = prob_y[:, i]
cols = probs + ['pred_y']
else:
cols = ['prob_y', 'pred_y']
predf['prob_y'] = prob_y[:, 1]
predf['pred_y'] = pred_y
predf[bidts + cols].to_csv(
self._gen_fname(suffix=suffix, ext='csv'),
index=False)
    def save(self, suffix='estimator', compress=3):
"""
Pickle the estimator, adding the feature names
http://scikit-learn.org/stable/modules/model_persistence.html
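
        For example, with ``basename='mclf'`` the default call writes the
        fitted pipeline to ``mclf_estimator.pklz``::

            cvhelper.save()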
"""
from sklearn.externals.joblib import dump as savepkl
# Store ftnames
setattr(self._estimator, '_ftnames', self._ftnames)
# Store normalization medians
setattr(self._estimator, '_batch_effect', self._batch_effect)
filehandler = os.path.abspath(
self._gen_fname(suffix=suffix, ext='pklz'))
LOG.info('Saving classifier to: %s', filehandler)
savepkl(self._estimator, filehandler, compress=compress)
    def load(self, filehandler):
"""
        Unpickle the estimator, restoring the feature names
http://scikit-learn.org/stable/modules/model_persistence.html
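
        A usage sketch; note that ``rate_label`` and ``basename`` are
        still required when loading (see ``__init__``)::

            cvhelper = CVHelper(load_clf='mclf_estimator.pklz',
                                rate_label=['rater_1'], basename='mclf')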
"""
from sklearn.externals.joblib import load as loadpkl
self._estimator = loadpkl(filehandler)
self._ftnames = getattr(self._estimator, '_ftnames')
self._batch_effect = getattr(self._estimator, '_batch_effect', None)
self._pickled = True
def _score(self, X, y, scoring=None, clf=None):
from sklearn.model_selection._validation import _score
if scoring is None:
scoring = self._scorer
if clf is None:
clf = self._estimator
return _score(clf, X, y, check_scoring(clf, scoring=scoring))
def _get_params(self):
# Some baseline parameters
baseparam = {
'std__by': ['site'],
'std__columns': [[ft for ft in self._ftnames if ft in FEATURE_NORM]],
'sel_cols__columns': [self._ftnames + ['site']],
}
# Load in classifier parameters
clfparams = _load_parameters(
(pkgrf('mriqc', 'data/classifier_settings.yml')
if self._param_file is None else self._param_file)
)
# Read preprocessing parameters
if 'preproc' in clfparams:
preparams = []
for el in clfparams['preproc']:
pcombination = {}
for pref, subel in list(el.items()):
for k, v in list(subel.items()):
pcombination[pref + '__' + k] = v
preparams.append(pcombination)
else:
preparams = [{
'std__with_centering': [True],
'std__with_scaling': [True],
'ft_sites__disable': [False],
'ft_noise__disable': [False],
}]
# Set base parameters
preparams = [{**baseparam, **prep} for prep in preparams]
# Extract this model parameters
prefix = self._model + '__'
if self._multiclass:
prefix += 'estimator__'
modparams = {prefix + k: v for k, v in list(clfparams[self._model][0].items())}
# Merge model parameters + preprocessing
modparams = [{**prep, **modparams} for prep in preparams]
# Evaluate just one model if debug
if self._debug:
            # modparams is a list of dicts at this point; keep only the
            # first value of each parameter
            modparams = [{k: [v[0]] for k, v in list(mpar.items())}
                         for mpar in modparams]
return modparams
def _get_params_dist(self):
preparams = {
'std__by': ['site'],
'std__with_centering': [True, False],
'std__with_scaling': [True, False],
'std__columns': [[ft for ft in self._ftnames if ft in FEATURE_NORM]],
'sel_cols__columns': [self._ftnames + ['site']],
'ft_sites__disable': [False, True],
'ft_noise__disable': [False, True],
}
prefix = self._model + '__'
if self._multiclass:
prefix += 'estimator__'
clfparams = _load_parameters(
(pkgrf('mriqc', 'data/model_selection.yml')
if self._param_file is None else self._param_file)
)
modparams = {prefix + k: v for k, v in list(clfparams[self._model][0].items())}
if self._debug:
preparams = {
'std__by': ['site'],
'std__with_centering': [True],
'std__with_scaling': [True],
'std__columns': [[ft for ft in self._ftnames if ft in FEATURE_NORM]],
'sel_cols__columns': [self._ftnames + ['site']],
'ft_sites__disable': [True],
'ft_noise__disable': [True],
}
modparams = {k: [v[0]] for k, v in list(modparams.items())}
return {**preparams, **modparams}
def _load_parameters(param_file):
"""Load parameters from file"""
import yaml
from io import open
with open(param_file) as paramfile:
        parameters = yaml.safe_load(paramfile)
return parameters