# License: BSD 3 clause
"""
Provides easy-to-use wrapper around scikit-learn.
:author: Michael Heilman (mheilman@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)
:author: Dan Blanchard (dblanchard@ets.org)
:author: Aoife Cahill (acahill@ets.org)
:organization: ETS
"""
# pylint: disable=F0401,W0622,E1002,E1101
from __future__ import absolute_import, print_function, unicode_literals
import copy
import inspect
import logging
import os
import sys
from collections import Counter, defaultdict
from functools import wraps
from importlib import import_module
from multiprocessing import cpu_count
import joblib
import numpy as np
import scipy.sparse as sp
from six import iteritems, itervalues
from six import string_types
from six.moves import xrange as range
from six.moves import zip
from sklearn.model_selection import (GridSearchCV,
KFold,
LeaveOneGroupOut,
ShuffleSplit,
StratifiedKFold)
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import (AdaBoostClassifier,
AdaBoostRegressor,
GradientBoostingClassifier,
GradientBoostingRegressor,
RandomForestClassifier,
RandomForestRegressor)
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_selection import SelectKBest
from sklearn.utils.multiclass import type_of_target
# AdditiveChi2Sampler is used indirectly, so ignore linting message
from sklearn.kernel_approximation import (AdditiveChi2Sampler,
Nystroem,
RBFSampler,
SkewedChi2Sampler)
from sklearn.linear_model import (BayesianRidge,
ElasticNet,
HuberRegressor,
Lars,
Lasso,
LinearRegression,
LogisticRegression,
RANSACRegressor,
Ridge,
RidgeClassifier,
SGDClassifier,
SGDRegressor,
TheilSenRegressor)
from sklearn.linear_model.base import LinearModel
from sklearn.metrics import (accuracy_score,
confusion_matrix,
precision_recall_fscore_support)
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, SVC, LinearSVR, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils import shuffle as sk_shuffle
from skll.data import FeatureSet
from skll.metrics import _CORRELATION_METRICS, use_score_func
from skll.version import VERSION
# Constants #
_DEFAULT_PARAM_GRIDS = {AdaBoostClassifier:
[{'learning_rate': [0.01, 0.1, 1.0, 10.0, 100.0]}],
AdaBoostRegressor:
[{'learning_rate': [0.01, 0.1, 1.0, 10.0, 100.0]}],
BayesianRidge:
[{'alpha_1': [1e-6, 1e-4, 1e-2, 1, 10],
'alpha_2': [1e-6, 1e-4, 1e-2, 1, 10],
'lambda_1': [1e-6, 1e-4, 1e-2, 1, 10],
'lambda_2': [1e-6, 1e-4, 1e-2, 1, 10]}],
DecisionTreeClassifier:
[{'max_features': ["auto", None]}],
DecisionTreeRegressor:
[{'max_features': ["auto", None]}],
DummyClassifier:
[{}],
DummyRegressor:
[{}],
ElasticNet:
[{'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}],
GradientBoostingClassifier:
[{'max_depth': [1, 3, 5]}],
GradientBoostingRegressor:
[{'max_depth': [1, 3, 5]}],
HuberRegressor:
[{'epsilon': [1.05, 1.35, 1.5, 2.0, 2.5, 5.0],
'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]}],
KNeighborsClassifier:
[{'n_neighbors': [1, 5, 10, 100],
'weights': ['uniform', 'distance']}],
KNeighborsRegressor:
[{'n_neighbors': [1, 5, 10, 100],
'weights': ['uniform', 'distance']}],
MLPClassifier:
[{'activation': ['logistic', 'tanh', 'relu'],
'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1],
'learning_rate_init': [0.001, 0.01, 0.1]}],
MLPRegressor:
[{'activation': ['logistic', 'tanh', 'relu'],
'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1],
'learning_rate_init': [0.001, 0.01, 0.1]}],
MultinomialNB:
[{'alpha': [0.1, 0.25, 0.5, 0.75, 1.0]}],
Lars:
[{}],
Lasso:
[{'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}],
LinearRegression:
[{}],
LinearSVC:
[{'C': [0.01, 0.1, 1.0, 10.0, 100.0]}],
LogisticRegression:
[{'C': [0.01, 0.1, 1.0, 10.0, 100.0]}],
SVC:
[{'C': [0.01, 0.1, 1.0, 10.0, 100.0],
'gamma': ['auto', 0.01, 0.1, 1.0, 10.0, 100.0]}],
RandomForestClassifier:
[{'max_depth': [1, 5, 10, None]}],
RandomForestRegressor:
[{'max_depth': [1, 5, 10, None]}],
RANSACRegressor:
[{}],
Ridge:
[{'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}],
RidgeClassifier:
[{'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}],
SGDClassifier:
[{'alpha': [0.000001, 0.00001, 0.0001, 0.001, 0.01],
'penalty': ['l1', 'l2', 'elasticnet']}],
SGDRegressor:
[{'alpha': [0.000001, 0.00001, 0.0001, 0.001, 0.01],
'penalty': ['l1', 'l2', 'elasticnet']}],
LinearSVR:
[{'C': [0.01, 0.1, 1.0, 10.0, 100.0]}],
SVR:
[{'C': [0.01, 0.1, 1.0, 10.0, 100.0],
'gamma': ['auto', 0.01, 0.1, 1.0, 10.0, 100.0]}],
TheilSenRegressor:
[{}]}
# list of valid grid objective functions for regression and classification
# models depending on type of labels
_BINARY_CLASS_OBJ_FUNCS = frozenset(['unweighted_kappa',
'linear_weighted_kappa',
'quadratic_weighted_kappa',
'uwk_off_by_one',
'lwk_off_by_one',
'qwk_off_by_one',
'kendall_tau',
'pearson',
'spearman',
'neg_log_loss'])
_REGRESSION_ONLY_OBJ_FUNCS = frozenset(['r2',
'neg_mean_squared_error'])
_CLASSIFICATION_ONLY_OBJ_FUNCS = frozenset(['accuracy',
'precision',
'recall',
'f1',
'f1_score_micro',
'f1_score_macro',
'f1_score_weighted',
'f1_score_least_frequent',
'average_precision',
'roc_auc',
'neg_log_loss'])
_INT_CLASS_OBJ_FUNCS = frozenset(['unweighted_kappa',
'linear_weighted_kappa',
'quadratic_weighted_kappa',
'uwk_off_by_one',
'lwk_off_by_one',
'qwk_off_by_one',
'neg_log_loss'])
_REQUIRES_DENSE = (BayesianRidge,
GradientBoostingClassifier,
GradientBoostingRegressor,
Lars,
TheilSenRegressor)
MAX_CONCURRENT_PROCESSES = int(os.getenv('SKLL_MAX_CONCURRENT_PROCESSES', '3'))
# pylint: disable=W0223,R0903
class FilteredLeaveOneGroupOut(LeaveOneGroupOut):
"""
Version of ``LeaveOneGroupOut`` cross-validation iterator that only outputs
indices of instances with IDs in a prespecified set.
Parameters
----------
keep : set of str
A set of IDs to keep.
example_ids : list of str, of length n_samples
A list of example IDs.
"""
def __init__(self, keep, example_ids):
super(FilteredLeaveOneGroupOut, self).__init__()
self.keep = keep
self.example_ids = example_ids
self._warned = False
self.logger = logging.getLogger(__name__)
def split(self, X, y, groups):
"""
Generate indices to split data into training and test set.
Parameters
----------
X : array-like, with shape (n_samples, n_features)
Training data, where n_samples is the number of samples
and n_features is the number of features.
y : array-like, of length n_samples
The target variable for supervised learning problems.
groups : array-like, with shape (n_samples,)
Group labels for the samples used while splitting the dataset into
train/test set.
Yields
------
train_index : np.array
The training set indices for that split.
test_index : np.array
The testing set indices for that split.
"""
for train_index, test_index in super(FilteredLeaveOneGroupOut,
self).split(X, y, groups):
train_len = len(train_index)
test_len = len(test_index)
train_index = [i for i in train_index if self.example_ids[i] in
self.keep]
test_index = [i for i in test_index if self.example_ids[i] in
self.keep]
if not self._warned and (train_len != len(train_index) or
test_len != len(test_index)):
self.logger.warning('Feature set contains IDs that are not ' +
'in folds dictionary. Skipping those IDs.')
self._warned = True
yield train_index, test_index
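# Example (illustrative sketch): ``train()`` and ``cross_validate()`` below
# build this splitter from a dict mapping example IDs to fold labels; the
# dict doubles as the ``keep`` set, and the fold labels are passed as the
# ``groups`` argument. ``X`` and ``y`` stand for any feature matrix and
# label array of matching length:
#
#     >>> folds = {'EX1': '0', 'EX2': '0', 'EX3': '1', 'EX4': '1'}
#     >>> ids = ['EX1', 'EX2', 'EX3', 'EX4', 'EX5']   # 'EX5' will be skipped
#     >>> groups = [folds.get(curr_id, '0') for curr_id in ids]
#     >>> splitter = FilteredLeaveOneGroupOut(folds, ids)
#     >>> for train_index, test_index in splitter.split(X, y, groups):
#     ...     pass  # indices only cover IDs that appear in ``folds``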
def _find_default_param_grid(cls):
"""
Finds the default parameter grid for the specified learner.
Parameters
----------
cls
The learner class for which to find the
default parameter grid.
Returns
-------
grid : list of dicts or None
The parameter grid for the given learner, or ``None`` if there is no default grid for it.
"""
for key_cls, grid in _DEFAULT_PARAM_GRIDS.items():
if issubclass(cls, key_cls):
return grid
return None
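# Example (illustrative): the lookup is subclass-aware, so a learner that is
# not itself a key falls back to the grid of its closest scikit-learn parent;
# e.g., the ``RescaledRidge`` class defined further down inherits the grid
# registered for ``Ridge``:
#
#     >>> _find_default_param_grid(RescaledRidge)
#     [{'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}]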
def _import_custom_learner(custom_learner_path, custom_learner_name):
"""
Does the gruntwork of adding the custom model's module to globals.
Parameters
----------
custom_learner_path : str
The path to a custom learner.
custom_learner_name : str
The name of a custom learner.
Raises
------
ValueError
If the custom learner path is None.
ValueError
If the custom learner path does not end in '.py'.
"""
if not custom_learner_path:
raise ValueError('custom_learner_path was not set and learner {} '
'was not found.'.format(custom_learner_name))
if not custom_learner_path.endswith('.py'):
raise ValueError('custom_learner_path must end in .py ({})'
.format(custom_learner_path))
custom_learner_module_name = os.path.basename(custom_learner_path)[:-3]
sys.path.append(os.path.dirname(os.path.abspath(custom_learner_path)))
import_module(custom_learner_module_name)
globals()[custom_learner_name] = \
getattr(sys.modules[custom_learner_module_name], custom_learner_name)
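# Example (illustrative sketch of a hypothetical custom learner module): the
# file named in ``custom_learner_path`` just needs to define a scikit-learn
# style estimator class whose name matches ``custom_learner_name``; it may
# also provide the optional ``default_param_grid()`` and ``requires_dense()``
# hooks that ``Learner.__init__()`` below looks for:
#
#     # contents of /path/to/majority_class_learner.py (hypothetical)
#     from sklearn.dummy import DummyClassifier
#
#     class MajorityClassLearner(DummyClassifier):
#         def __init__(self, strategy='most_frequent', random_state=None):
#             super(MajorityClassLearner, self).__init__(
#                 strategy=strategy, random_state=random_state)
#
#         @classmethod
#         def default_param_grid(cls):
#             return [{'strategy': ['most_frequent', 'stratified']}]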
def _train_and_score(learner,
train_examples,
test_examples,
objective='f1_score_micro'):
"""
A utility method to train a given learner instance on the given training examples,
generate predictions on the training set itself and also the given
test set, and score those predictions using the given objective function.
The method returns the train and test scores.
Note that this method needs to be a top-level function since it is
called from within ``joblib.Parallel()`` and, therefore, needs to be
picklable which it would not be as an instancemethod of the ``Learner``
class.
Parameters
----------
learner : skll.Learner
A SKLL ``Learner`` instance.
train_examples : skll.FeatureSet
The training examples.
test_examples : skll.FeatureSet
The test examples.
objective : str, optional
The objective function passed to ``use_score_func()``.
Defaults to ``'f1_score_micro'``.
Returns
-------
train_score : float
Output of the score function applied to predictions of
``learner`` on ``train_examples``.
test_score : float
Output of the score function applied to predictions of
``learner`` on ``test_examples``.
"""
_ = learner.train(train_examples, grid_search=False, shuffle=False)
train_predictions = learner.predict(train_examples)
test_predictions = learner.predict(test_examples)
if learner.model_type._estimator_type == 'classifier':
test_label_list = np.unique(test_examples.labels).tolist()
unseen_test_label_list = [label for label in test_label_list
if label not in learner.label_list]
unseen_label_dict = {label: i for i, label in enumerate(unseen_test_label_list,
start=len(learner.label_list))}
# combine the two dictionaries
train_and_test_label_dict = learner.label_dict.copy()
train_and_test_label_dict.update(unseen_label_dict)
train_labels = np.array([train_and_test_label_dict[label]
for label in train_examples.labels])
test_labels = np.array([train_and_test_label_dict[label]
for label in test_examples.labels])
else:
train_labels = train_examples.labels
test_labels = test_examples.labels
train_score = use_score_func(objective, train_labels, train_predictions)
test_score = use_score_func(objective, test_labels, test_predictions)
return train_score, test_score
def _predict_binary(self, X):
"""
A helper function to allow us to use ``GridSearchCV`` with objective
functions like Kendall's tau for binary classification problems (where the
probability of the true class is used as the input to the objective
function).
This only works if we've also taken the step of storing the old predict
function for ``self`` as ``predict_normal``. It's kind of a hack, but it saves
us from having to override ``GridSearchCV`` to change one little line.
Parameters
----------
X : array-like
A set of examples to predict values for.
Returns
-------
res : array-like
The prediction results.
"""
if self.coef_.shape[0] == 1:
res = self.predict_proba(X)[:, 1]
else:
res = self.predict_normal(X)
return res
class SelectByMinCount(SelectKBest):
"""
Select features that occur in at least a specified number of examples
in the training data (or a CV training fold).
Parameters
----------
min_count : int, optional
The minimum feature count to select.
Defaults to 1.
"""
def __init__(self, min_count=1):
self.min_count = min_count
self.scores_ = None
def fit(self, X, y=None):
"""
Fit the SelectByMinCount model.
Parameters
----------
X : array-like, with shape (n_samples, n_features)
The training data to fit.
y : Ignored
Returns
-------
self
"""
# initialize a list of counts of times each feature appears
col_counts = [0 for _ in range(X.shape[1])]
if sp.issparse(X):
# find() is scipy.sparse's equivalent of nonzero()
_, col_indices, _ = sp.find(X)
else:
# assume it's a numpy array (not a numpy matrix)
col_indices = X.nonzero()[1].tolist()
for i in col_indices:
col_counts[i] += 1
self.scores_ = np.array(col_counts)
return self
def _get_support_mask(self):
"""
Returns an indication of which features to keep.
Adapted from ``SelectKBest``.
Returns
-------
mask : np.array
The mask with features to keep set to True.
"""
mask = np.zeros(self.scores_.shape, dtype=bool)
mask[self.scores_ >= self.min_count] = True
return mask
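# Example (illustrative): with ``min_count=2``, only columns that have a
# nonzero value in at least two rows survive the transform:
#
#     >>> X = sp.csr_matrix([[1.0, 0.0, 2.0],
#     ...                    [3.0, 0.0, 0.0],
#     ...                    [0.0, 4.0, 0.0]])
#     >>> selector = SelectByMinCount(min_count=2)
#     >>> selector.fit_transform(X).shape   # only the first column is kept
#     (3, 1)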
def rescaled(cls):
"""
Decorator to create regressors that store a min and a max for the training
data and make sure that predictions fall within that range. It also stores
the means and SDs of the gold standard and the predictions on the training
set to rescale the predictions (e.g., as in e-rater).
Parameters
----------
cls : BaseEstimator
An estimator class to add rescaling to.
Returns
-------
cls : BaseEstimator
Modified version of estimator class with rescaled functions added.
Raises
------
ValueError
If classifier cannot be rescaled (i.e. is not a regressor).
"""
# If this class has already been run through the decorator, return it
if hasattr(cls, 'rescale'):
return cls
# Save original versions of functions to use later.
orig_init = cls.__init__
orig_fit = cls.fit
orig_predict = cls.predict
if cls._estimator_type == 'classifier':
raise ValueError('Classifiers cannot be rescaled. ' +
'Only regressors can.')
# Define all new versions of functions
@wraps(cls.fit)
def fit(self, X, y=None):
"""
Fit a model, then store the mean, SD, max and min of the training set
and the mean and SD of the predictions on the training set.
Parameters
----------
X : array-like, with shape (n_samples, n_features)
The data to fit.
y : Ignored
Returns
-------
self
"""
# fit a regular regression model
orig_fit(self, X, y=y)
if self.constrain:
# also record the training data min and max
self.y_min = min(y)
self.y_max = max(y)
if self.rescale:
# also record the means and SDs for the training set
y_hat = orig_predict(self, X)
self.yhat_mean = np.mean(y_hat)
self.yhat_sd = np.std(y_hat)
self.y_mean = np.mean(y)
self.y_sd = np.std(y)
return self
@wraps(cls.predict)
def predict(self, X):
"""
Make predictions with the super class, and then adjust them using the
stored min, max, means, and standard deviations.
Parameters
----------
X : array-like, with shape (n_samples, n_features)
The data to predict.
Returns
-------
res : array-like
The prediction results.
"""
# get the unconstrained predictions
res = orig_predict(self, X)
if self.rescale:
# convert the predictions to z-scores,
# then rescale to match the training set distribution
res = (((res - self.yhat_mean) / self.yhat_sd)
* self.y_sd) + self.y_mean
if self.constrain:
# apply min and max constraints
res = np.array([max(self.y_min, min(self.y_max, pred))
for pred in res])
return res
@classmethod
@wraps(cls._get_param_names)
def _get_param_names(class_x):
"""
This is adapted from scikit-learn's ``BaseEstimator`` class.
It gets the kwargs for the superclass's init method and adds the
kwargs for the newly added ``__init__()`` method.
Parameters
----------
class_x
The class from which to retrieve the parameter names.
Returns
-------
args : list
A list of parameter names for the class's init method.
Raises
------
RuntimeError
If `varargs` exist in the scikit-learn estimator.
"""
try:
init = getattr(orig_init, 'deprecated_original', orig_init)
args, varargs, _, _ = inspect.getargspec(init)
if varargs is not None:
raise RuntimeError('scikit-learn estimators should always '
'specify their parameters in the signature'
' of their init (no varargs).')
# Remove 'self'
args.pop(0)
except TypeError:
args = []
rescale_args = inspect.getargspec(class_x.__init__)[0]
# Remove 'self'
rescale_args.pop(0)
args += rescale_args
args.sort()
return args
@wraps(cls.__init__)
def init(self, constrain=True, rescale=True, **kwargs):
"""
This special init function is used by the decorator to make sure
that things get initialized in the right order.
Parameters
----------
constrain : bool, optional
Whether to constrain predictions within min and max values.
Defaults to True.
rescale : bool, optional
Whether to rescale prediction values using z-scores.
Defaults to True.
kwargs : dict, optional
Arguments for base class.
"""
# pylint: disable=W0201
self.constrain = constrain
self.rescale = rescale
self.y_min = None
self.y_max = None
self.yhat_mean = None
self.yhat_sd = None
self.y_mean = None
self.y_sd = None
orig_init(self, **kwargs)
# Override original functions with new ones
cls.__init__ = init
cls.fit = fit
cls.predict = predict
cls._get_param_names = _get_param_names
cls.rescale = True
# Return modified class
return cls
# Rescaled regressors
@rescaled
class RescaledBayesianRidge(BayesianRidge):
pass
@rescaled
class RescaledAdaBoostRegressor(AdaBoostRegressor):
pass
@rescaled
class RescaledDecisionTreeRegressor(DecisionTreeRegressor):
pass
@rescaled
class RescaledElasticNet(ElasticNet):
pass
@rescaled
class RescaledGradientBoostingRegressor(GradientBoostingRegressor):
pass
@rescaled
class RescaledHuberRegressor(HuberRegressor):
pass
@rescaled
class RescaledKNeighborsRegressor(KNeighborsRegressor):
pass
@rescaled
class RescaledLars(Lars):
pass
@rescaled
class RescaledLasso(Lasso):
pass
@rescaled
class RescaledLinearRegression(LinearRegression):
pass
@rescaled
class RescaledLinearSVR(LinearSVR):
pass
@rescaled
class RescaledMLPRegressor(MLPRegressor):
pass
@rescaled
class RescaledRandomForestRegressor(RandomForestRegressor):
pass
@rescaled
class RescaledRANSACRegressor(RANSACRegressor):
pass
@rescaled
class RescaledRidge(Ridge):
pass
@rescaled
class RescaledSGDRegressor(SGDRegressor):
pass
@rescaled
class RescaledSVR(SVR):
pass
@rescaled
class RescaledTheilSenRegressor(TheilSenRegressor):
pass
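# Example (illustrative): each generated class accepts two extra keyword
# arguments on top of the wrapped regressor's own parameters. ``X_train``,
# ``y_train``, and ``X_test`` stand for any compatible arrays:
#
#     >>> reg = RescaledRidge(constrain=True, rescale=True, alpha=1.0)
#     >>> reg.fit(X_train, y_train)    # records min/max/mean/SD of the labels
#     >>> preds = reg.predict(X_test)  # z-scored back to the label scale and
#     ...                              # clipped to [y_min, y_max]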
class Learner(object):
"""
A simpler learner interface around many scikit-learn classification
and regression functions.
Parameters
----------
model_type : str
Name of estimator to create (e.g., ``'LogisticRegression'``).
See the skll package documentation for valid options.
probability : bool, optional
Should the learner return probabilities of all
labels (instead of just the label with the highest probability)?
Defaults to ``False``.
feature_scaling : str, optional
How to scale the features, if at all. Options are
- 'with_std': scale features using the standard deviation
- 'with_mean': center features using the mean
- 'both': do both scaling as well as centering
- 'none': do neither scaling nor centering
Defaults to 'none'.
model_kwargs : dict, optional
A dictionary of keyword arguments to pass to the
initializer for the specified model.
Defaults to ``None``.
pos_label_str : str, optional
The string for the positive label in the binary
classification setting. Otherwise, an arbitrary
label is picked.
Defaults to ``None``.
min_feature_count : int, optional
The minimum number of examples in which a feature
must have a nonzero value to be included.
Defaults to 1.
sampler : str, optional
The sampler to use for kernel approximation, if desired.
Valid values are
- 'AdditiveChi2Sampler'
- 'Nystroem'
- 'RBFSampler'
- 'SkewedChi2Sampler'
Defaults to ``None``.
sampler_kwargs : dict, optional
A dictionary of keyword arguments to pass to the
initializer for the specified sampler.
Defaults to ``None``.
custom_learner_path : str, optional
Path to module where a custom classifier is defined.
Defaults to ``None``.
logger : logging object, optional
A logging object. If ``None`` is passed, get logger from ``__name__``.
Defaults to ``None``.
"""
def __init__(self, model_type, probability=False, feature_scaling='none',
model_kwargs=None, pos_label_str=None, min_feature_count=1,
sampler=None, sampler_kwargs=None, custom_learner_path=None,
logger=None):
"""
Initializes a learner object with the specified settings.
"""
super(Learner, self).__init__()
self.feat_vectorizer = None
self.scaler = None
self.label_dict = None
self.label_list = None
self.pos_label_str = pos_label_str
self._model = None
self._feature_scaling = feature_scaling
self.feat_selector = None
self._min_feature_count = min_feature_count
self._model_kwargs = {}
self._sampler_kwargs = {}
self.logger = logger if logger else logging.getLogger(__name__)
if model_type not in globals():
# here, we need to import the custom model and add it
# to the appropriate lists of models.
_import_custom_learner(custom_learner_path, model_type)
model_class = globals()[model_type]
default_param_grid = (model_class.default_param_grid()
if hasattr(model_class, 'default_param_grid')
else [{}])
# ewww, globals :-(
global _REQUIRES_DENSE
_DEFAULT_PARAM_GRIDS.update({model_class: default_param_grid})
if hasattr(model_class, 'requires_dense') and \
model_class.requires_dense():
_REQUIRES_DENSE = _REQUIRES_DENSE + (model_class,)
self._model_type = globals()[model_type]
self._probability = None
# Use setter to set self.probability
self.probability = probability
self._use_dense_features = \
(issubclass(self._model_type, _REQUIRES_DENSE) or
self._feature_scaling in {'with_mean', 'both'})
# Set default keyword arguments for models that we have some for.
if issubclass(self._model_type, SVC):
self._model_kwargs['cache_size'] = 1000
self._model_kwargs['probability'] = self.probability
if self.probability:
self.logger.warning('Because LibSVM does an internal '
'cross-validation to produce probabilities, '
'results will not be exactly replicable when '
'using SVC and probability mode.')
elif issubclass(self._model_type,
(RandomForestClassifier, RandomForestRegressor,
GradientBoostingClassifier, GradientBoostingRegressor,
AdaBoostClassifier, AdaBoostRegressor)):
self._model_kwargs['n_estimators'] = 500
elif issubclass(self._model_type, SVR):
self._model_kwargs['cache_size'] = 1000
elif issubclass(self._model_type, SGDClassifier):
self._model_kwargs['loss'] = 'log'
elif issubclass(self._model_type, RANSACRegressor):
self._model_kwargs['loss'] = 'squared_loss'
elif issubclass(self._model_type, (MLPClassifier, MLPRegressor)):
self._model_kwargs['learning_rate'] = 'invscaling'
self._model_kwargs['max_iter'] = 500
if issubclass(self._model_type,
(AdaBoostClassifier, AdaBoostRegressor,
DecisionTreeClassifier, DecisionTreeRegressor,
DummyClassifier, ElasticNet,
GradientBoostingClassifier,
GradientBoostingRegressor, Lasso, LinearSVC,
LinearSVR, LogisticRegression, MLPClassifier,
MLPRegressor, RandomForestClassifier,
RandomForestRegressor, RANSACRegressor, Ridge,
RidgeClassifier, SGDClassifier, SGDRegressor,
SVC, TheilSenRegressor)):
self._model_kwargs['random_state'] = 123456789
if sampler_kwargs:
self._sampler_kwargs.update(sampler_kwargs)
if sampler:
sampler_type = globals()[sampler]
if issubclass(sampler_type, (Nystroem, RBFSampler,
SkewedChi2Sampler)):
self._sampler_kwargs['random_state'] = 123456789
self.sampler = sampler_type(**self._sampler_kwargs)
else:
self.sampler = None
if model_kwargs:
# if the model is an AdaBoost classifier or regressor or
# a RANSAC regressor, then we need to convert any specified
# `base_estimator` (a string) into an object before passing
# it in to the learner constructor. We also need to make sure
# where appropriate, we set the random state to a fixed seed
# such that results are replicable
if issubclass(self._model_type,
(AdaBoostRegressor,
AdaBoostClassifier,
RANSACRegressor)) and ('base_estimator' in model_kwargs):
base_estimator_name = model_kwargs['base_estimator']
base_estimator_kwargs = {} if base_estimator_name in ['LinearRegression',
'MultinomialNB',
'SVR'] else {'random_state': 123456789}
base_estimator = globals()[base_estimator_name](**base_estimator_kwargs)
model_kwargs['base_estimator'] = base_estimator
self._model_kwargs.update(model_kwargs)
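# Example (illustrative sketch): typical ways of constructing a Learner,
# including model keyword arguments and a kernel-approximation sampler:
#
#     >>> svm = Learner('LinearSVC',
#     ...               model_kwargs={'C': 0.1},
#     ...               feature_scaling='both',
#     ...               min_feature_count=2)
#     >>> clf = Learner('SVC', probability=True, sampler='RBFSampler')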
@classmethod
def from_file(cls, learner_path):
"""
Load a saved ``Learner`` instance from a file path.
Parameters
----------
learner_path : str
The path to a saved ``Learner`` instance file.
Returns
-------
learner : skll.Learner
The ``Learner`` instance loaded from the file.
Raises
------
ValueError
If the pickled object is not a ``Learner`` instance.
ValueError
If the pickled version of the ``Learner`` instance is out of date.
"""
skll_version, learner = joblib.load(learner_path)
# For backward compatibility, convert string model types to the actual classes.
if isinstance(learner._model_type, string_types):
learner._model_type = globals()[learner._model_type]
# Check that we've actually loaded a Learner (or sub-class)
if not isinstance(learner, cls):
raise ValueError(('The pickle stored at {} does not contain ' +
'a {} object.').format(learner_path, cls))
# Check that versions are compatible. (Currently, this just checks
# that the model was saved with SKLL version 0.9.17 or later.)
elif skll_version >= (0, 9, 17):
if not hasattr(learner, 'sampler'):
learner.sampler = None
# From v0.17.0 onwards, scikit-learn requires all scalers to have
# the `scale_` instead of the `std_` parameter. So, we need to
# make all old models adapt to this.
if hasattr(learner, 'scaler'):
new_scaler = copy.copy(learner.scaler)
# We need to use `__dict__` because the `std_` has been
# overridden to just return the `scale_` value, and we
# need the original value of `std_`.
if (not hasattr(new_scaler, 'scale_') and
'std_' in new_scaler.__dict__):
new_scaler.scale_ = new_scaler.__dict__['std_']
learner.scaler = new_scaler
return learner
else:
raise ValueError(("{} stored in pickle file {} was " +
"created with version {} of SKLL, which is " +
"incompatible with the current version " +
"{}").format(cls, learner_path,
'.'.join(skll_version),
'.'.join(VERSION)))
@property
def model_type(self):
"""
The model type (i.e., the class)
"""
return self._model_type
@property
def model_kwargs(self):
"""
A dictionary of the underlying scikit-learn model's keyword arguments
"""
return self._model_kwargs
@property
def model(self):
"""
The underlying scikit-learn model
"""
return self._model
def load(self, learner_path):
"""
Replace the current learner instance with a saved learner.
Parameters
----------
learner_path : str
The path to a saved learner object file to load.
"""
del self.__dict__
self.__dict__ = Learner.from_file(learner_path).__dict__
@property
def model_params(self):
"""
Model parameters (i.e., weights) for a ``LinearModel`` (e.g., ``Ridge``)
regressor or a liblinear model.
Returns
-------
res : dict
A dictionary of labeled weights.
intercept : dict
A dictionary of intercept(s).
Raises
------
ValueError
If the instance does not support model parameters.
"""
res = {}
intercept = None
if (isinstance(self._model, LinearModel) or
(isinstance(self._model, SVR) and
self._model.kernel == 'linear') or
isinstance(self._model, SGDRegressor)):
# also includes RescaledRidge, RescaledSVR, RescaledSGDRegressor
coef = self.model.coef_
intercept = {'_intercept_': self.model.intercept_}
# convert SVR coefficient format (a sparse 1 x n matrix) to an array
if isinstance(self._model, SVR):
coef = coef.toarray()[0]
# inverse transform to get indices for before feature selection
coef = coef.reshape(1, -1)
coef = self.feat_selector.inverse_transform(coef)[0]
for feat, idx in iteritems(self.feat_vectorizer.vocabulary_):
if coef[idx]:
res[feat] = coef[idx]
elif isinstance(self._model, LinearSVC) or isinstance(self._model, LogisticRegression):
label_list = self.label_list
# if there are only two labels, scikit-learn will only have one
# set of parameters and they will be associated with label 1 (not
# 0)
if len(self.label_list) == 2:
label_list = self.label_list[-1:]
for i, label in enumerate(label_list):
coef = self.model.coef_[i]
coef = coef.reshape(1, -1)
coef = self.feat_selector.inverse_transform(coef)[0]
for feat, idx in iteritems(self.feat_vectorizer.vocabulary_):
if coef[idx]:
res['{}\t{}'.format(label, feat)] = coef[idx]
if isinstance(self.model.intercept_, float):
intercept = {'_intercept_': self.model.intercept_}
elif self.model.intercept_.any():
intercept = dict(zip(label_list, self.model.intercept_))
else:
# not supported
raise ValueError(("{} is not supported by" +
" model_params with its current settings."
).format(self._model_type.__name__))
return res, intercept
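# Example (illustrative): the two dictionaries returned by ``model_params``
# map feature names (or tab-separated label/feature strings for classifiers)
# and intercepts to their values; e.g., to list the ten largest weights:
#
#     >>> weights, intercepts = learner.model_params
#     >>> sorted(weights.items(), key=lambda kv: -abs(kv[1]))[:10]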
@property
def probability(self):
"""
Should the learner return probabilities of all labels (instead of
just the label with the highest probability)?
"""
return self._probability
@probability.setter
def probability(self, value):
"""
Set the probabilities flag (i.e. whether learner
should return probabilities of all labels).
Parameters
----------
value : bool
Whether learner should return probabilities of all labels.
"""
# LinearSVC doesn't support predict_proba
self._probability = value
if not hasattr(self.model_type, "predict_proba") and value:
self.logger.warning("Probability was set to True, but {} does not have "
"a predict_proba() method.".format(self.model_type.__name__))
self._probability = False
def __getstate__(self):
"""
Return the attributes that should be pickled. We need this
because we cannot pickle loggers.
"""
attribute_dict = dict(self.__dict__)
del attribute_dict['logger']
return attribute_dict
def save(self, learner_path):
"""
Save the ``Learner`` instance to a file.
Parameters
----------
learner_path : str
The path to save the ``Learner`` instance to.
"""
# create the directory if it doesn't exist
learner_dir = os.path.dirname(learner_path)
if learner_dir and not os.path.exists(learner_dir):
os.makedirs(learner_dir)
# write out the learner to disk
joblib.dump((VERSION, self), learner_path)
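# Example (illustrative): ``save()`` and the ``from_file()`` classmethod
# defined above form a round trip; the path shown is hypothetical:
#
#     >>> learner.save('/tmp/my_experiment/LogisticRegression.model')
#     >>> same_learner = Learner.from_file('/tmp/my_experiment/LogisticRegression.model')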
def _create_estimator(self):
"""
Create an estimator.
Returns
-------
estimator
The estimator that was created.
default_param_grid : list of dicts
The parameter grid for the estimator.
Raises
------
ValueError
If there is no default parameter grid for estimator.
"""
estimator = None
default_param_grid = _find_default_param_grid(self._model_type)
if default_param_grid is None:
raise ValueError("%s is not a valid learner type." %
(self._model_type.__name__,))
estimator = self._model_type(**self._model_kwargs)
return estimator, default_param_grid
def _check_input_formatting(self, examples):
"""
Check that the examples are properly formatted.
Parameters
----------
examples : skll.FeatureSet
The ``FeatureSet`` instance to use for training.
Raises
------
TypeError
If labels are strings.
TypeError
If any features are strings.
"""
# Make sure the labels for a regression task are not strings.
if self.model_type._estimator_type == 'regressor':
for label in examples.labels:
if isinstance(label, string_types):
raise TypeError("You are doing regression with string "
"labels. Convert them to integers or "
"floats.")
# make sure that feature values are not strings
for val in examples.features.data:
if isinstance(val, string_types):
raise TypeError("You have feature values that are strings. "
"Convert them to floats.")
def _check_max_feature_value(self, feat_array):
"""
Check if the maximum absolute value of any feature is too large.
Parameters
----------
feat_array : np.array
A numpy array with features.
"""
max_feat_abs = np.max(np.abs(feat_array.data))
if max_feat_abs > 1000.0:
self.logger.warning("You have a feature with a very large absolute "
"value ({}). That may cause the learning "
"algorithm to crash or perform "
"poorly.".format(max_feat_abs))
def _create_label_dict(self, examples):
"""
Creates a dictionary of labels for classification problems.
Parameters
----------
examples : skll.FeatureSet
The examples to use for training.
"""
# We don't need to do this for regression models, so return.
if self.model_type._estimator_type == 'regressor':
return
# extract list of unique labels if we are doing classification
self.label_list = np.unique(examples.labels).tolist()
# if one label is specified as the positive class, make sure it's
# last
if self.pos_label_str:
self.label_list = sorted(self.label_list,
key=lambda x: (x == self.pos_label_str,
x))
# Given a list of all labels in the dataset and a list of the
# unique labels in the set, convert the first list to an array of
# numbers.
self.label_dict = {label: i for i, label in enumerate(self.label_list)}
def _train_setup(self, examples):
"""
Set up the feature vectorizer and the scaler.
Parameters
----------
examples : skll.FeatureSet
The ``FeatureSet`` instance to use for training.
"""
# Check feature values and labels
self._check_input_formatting(examples)
# Create feature name -> value mapping
self.feat_vectorizer = examples.vectorizer
# initialize feature selector
self.feat_selector = SelectByMinCount(
min_count=self._min_feature_count)
# Create scaler if we weren't passed one and it's necessary
if not issubclass(self._model_type, MultinomialNB):
if self._feature_scaling != 'none':
scale_with_mean = self._feature_scaling in {
'with_mean', 'both'}
scale_with_std = self._feature_scaling in {'with_std', 'both'}
self.scaler = StandardScaler(copy=True,
with_mean=scale_with_mean,
with_std=scale_with_std)
else:
# Use a dummy transformation here so that the feature values
# are not modified
self.scaler = StandardScaler(copy=False,
with_mean=False,
with_std=False)
def train(self, examples, param_grid=None, grid_search_folds=3,
grid_search=True, grid_objective='f1_score_micro',
grid_jobs=None, shuffle=False, create_label_dict=True):
"""
Train a model on the given examples, optionally tuning its
hyperparameters via grid search, and return the best grid search
objective score.
Parameters
----------
examples : skll.FeatureSet
The ``FeatureSet`` instance to use for training.
param_grid : list of dicts, optional
The parameter grid to search through for grid
search. If ``None``, a default parameter grid
will be used.
Defaults to ``None``.
grid_search_folds : int or dict, optional
The number of folds to use when doing the
grid search, or a mapping from
example IDs to folds.
Defaults to 3.
grid_search : bool, optional
Should we do grid search?
Defaults to ``True``.
grid_objective : str, optional
The name of the objective function to use when
doing the grid search.
Defaults to ``'f1_score_micro'``.
grid_jobs : int, optional
The number of jobs to run in parallel when doing the
grid search. If ``None`` or 0, the number of
grid search folds will be used.
Defaults to ``None``.
shuffle : bool, optional
Shuffle examples (e.g., for grid search CV.)
Defaults to ``False``.
create_label_dict : bool, optional
Should we create the label dictionary? This
dictionary is used to map between string
labels and their corresponding numerical
values. This should only be done once per
experiment, so when ``cross_validate`` calls
``train``, ``create_label_dict`` gets set to
``False``.
Defaults to ``True``.
Returns
-------
grid_score : float
The best grid search objective function score, or 0 if we're
not doing grid search.
Raises
------
ValueError
If grid_objective is not a valid grid objective.
MemoryError
If process runs out of memory converting training data to dense.
ValueError
If FeatureHasher is used with MultinomialNB.
"""
# if we are asked to do grid search, check that the grid objective
# function is valid for the selected learner
if grid_search:
if self.model_type._estimator_type == 'regressor':
# any objective that is not classification-only is valid for regressors
if grid_objective in _CLASSIFICATION_ONLY_OBJ_FUNCS:
raise ValueError("{} is not a valid grid objective "
"function for the {} learner"
.format(grid_objective,
self._model_type.__name__))
elif grid_objective not in _CLASSIFICATION_ONLY_OBJ_FUNCS:
# This is a classifier. Valid objective functions depend on
# type of label (int, string, binary)
if issubclass(examples.labels.dtype.type, int):
# with integer labels, the objective must be one of the integer-label functions,
if grid_objective not in _INT_CLASS_OBJ_FUNCS:
raise ValueError("{} is not a valid grid objective "
"function for the {} learner with "
"integer labels"
.format(grid_objective,
self._model_type.__name__))
elif issubclass(examples.labels.dtype.type, str):
# if all of the labels are strings, only the classification-only
# objectives are valid, and this objective is not one of them
raise ValueError("{} is not a valid grid objective "
"function for the {} learner with string "
"labels".format(grid_objective,
self._model_type.__name__))
elif len(set(examples.labels)) == 2:
# If there are two labels, the binary-class objectives are valid for
# classifiers regardless of the type of the label.
if grid_objective not in _BINARY_CLASS_OBJ_FUNCS:
raise ValueError("{} is not a valid grid objective "
"function for the {} learner with "
"binary labels"
.format(grid_objective,
self._model_type.__name__))
elif grid_objective in _REGRESSION_ONLY_OBJ_FUNCS:
# simple backoff check for mixed-type labels
raise ValueError("{} is not a valid grid objective "
"function for the {} learner"
.format(grid_objective,
self._model_type.__name__))
# Shuffle so that the folds are random for the inner grid search CV.
# If grid search is True but shuffle isn't, shuffle anyway.
# You can't shuffle a scipy sparse matrix in place, so unfortunately
# we make a copy of everything (and then get rid of the old version)
if grid_search or shuffle:
if grid_search and not shuffle:
self.logger.warning('Training data will be shuffled to randomize '
'grid search folds. Shuffling may yield '
'different results compared to scikit-learn.')
ids, labels, features = sk_shuffle(examples.ids, examples.labels,
examples.features,
random_state=123456789)
examples = FeatureSet(examples.name, ids, labels=labels,
features=features,
vectorizer=examples.vectorizer)
# call train setup to set up the vectorizer, the labeldict, and the
# scaler
if create_label_dict:
self._create_label_dict(examples)
self._train_setup(examples)
# select features
xtrain = self.feat_selector.fit_transform(examples.features)
# Convert to dense if necessary
if self._use_dense_features:
try:
xtrain = xtrain.todense()
except MemoryError:
if issubclass(self._model_type, _REQUIRES_DENSE):
reason = ('{} does not support sparse ' +
'matrices.').format(self._model_type.__name__)
else:
reason = ('{} feature scaling requires a dense ' +
'matrix.').format(self._feature_scaling)
raise MemoryError('Ran out of memory when converting training '
'data to dense. This was required because ' +
reason)
if isinstance(self.feat_vectorizer, FeatureHasher) and \
issubclass(self._model_type, MultinomialNB):
raise ValueError('Cannot use FeatureHasher with MultinomialNB, '
'because MultinomialNB cannot handle negative '
'feature values.')
# Scale features if necessary
if not issubclass(self._model_type, MultinomialNB):
xtrain = self.scaler.fit_transform(xtrain)
# check whether any feature values are too large
self._check_max_feature_value(xtrain)
# Sampler
if self.sampler:
self.logger.warning('Sampler converts sparse matrix to dense')
if isinstance(self.sampler, SkewedChi2Sampler):
self.logger.warning('SkewedChi2Sampler uses a dense matrix')
xtrain = self.sampler.fit_transform(xtrain.todense())
else:
xtrain = self.sampler.fit_transform(xtrain)
# use label dict transformed version of examples.labels if doing
# classification
if self.model_type._estimator_type == 'classifier':
labels = np.array([self.label_dict[label] for label in
examples.labels])
else:
labels = examples.labels
# Instantiate an estimator and get the default parameter grid to search
estimator, default_param_grid = self._create_estimator()
# Use default parameter grid if we weren't passed one
# In case the default parameter grid is also empty
# then there's no point doing the grid search at all
if grid_search and not param_grid:
if default_param_grid == [{}]:
self.logger.warning("SKLL has no default parameter grid "
"available for the {} learner and no "
"parameter grids were supplied. Using "
"default values instead of grid "
"search.".format(self._model_type.__name__))
grid_search = False
else:
param_grid = default_param_grid
# set up a grid searcher if we are asked to
if grid_search:
# set up grid search folds
if isinstance(grid_search_folds, int):
grid_search_folds = \
self._compute_num_folds_from_example_counts(grid_search_folds, labels)
if not grid_jobs:
grid_jobs = grid_search_folds
else:
grid_jobs = min(grid_search_folds, grid_jobs)
folds = grid_search_folds
else:
# use the number of unique fold IDs as the number of grid jobs
num_specified_folds = len(set(grid_search_folds.values()))
if not grid_jobs:
grid_jobs = num_specified_folds
else:
grid_jobs = min(num_specified_folds, grid_jobs)
# Only retain IDs within folds if they're in grid_search_folds
dummy_label = next(itervalues(grid_search_folds))
fold_groups = [grid_search_folds.get(curr_id, dummy_label) for
curr_id in examples.ids]
kfold = FilteredLeaveOneGroupOut(grid_search_folds, examples.ids)
folds = kfold.split(examples.features, examples.labels, fold_groups)
# If we're using a correlation metric for doing binary
# classification, override the estimator's predict function
if (grid_objective in _CORRELATION_METRICS and
self.model_type._estimator_type == 'classifier'):
estimator.predict_normal = estimator.predict
estimator.predict = _predict_binary
# limit the number of grid_jobs to be no higher than the number of
# cores for the machine or MAX_CONCURRENT_PROCESSES, whichever is lower
grid_jobs = min(grid_jobs, cpu_count(), MAX_CONCURRENT_PROCESSES)
grid_searcher = GridSearchCV(estimator, param_grid,
scoring=grid_objective,
cv=folds,
n_jobs=grid_jobs,
pre_dispatch=grid_jobs)
# run the grid search for hyperparameters
grid_searcher.fit(xtrain, labels)
self._model = grid_searcher.best_estimator_
grid_score = grid_searcher.best_score_
else:
self._model = estimator.fit(xtrain, labels)
grid_score = 0.0
return grid_score
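# Example (illustrative sketch, assuming ``fs_train`` is a ``FeatureSet``
# built elsewhere, e.g. with a skll.data reader): grid search over a custom
# parameter grid using a prespecified folds dictionary:
#
#     >>> learner = Learner('LogisticRegression')
#     >>> grid_score = learner.train(fs_train,
#     ...                            param_grid=[{'C': [0.1, 1.0, 10.0]}],
#     ...                            grid_search=True,
#     ...                            grid_search_folds={'EX1': '0', 'EX2': '1'},
#     ...                            grid_objective='accuracy')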
def evaluate(self, examples, prediction_prefix=None, append=False,
grid_objective=None, output_metrics=[]):
"""
Evaluates a given model on a given dev or test ``FeatureSet``.
Parameters
----------
examples : skll.FeatureSet
The ``FeatureSet`` instance to evaluate the performance of the model on.
prediction_prefix : str, optional
If saving the predictions, this is the
prefix that will be used for the filename.
It will be followed by ``"_predictions.tsv"``
Defaults to ``None``.
append : bool, optional
Should we append the current predictions to the file if
it exists?
Defaults to ``False``.
grid_objective : str, optional
The name of the objective function that was used when
doing the grid search.
Defaults to ``None``.
output_metrics : list of str, optional
List of additional metric names to compute in
addition to grid objective. Empty by default.
Defaults to an empty list.
Returns
-------
res : 6-tuple
The confusion matrix, the overall accuracy, the per-label
PRFs, the model parameters, the grid search objective
function score, and the additional evaluation metrics, if any.
"""
# initialize a dictionary that will hold all of the metric scores
metric_scores = {metric: None for metric in output_metrics}
# make the prediction on the test data
yhat = self.predict(examples,
prediction_prefix=prediction_prefix,
append=append)
# make a single list of metrics including the grid objective
# since it's easier to compute everything together
metrics_to_compute = [grid_objective] + output_metrics
# extract actual labels (transformed for classification tasks)
if self.model_type._estimator_type == 'classifier':
test_label_list = np.unique(examples.labels).tolist()
# identify unseen test labels if any and add a new dictionary for these
# labels
unseen_test_label_list = [label for label in test_label_list
if label not in self.label_list]
unseen_label_dict = {label: i for i, label in enumerate(unseen_test_label_list,
start=len(self.label_list))}
# combine the two dictionaries
train_and_test_label_dict = self.label_dict.copy()
train_and_test_label_dict.update(unseen_label_dict)
ytest = np.array([train_and_test_label_dict[label]
for label in examples.labels])
else:
ytest = examples.labels
# compute all of the metrics that we need to but save the original
# predictions since we will need to use those for each metric
original_yhat = yhat
for metric in metrics_to_compute:
# if run in probability mode, convert yhat to list of labels predicted
if self.probability:
# if we're using a correlation grid objective, calculate it here
if metric and metric in _CORRELATION_METRICS:
try:
metric_scores[metric] = use_score_func(metric, ytest, yhat[:, 1])
except ValueError:
metric_scores[metric] = float('NaN')
yhat = np.array([max(range(len(row)),
key=lambda i: row[i])
for row in original_yhat])
# calculate grid search objective function score, if specified
if (metric and (metric not in _CORRELATION_METRICS or
not self.probability)):
try:
metric_scores[metric] = use_score_func(metric, ytest, yhat)
except ValueError:
metric_scores[metric] = float('NaN')
# now separate out the grid objective score from the additional metric scores
# if a grid objective was actually passed in. If no objective was passed in
# then that score should just be none.
objective_score = None
additional_scores = metric_scores.copy()
if grid_objective:
objective_score = metric_scores[grid_objective]
del additional_scores[grid_objective]
if self.model_type._estimator_type == 'regressor':
result_dict = {'descriptive': defaultdict(dict)}
for table_label, y in zip(['actual', 'predicted'], [ytest, yhat]):
result_dict['descriptive'][table_label]['min'] = min(y)
result_dict['descriptive'][table_label]['max'] = max(y)
result_dict['descriptive'][table_label]['avg'] = np.mean(y)
result_dict['descriptive'][table_label]['std'] = np.std(y)
result_dict['pearson'] = use_score_func('pearson', ytest, yhat)
res = (None, None, result_dict, self._model.get_params(), objective_score,
additional_scores)
else:
# compute the confusion matrix
num_labels = len(train_and_test_label_dict)
conf_mat = confusion_matrix(ytest, yhat,
labels=list(range(num_labels)))
# Calculate metrics
overall_accuracy = accuracy_score(ytest, yhat)
result_matrix = precision_recall_fscore_support(
ytest, yhat, labels=list(range(num_labels)), average=None)
# Store results
result_dict = defaultdict(dict)
for actual_label in sorted(train_and_test_label_dict):
col = train_and_test_label_dict[actual_label]
result_dict[actual_label]["Precision"] = result_matrix[0][col]
result_dict[actual_label]["Recall"] = result_matrix[1][col]
result_dict[actual_label]["F-measure"] = result_matrix[2][col]
res = (conf_mat.tolist(), overall_accuracy, result_dict,
self._model.get_params(), objective_score,
additional_scores)
return res
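# Example (illustrative, assuming ``fs_test`` is a held-out ``FeatureSet``):
# unpacking the 6-tuple returned for a classifier:
#
#     >>> (conf_matrix, accuracy, prf_dict, model_params,
#     ...  objective_score, metric_scores) = learner.evaluate(
#     ...     fs_test, grid_objective='accuracy',
#     ...     output_metrics=['unweighted_kappa'])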
def predict(self, examples, prediction_prefix=None, append=False,
class_labels=False):
"""
Uses a given model to generate predictions on a given ``FeatureSet``.
Parameters
----------
examples : skll.FeatureSet
The ``FeatureSet`` instance to predict labels for.
prediction_prefix : str, optional
If saving the predictions, this is the prefix that will be used for
the filename. It will be followed by ``"_predictions.tsv"``
Defaults to ``None``.
append : bool, optional
Should we append the current predictions to the file if it exists?
Defaults to ``False``.
class_labels : bool, optional
For a classifier, should we convert class indices to their (str) labels?
Defaults to ``False``.
Returns
-------
yhat : array-like
The predictions returned by the ``Learner`` instance.
Raises
------
MemoryError
If process runs out of memory when converting to dense.
"""
example_ids = examples.ids
# Need to do some transformations so the features are in the right
# columns for the test set. Obviously a bit hacky, but storing things
# in sparse matrices saves memory over our old list of dicts approach.
if isinstance(self.feat_vectorizer, FeatureHasher):
if (self.feat_vectorizer.n_features !=
examples.vectorizer.n_features):
self.logger.warning("There is mismatch between the training model "
"features and the data passed to predict.")
self_feat_vec_tuple = (self.feat_vectorizer.dtype,
self.feat_vectorizer.input_type,
self.feat_vectorizer.n_features,
self.feat_vectorizer.non_negative)
example_feat_vec_tuple = (examples.vectorizer.dtype,
examples.vectorizer.input_type,
examples.vectorizer.n_features,
examples.vectorizer.non_negative)
if self_feat_vec_tuple == example_feat_vec_tuple:
xtest = examples.features
else:
xtest = self.feat_vectorizer.transform(
examples.vectorizer.inverse_transform(
examples.features))
else:
if (set(self.feat_vectorizer.feature_names_) !=
set(examples.vectorizer.feature_names_)):
self.logger.warning("There is mismatch between the training model "
"features and the data passed to predict.")
if self.feat_vectorizer == examples.vectorizer:
xtest = examples.features
else:
xtest = self.feat_vectorizer.transform(
examples.vectorizer.inverse_transform(
examples.features))
# filter features based on those selected from training set
xtest = self.feat_selector.transform(xtest)
# Sampler
if self.sampler:
self.logger.warning('Sampler converts sparse matrix to dense')
if isinstance(self.sampler, SkewedChi2Sampler):
self.logger.warning('SkewedChi2Sampler uses a dense matrix')
xtest = self.sampler.fit_transform(xtest.todense())
else:
xtest = self.sampler.fit_transform(xtest)
# Convert to dense if necessary
if self._use_dense_features and not isinstance(xtest, np.ndarray):
try:
xtest = xtest.todense()
except MemoryError:
if issubclass(self._model_type, _REQUIRES_DENSE):
reason = ('{} does not support sparse ' +
'matrices.').format(self._model_type.__name__)
else:
reason = ('{} feature scaling requires a dense ' +
'matrix.').format(self._feature_scaling)
raise MemoryError('Ran out of memory when converting test ' +
'data to dense. This was required because ' +
reason)
# Scale xtest if necessary
if not issubclass(self._model_type, MultinomialNB):
xtest = self.scaler.transform(xtest)
# make the prediction on the test data
try:
yhat = (self._model.predict_proba(xtest)
if (self.probability and
not class_labels)
else self._model.predict(xtest))
except NotImplementedError as e:
self.logger.error("Model type: {}\n"
"Model: {}\n"
"Probability: {}\n".format(self._model_type.__name__,
self._model,
self.probability))
raise e
# write out the predictions if we are asked to
if prediction_prefix is not None:
prediction_file = '{}_predictions.tsv'.format(prediction_prefix)
with open(prediction_file,
"w" if not append else "a") as predictionfh:
# header
if not append:
# Output probabilities if we're asked (and able)
if self.probability:
print('\t'.join(["id"] +
[str(x) for x in self.label_list]),
file=predictionfh)
else:
print('id\tprediction', file=predictionfh)
if self.probability:
for example_id, class_probs in zip(example_ids, yhat):
print('\t'.join([str(example_id)] +
[str(x) for x in class_probs]),
file=predictionfh)
else:
if self.model_type._estimator_type == 'regressor':
for example_id, pred in zip(example_ids, yhat):
print('{0}\t{1}'.format(example_id, pred),
file=predictionfh)
else:
for example_id, pred in zip(example_ids, yhat):
print('%s\t%s' % (example_id,
self.label_list[int(pred)]),
file=predictionfh)
if (class_labels and
self.model_type._estimator_type == 'classifier'):
yhat = np.array([self.label_list[int(pred)] for pred in yhat])
return yhat
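# Example (illustrative): getting predictions back as class labels rather
# than label indices, and writing them to ``foo_predictions.tsv``:
#
#     >>> predictions = learner.predict(fs_test,
#     ...                               prediction_prefix='foo',
#     ...                               class_labels=True)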
def _compute_num_folds_from_example_counts(self, cv_folds, labels):
"""
Calculate the number of folds we should use for cross validation, based
on the number of examples we have for each label.
Parameters
----------
cv_folds : int
The number of cross-validation folds.
labels : list
The example labels.
Returns
-------
cv_folds : int
The number of folds to use, based on the number of examples
for each label.
Raises
------
AssertionError
If ``cv_folds`` is not an integer.
ValueError
If any label in the training set has one or fewer examples.
"""
assert isinstance(cv_folds, int)
# For regression models, we can just return the current cv_folds
if self.model_type._estimator_type == 'regressor':
return cv_folds
min_examples_per_label = min(Counter(labels).values())
if min_examples_per_label <= 1:
raise ValueError(('The training set has only {} example for a' +
' label.').format(min_examples_per_label))
if min_examples_per_label < cv_folds:
self.logger.warning('The minimum number of examples per label was {}. '
'Setting the number of cross-validation folds to '
'that value.'.format(min_examples_per_label))
cv_folds = min_examples_per_label
return cv_folds
def cross_validate(self,
examples,
stratified=True,
cv_folds=10,
grid_search=False,
grid_search_folds=3,
grid_jobs=None,
grid_objective='f1_score_micro',
output_metrics=[],
prediction_prefix=None,
param_grid=None,
shuffle=False,
save_cv_folds=False,
use_custom_folds_for_grid_search=True):
"""
Cross-validates a given model on the training examples.
Parameters
----------
examples : skll.FeatureSet
The ``FeatureSet`` instance to cross-validate learner performance on.
stratified : bool, optional
Should we stratify the folds to ensure an even
distribution of labels for each fold?
Defaults to ``True``.
cv_folds : int, optional
The number of folds to use for cross-validation, or
a mapping from example IDs to folds.
Defaults to 10.
grid_search : bool, optional
Should we do grid search when training each fold?
Note: This will make this take *much* longer.
Defaults to ``False``.
grid_search_folds : int or dict, optional
The number of folds to use when doing the
grid search, or a mapping from
example IDs to folds.
Defaults to 3.
grid_jobs : int, optional
The number of jobs to run in parallel when doing the
grid search. If ``None`` or 0, the number of
grid search folds will be used.
Defaults to ``None``.
grid_objective : str, optional
The name of the objective function to use when
doing the grid search.
Defaults to ``'f1_score_micro'``.
output_metrics : list of str, optional
List of additional metric names to compute in
addition to the metric used for grid search. Empty
by default.
Defaults to an empty list.
prediction_prefix : str, optional
If saving the predictions, this is the
prefix that will be used for the filename.
It will be followed by ``"_predictions.tsv"``
Defaults to ``None``.
param_grid : list of dicts, optional
The parameter grid to traverse.
Defaults to ``None``.
shuffle : bool, optional
Shuffle examples before splitting into folds for CV.
Defaults to ``False``.
save_cv_folds : bool, optional
Whether to save the CV fold IDs.
Defaults to ``False``.
use_custom_folds_for_grid_search : bool, optional
If ``cv_folds`` is a custom dictionary, but
``grid_search_folds`` is not, perhaps due to user
oversight, should the same custom dictionary
automatically be used for the inner grid-search
cross-validation?
Defaults to ``True``.
Returns
-------
results : list of 6-tuples
The confusion matrix, overall accuracy, per-label PRFs, model
parameters, objective function score, and evaluation metrics (if any)
for each fold.
grid_search_scores : list of floats
The grid search scores for each fold.
skll_fold_ids : dict
A dictionary containing the test-fold number for each id
if ``save_cv_folds`` is ``True``, otherwise ``None``.
Raises
------
ValueError
If labels are not encoded as strings.
"""
# Seed the random number generator so that randomized algorithms are
# replicable.
random_state = np.random.RandomState(123456789)
# We need to check whether the labels in the featureset are labels
# or continuous values. If it's the latter, we need to raise
# an exception since the stratified splitting in sklearn does not
# work with continuous labels. Note that although using random folds
# _will_ work, we want to raise an error in general since it's better
# to encode the labels as strings anyway.
if (self.model_type._estimator_type == 'classifier' and
type_of_target(examples.labels) not in ['binary', 'multiclass']):
raise ValueError("Floating point labels must be encoded as strings for cross-validation.")
# Shuffle so that the folds are random for the inner grid search CV.
# If grid search is True but shuffle isn't, shuffle anyway.
# You can't shuffle a scipy sparse matrix in place, so unfortunately
# we make a copy of everything (and then get rid of the old version)
if grid_search or shuffle:
if grid_search and not shuffle:
self.logger.warning('Training data will be shuffled to randomize '
'grid search folds. Shuffling may yield '
'different results compared to scikit-learn.')
ids, labels, features = sk_shuffle(examples.ids, examples.labels,
examples.features,
random_state=random_state)
examples = FeatureSet(examples.name, ids, labels=labels,
features=features,
vectorizer=examples.vectorizer)
# call train setup
self._create_label_dict(examples)
self._train_setup(examples)
# Set up the cross-validation iterator.
if isinstance(cv_folds, int):
cv_folds = self._compute_num_folds_from_example_counts(cv_folds,
examples.labels)
stratified = (stratified and
self.model_type._estimator_type == 'classifier')
if stratified:
kfold = StratifiedKFold(n_splits=cv_folds)
cv_groups = None
else:
kfold = KFold(n_splits=cv_folds, random_state=random_state)
cv_groups = None
# Otherwise cv_folds is a dict
else:
# if we have a mapping from IDs to folds, use it for the overall
# cross-validation as well as the grid search within each
# training fold. Note that this means that the grid search
# will use K-1 folds because the Kth will be the test fold for
# the outer cross-validation.
dummy_label = next(itervalues(cv_folds))
fold_groups = [cv_folds.get(curr_id, dummy_label) for curr_id in examples.ids]
# Only retain IDs within folds if they're in cv_folds
kfold = FilteredLeaveOneGroupOut(cv_folds, examples.ids)
cv_groups = fold_groups
# If we are planning to do grid search, set the grid search folds
# to be the same as the custom cv folds unless a flag is set that
# explicitly tells us not to. Note that this should only happen
# when we are using the API; otherwise the configparser should
# take care of this even before this method is called
if grid_search and use_custom_folds_for_grid_search and grid_search_folds != cv_folds:
self.logger.warning("The specified custom folds will be used for "
"the inner grid search.")
grid_search_folds = cv_folds
# Save the cross-validation fold information, if required
# The format is that the test-fold that each id appears in is stored
skll_fold_ids = None
if save_cv_folds:
skll_fold_ids = {}
for fold_num, (_, test_indices) in enumerate(kfold.split(examples.features,
examples.labels,
cv_groups)):
for index in test_indices:
skll_fold_ids[examples.ids[index]] = str(fold_num)
# handle each fold separately and accumulate the predictions and the
# numbers
results = []
grid_search_scores = []
append_predictions = False
for train_index, test_index in kfold.split(examples.features,
examples.labels,
cv_groups):
# Train model
self._model = None # prevent feature vectorizer from being reset.
train_set = FeatureSet(examples.name,
examples.ids[train_index],
labels=examples.labels[train_index],
features=examples.features[train_index],
vectorizer=examples.vectorizer)
# Set create_label_dict to False since we already created the
# label dictionary for the whole dataset above.
grid_search_score = self.train(train_set,
grid_search_folds=grid_search_folds,
grid_search=grid_search,
grid_objective=grid_objective,
param_grid=param_grid,
grid_jobs=grid_jobs,
shuffle=grid_search,
create_label_dict=False)
grid_search_scores.append(grid_search_score)
# note: there is no need to shuffle again within each fold,
# regardless of what the shuffle keyword argument is set to.
# Evaluate model
test_tuple = FeatureSet(examples.name,
examples.ids[test_index],
labels=examples.labels[test_index],
features=examples.features[test_index],
vectorizer=examples.vectorizer)
results.append(self.evaluate(test_tuple,
prediction_prefix=prediction_prefix,
append=append_predictions,
grid_objective=grid_objective,
output_metrics=output_metrics))
append_predictions = True
# return list of results for all folds
return results, grid_search_scores, skll_fold_ids
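# Example (illustrative): 10-fold cross-validation with grid search inside
# each training fold and the fold assignments saved:
#
#     >>> fold_results, grid_scores, fold_ids = learner.cross_validate(
#     ...     fs_train, cv_folds=10, grid_search=True,
#     ...     grid_objective='f1_score_micro', save_cv_folds=True)
#     >>> len(fold_results)   # one evaluation tuple per fold
#     10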
def learning_curve(self,
examples,
cv_folds=10,
train_sizes=np.linspace(0.1, 1.0, 5),
metric='f1_score_micro'):
"""
Generates learning curves for a given model on the training examples
via cross-validation. Adapted from the scikit-learn code for learning
curve generation (cf. ``sklearn.model_selection.learning_curve``).
Parameters
----------
examples : skll.FeatureSet
The ``FeatureSet`` instance to generate the learning curve on.
cv_folds : int, optional
The number of folds to use for cross-validation, or
a mapping from example IDs to folds.
Defaults to 10.
train_sizes : list of float or int, optional
Relative or absolute numbers of training examples
that will be used to generate the learning curve.
If the type is float, it is regarded as a fraction
of the maximum size of the training set (that is
determined by the selected validation method),
i.e. it has to be within (0, 1]. Otherwise it
is interpreted as absolute sizes of the training
sets. Note that for classification the number of
samples usually has to be big enough to contain
at least one sample from each class.
Defaults to ``np.linspace(0.1, 1.0, 5)``.
metric : str, optional
The name of the metric function to use
when computing the train and test scores
for the learning curve.
Defaults to ``'f1_score_micro'``.
Returns
-------
train_scores : list of float
The scores for the training set.
test_scores : list of float
The scores on the test set.
num_examples : list of int
The numbers of training examples used to generate
the curve
"""
# Seed the random number generator so that randomized algorithms are
# replicable.
random_state = np.random.RandomState(123456789)
# Call train setup before since we need to train
# the learner eventually
self._create_label_dict(examples)
self._train_setup(examples)
# Set up the cross-validation iterator with 20% of the data
# always reserved for testing
cv = ShuffleSplit(n_splits=cv_folds,
test_size=0.2,
random_state=random_state)
cv_iter = list(cv.split(examples.features, examples.labels, None))
n_max_training_samples = len(cv_iter[0][0])
# Get the _translate_train_sizes() function from scikit-learn
# since we need it to get the right list of sizes after cross-validation
_module = import_module('sklearn.model_selection._validation')
_translate_train_sizes = getattr(_module, '_translate_train_sizes')
train_sizes_abs = _translate_train_sizes(train_sizes,
n_max_training_samples)
n_unique_ticks = train_sizes_abs.shape[0]
# Create an iterator over train/test featuresets based on the
# cross-validation index iterator
featureset_iter = (FeatureSet.split_by_ids(examples, train, test) for train, test in cv_iter)
# Limit the number of parallel jobs for this
# to be no higher than the number of cores for the
# machine or MAX_CONCURRENT_PROCESSES, whichever is lower
n_jobs = min(cpu_count(), MAX_CONCURRENT_PROCESSES)
# Run jobs in parallel that train the model on each subset
# of the training data and compute train and test scores
parallel = joblib.Parallel(n_jobs=n_jobs, pre_dispatch=n_jobs)
out = parallel(joblib.delayed(_train_and_score)(self,
train_fs[:n_train_samples],
test_fs,
metric)
for train_fs, test_fs in featureset_iter
for n_train_samples in train_sizes_abs)
# Reshape the outputs
out = np.array(out)
n_cv_folds = out.shape[0] // n_unique_ticks
out = out.reshape(n_cv_folds, n_unique_ticks, 2)
out = np.asarray(out).transpose((2, 1, 0))
return list(out[0]), list(out[1]), list(train_sizes_abs)
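# Example (illustrative): computing a curve at the default five training
# sizes and averaging the per-fold scores for each size:
#
#     >>> train_scores, test_scores, num_examples = learner.learning_curve(
#     ...     fs_train, cv_folds=10, metric='accuracy')
#     >>> for size, tr, te in zip(num_examples,
#     ...                         np.mean(train_scores, axis=1),
#     ...                         np.mean(test_scores, axis=1)):
#     ...     print(size, tr, te)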