Source code for skll.learner

# License: BSD 3 clause
"""
Provides an easy-to-use wrapper around scikit-learn.

:author: Michael Heilman (mheilman@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)
:author: Dan Blanchard (dblanchard@ets.org)
:author: Aoife Cahill (acahill@ets.org)
:organization: ETS
"""
# pylint: disable=F0401,W0622,E1002,E1101

from __future__ import absolute_import, print_function, unicode_literals

import copy
import inspect
import logging
import os
import sys
from collections import Counter, defaultdict
from functools import wraps
from importlib import import_module
from multiprocessing import cpu_count

import joblib
import numpy as np
import scipy.sparse as sp
from six import iteritems, itervalues
from six import string_types
from six.moves import xrange as range
from six.moves import zip
from sklearn.model_selection import (GridSearchCV,
                                     KFold,
                                     LeaveOneGroupOut,
                                     ShuffleSplit,
                                     StratifiedKFold)
from sklearn.ensemble import (AdaBoostClassifier,
                              AdaBoostRegressor,
                              GradientBoostingClassifier,
                              GradientBoostingRegressor,
                              RandomForestClassifier,
                              RandomForestRegressor)
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_selection import SelectKBest
# AdditiveChi2Sampler is used indirectly, so ignore linting message
from sklearn.kernel_approximation import (AdditiveChi2Sampler,
                                          Nystroem,
                                          RBFSampler,
                                          SkewedChi2Sampler)
from sklearn.linear_model import (ElasticNet, Lasso, LinearRegression,
                                  LogisticRegression, Ridge, SGDClassifier,
                                  SGDRegressor)
from sklearn.linear_model.base import LinearModel
from sklearn.metrics import (accuracy_score,
                             confusion_matrix,
                             precision_recall_fscore_support,
                             SCORERS)
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, SVC, LinearSVR, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils import shuffle as sk_shuffle

from skll.data import FeatureSet
from skll.metrics import _CORRELATION_METRICS, use_score_func
from skll.version import VERSION

# Constants #
_DEFAULT_PARAM_GRIDS = {AdaBoostClassifier:
                        [{'learning_rate': [0.01, 0.1, 1.0, 10.0, 100.0]}],
                        AdaBoostRegressor:
                        [{'learning_rate': [0.01, 0.1, 1.0, 10.0, 100.0]}],
                        DecisionTreeClassifier:
                        [{'max_features': ["auto", None]}],
                        DecisionTreeRegressor:
                        [{'max_features': ["auto", None]}],
                        ElasticNet:
                        [{'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}],
                        GradientBoostingClassifier:
                        [{'max_depth': [1, 3, 5]}],
                        GradientBoostingRegressor:
                        [{'max_depth': [1, 3, 5]}],
                        KNeighborsClassifier:
                        [{'n_neighbors': [1, 5, 10, 100],
                          'weights': ['uniform', 'distance']}],
                        KNeighborsRegressor:
                        [{'n_neighbors': [1, 5, 10, 100],
                          'weights': ['uniform', 'distance']}],
                        Lasso:
                        [{'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}],
                        LinearRegression:
                        [{}],
                        LinearSVC:
                        [{'C': [0.01, 0.1, 1.0, 10.0, 100.0]}],
                        LogisticRegression:
                        [{'C': [0.01, 0.1, 1.0, 10.0, 100.0]}],
                        SVC: [{'C': [0.01, 0.1, 1.0, 10.0, 100.0],
                               'gamma': ['auto', 0.01, 0.1, 1.0, 10.0, 100.0]}],
                        MultinomialNB:
                        [{'alpha': [0.1, 0.25, 0.5, 0.75, 1.0]}],
                        RandomForestClassifier:
                        [{'max_depth': [1, 5, 10, None]}],
                        RandomForestRegressor:
                        [{'max_depth': [1, 5, 10, None]}],
                        Ridge:
                        [{'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}],
                        SGDClassifier:
                        [{'alpha': [0.000001, 0.00001, 0.0001, 0.001, 0.01],
                          'penalty': ['l1', 'l2', 'elasticnet']}],
                        SGDRegressor:
                        [{'alpha': [0.000001, 0.00001, 0.0001, 0.001, 0.01],
                          'penalty': ['l1', 'l2', 'elasticnet']}],
                        LinearSVR:
                        [{'C': [0.01, 0.1, 1.0, 10.0, 100.0]}],
                        SVR:
                        [{'C': [0.01, 0.1, 1.0, 10.0, 100.0],
                          'gamma': ['auto', 0.01, 0.1, 1.0, 10.0, 100.0]}]}


# list of valid grid objective functions for regression and classification
# models depending on type of labels

_BINARY_CLASS_OBJ_FUNCS = frozenset(['unweighted_kappa',
                                     'linear_weighted_kappa',
                                     'quadratic_weighted_kappa',
                                     'uwk_off_by_one',
                                     'lwk_off_by_one',
                                     'qwk_off_by_one',
                                     'kendall_tau',
                                     'pearson',
                                     'spearman'])

_REGRESSION_ONLY_OBJ_FUNCS = frozenset(['r2',
                                        'neg_mean_squared_error'])

_CLASSIFICATION_ONLY_OBJ_FUNCS = frozenset(['accuracy',
                                            'precision',
                                            'recall',
                                            'f1',
                                            'f1_score_micro',
                                            'f1_score_macro',
                                            'f1_score_weighted',
                                            'f1_score_least_frequent',
                                            'average_precision',
                                            'roc_auc'])

_INT_CLASS_OBJ_FUNCS = frozenset(['unweighted_kappa',
                                  'linear_weighted_kappa',
                                  'quadratic_weighted_kappa',
                                  'uwk_off_by_one',
                                  'lwk_off_by_one',
                                  'qwk_off_by_one'])

_REQUIRES_DENSE = (GradientBoostingClassifier, GradientBoostingRegressor)

MAX_CONCURRENT_PROCESSES = int(os.getenv('SKLL_MAX_CONCURRENT_PROCESSES', '5'))


# pylint: disable=W0223,R0903
class FilteredLeaveOneGroupOut(LeaveOneGroupOut):

    """
    Version of LeaveOneGroupOut cross-validation iterator that only outputs
    indices of instances with IDs in a prespecified set.
    """

    def __init__(self, keep, example_ids):
        super(FilteredLeaveOneGroupOut, self).__init__()
        self.keep = keep
        self.example_ids = example_ids
        self._warned = False
        self.logger = logging.getLogger(__name__)

    def split(self, X, y, groups):
        for train_index, test_index in super(FilteredLeaveOneGroupOut,
                                             self).split(X, y, groups):
            train_len = len(train_index)
            test_len = len(test_index)
            train_index = [i for i in train_index if
                           self.example_ids[i] in self.keep]
            test_index = [i for i in test_index if
                          self.example_ids[i] in self.keep]
            if not self._warned and (train_len != len(train_index) or
                                     test_len != len(test_index)):
                self.logger.warning('Feature set contains IDs that are not ' +
                                    'in folds dictionary. Skipping those IDs.')
                self._warned = True

            yield train_index, test_index
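
# Illustrative sketch (not part of the module): how FilteredLeaveOneGroupOut
# filters out instances whose IDs are missing from the folds dictionary.
# The IDs, fold labels, and data below are made up.
#
#     >>> import numpy as np
#     >>> ids = ['ex1', 'ex2', 'ex3', 'ex4']
#     >>> X, y = np.zeros((4, 2)), np.array([0, 1, 0, 1])
#     >>> folds = {'ex1': 'a', 'ex2': 'b', 'ex3': 'a'}    # 'ex4' has no fold
#     >>> splitter = FilteredLeaveOneGroupOut(folds, ids)
#     >>> groups = [folds.get(i, 'a') for i in ids]
#     >>> for train, test in splitter.split(X, y, groups):
#     ...     pass    # index of 'ex4' is dropped from both train and test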
def _find_default_param_grid(cls):
    """
    Finds the default parameter grid for the specified classifier.
    """
    for key_cls, grid in _DEFAULT_PARAM_GRIDS.items():
        if issubclass(cls, key_cls):
            return grid
    return None


def _import_custom_learner(custom_learner_path, custom_learner_name):
    """
    Does the gruntwork of adding the custom model's module to globals.
    """
    if not custom_learner_path:
        raise ValueError('custom_learner_path was not set and learner {} '
                         'was not found.'.format(custom_learner_name))

    if not custom_learner_path.endswith('.py'):
        raise ValueError('custom_learner_path must end in .py ({})'
                         .format(custom_learner_path))

    custom_learner_module_name = os.path.basename(custom_learner_path)[:-3]
    sys.path.append(os.path.dirname(os.path.abspath(custom_learner_path)))
    import_module(custom_learner_module_name)
    globals()[custom_learner_name] = \
        getattr(sys.modules[custom_learner_module_name], custom_learner_name)


def _train_and_score(learner, train_examples, test_examples,
                     objective='f1_score_micro'):
    """
    A utility method to train a given learner instance on the given training
    examples, generate predictions on the training set itself and also the
    given test set, and score those predictions using the given objective
    function. The method returns the train and test scores.

    Note that this method needs to be a top-level function since it is called
    from within ``joblib.Parallel()`` and, therefore, needs to be picklable,
    which it would not be as an instance method of the Learner class.
    """

    _ = learner.train(train_examples, grid_search=False, shuffle=False)

    train_predictions = learner.predict(train_examples)
    test_predictions = learner.predict(test_examples)

    if learner.model_type._estimator_type == 'classifier':
        test_label_list = np.unique(test_examples.labels).tolist()
        unseen_test_label_list = [label for label in test_label_list
                                  if label not in learner.label_list]
        unseen_label_dict = {label: i for i, label in
                             enumerate(unseen_test_label_list,
                                       start=len(learner.label_list))}

        # combine the two dictionaries
        train_and_test_label_dict = learner.label_dict.copy()
        train_and_test_label_dict.update(unseen_label_dict)
        train_labels = np.array([train_and_test_label_dict[label]
                                 for label in train_examples.labels])
        test_labels = np.array([train_and_test_label_dict[label]
                                for label in test_examples.labels])
    else:
        train_labels = train_examples.labels
        test_labels = test_examples.labels

    train_score = use_score_func(objective, train_labels, train_predictions)
    test_score = use_score_func(objective, test_labels, test_predictions)
    return train_score, test_score


def _predict_binary(self, X):
    """
    Little helper function to allow us to use `GridSearchCV` with objective
    functions like Kendall's tau for binary classification problems (where the
    probability of the true class is used as the input to the objective
    function).

    This only works if we've also taken the step of storing the old predict
    function for `self` as `predict_normal`. It's kind of a hack, but it saves
    us from having to override GridSearchCV to change one little line.

    :param self: A scikit-learn classifier instance
    :param X: A set of examples to predict values for.
    :type X: array
    """

    if self.coef_.shape[0] == 1:
        res = self.predict_proba(X)[:, 1]
    else:
        res = self.predict_normal(X)
    return res
class SelectByMinCount(SelectKBest):

    """
    Select features occurring in more than a specified number of examples in
    the training data (or a CV training fold).
    """

    def __init__(self, min_count=1):
        self.min_count = min_count
        self.scores_ = None

    def fit(self, X, y=None):
        # initialize a list of counts of times each feature appears
        col_counts = [0 for _ in range(X.shape[1])]

        if sp.issparse(X):
            # find() is scipy.sparse's equivalent of nonzero()
            _, col_indices, _ = sp.find(X)
        else:
            # assume it's a numpy array (not a numpy matrix)
            col_indices = X.nonzero()[1].tolist()

        for i in col_indices:
            col_counts[i] += 1

        self.scores_ = np.array(col_counts)
        return self

    def _get_support_mask(self):
        """
        Returns an indication of which features to keep. Adapted from
        SelectKBest.
        """
        mask = np.zeros(self.scores_.shape, dtype=bool)
        mask[self.scores_ >= self.min_count] = True
        return mask
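
# Illustrative sketch (not part of the module): SelectByMinCount keeps only
# the columns with at least ``min_count`` nonzero entries. Toy data below.
#
#     >>> import numpy as np
#     >>> X = np.array([[1, 0, 2],
#     ...               [3, 0, 0],
#     ...               [0, 0, 4]])
#     >>> mask = SelectByMinCount(min_count=2).fit(X)._get_support_mask()
#     >>> list(mask)    # -> [True, False, True]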
def rescaled(cls):
    """
    Decorator to create regressors that store a min and a max for the training
    data and make sure that predictions fall within that range. It also stores
    the means and SDs of the gold standard and the predictions on the training
    set to rescale the predictions (e.g., as in e-rater).

    :param cls: A regressor to add rescaling to.
    :type cls: BaseEstimator

    :returns: Modified version of class with rescaled functions added.
    """
    # If this class has already been run through the decorator, return it
    if hasattr(cls, 'rescale'):
        return cls

    # Save original versions of functions to use later.
    orig_init = cls.__init__
    orig_fit = cls.fit
    orig_predict = cls.predict

    if cls._estimator_type == 'classifier':
        raise ValueError('Classifiers cannot be rescaled. ' +
                         'Only regressors can.')

    # Define all new versions of functions
    @wraps(cls.fit)
    def fit(self, X, y=None):
        """
        Fit a model, then store the mean, SD, max and min of the training set
        and the mean and SD of the predictions on the training set.
        """

        # fit a regular regression model
        orig_fit(self, X, y=y)

        if self.constrain:
            # also record the training data min and max
            self.y_min = min(y)
            self.y_max = max(y)

        if self.rescale:
            # also record the means and SDs for the training set
            y_hat = orig_predict(self, X)
            self.yhat_mean = np.mean(y_hat)
            self.yhat_sd = np.std(y_hat)
            self.y_mean = np.mean(y)
            self.y_sd = np.std(y)

        return self

    @wraps(cls.predict)
    def predict(self, X):
        """
        Make predictions with the super class, and then adjust them using the
        stored min, max, means, and standard deviations.
        """
        # get the unconstrained predictions
        res = orig_predict(self, X)

        if self.rescale:
            # convert the predictions to z-scores,
            # then rescale to match the training set distribution
            res = (((res - self.yhat_mean) / self.yhat_sd)
                   * self.y_sd) + self.y_mean

        if self.constrain:
            # apply min and max constraints
            res = np.array([max(self.y_min, min(self.y_max, pred))
                            for pred in res])

        return res

    @classmethod
    @wraps(cls._get_param_names)
    def _get_param_names(class_x):
        """
        This is adapted from scikit-learn's BaseEstimator class.
        It gets the kwargs for the superclass's init method and adds the
        kwargs for the newly added __init__ method.
        """
        try:
            init = getattr(orig_init, 'deprecated_original', orig_init)

            args, varargs, _, _ = inspect.getargspec(init)
            if varargs is not None:
                raise RuntimeError('scikit-learn estimators should always '
                                   'specify their parameters in the signature'
                                   ' of their init (no varargs).')
            # Remove 'self'
            args.pop(0)
        except TypeError:
            args = []

        rescale_args = inspect.getargspec(class_x.__init__)[0]
        # Remove 'self'
        rescale_args.pop(0)

        args += rescale_args
        args.sort()

        return args

    @wraps(cls.__init__)
    def init(self, constrain=True, rescale=True, **kwargs):
        """
        This special init function is used by the decorator to make sure that
        things get initialized in the right order.
        """
        # pylint: disable=W0201
        self.constrain = constrain
        self.rescale = rescale
        self.y_min = None
        self.y_max = None
        self.yhat_mean = None
        self.yhat_sd = None
        self.y_mean = None
        self.y_sd = None
        orig_init(self, **kwargs)

    # Override original functions with new ones
    cls.__init__ = init
    cls.fit = fit
    cls.predict = predict
    cls._get_param_names = _get_param_names
    cls.rescale = True

    # Return modified class
    return cls
# Rescaled regressors
@rescaled
class RescaledAdaBoostRegressor(AdaBoostRegressor):
    pass


@rescaled
class RescaledDecisionTreeRegressor(DecisionTreeRegressor):
    pass


@rescaled
class RescaledElasticNet(ElasticNet):
    pass


@rescaled
class RescaledGradientBoostingRegressor(GradientBoostingRegressor):
    pass


@rescaled
class RescaledKNeighborsRegressor(KNeighborsRegressor):
    pass


@rescaled
class RescaledLasso(Lasso):
    pass


@rescaled
class RescaledLinearRegression(LinearRegression):
    pass


@rescaled
class RescaledRandomForestRegressor(RandomForestRegressor):
    pass


@rescaled
class RescaledRidge(Ridge):
    pass


@rescaled
class RescaledSVR(SVR):
    pass


@rescaled
class RescaledLinearSVR(LinearSVR):
    pass


@rescaled
class RescaledSGDRegressor(SGDRegressor):
    pass
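
# Illustrative sketch (not part of the module): a rescaled regressor accepts
# the extra ``constrain``/``rescale`` keyword arguments added by the decorator
# in addition to the underlying estimator's own parameters, and its
# predictions are clipped to the training-label range. Toy data below.
#
#     >>> import numpy as np
#     >>> X = np.random.RandomState(0).rand(20, 3)
#     >>> y = X.sum(axis=1)
#     >>> reg = RescaledRidge(alpha=1.0, constrain=True, rescale=True)
#     >>> preds = reg.fit(X, y).predict(X)
#     >>> all(y.min() <= p <= y.max() for p in preds)    # -> True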
class Learner(object):

    """
    A simpler learner interface around many scikit-learn classification
    and regression functions.

    :param model_type: Type of estimator to create (e.g., LogisticRegression).
                       See the skll package documentation for valid options.
    :type model_type: str
    :param probability: Should learner return probabilities of all labels
                        (instead of just label with highest probability)?
    :type probability: bool
    :param feature_scaling: how to scale the features, if at all. Options are:
                    'with_std': scale features using the standard deviation,
                    'with_mean': center features using the mean,
                    'both': do both scaling as well as centering,
                    'none': do neither scaling nor centering
    :type feature_scaling: str
    :param model_kwargs: A dictionary of keyword arguments to pass to the
                         initializer for the specified model.
    :type model_kwargs: dict
    :param pos_label_str: The string for the positive label in the binary
                          classification setting. Otherwise, an arbitrary
                          label is picked.
    :type pos_label_str: str
    :param min_feature_count: The minimum number of examples a feature must
                              have a nonzero value in to be included.
    :type min_feature_count: int
    :param sampler: The sampler to use for kernel approximation, if desired.
                    Valid values are: ``'AdditiveChi2Sampler'``,
                    ``'Nystroem'``, ``'RBFSampler'``, and
                    ``'SkewedChi2Sampler'``.
    :type sampler: str
    :param sampler_kwargs: A dictionary of keyword arguments to pass to the
                           initializer for the specified sampler.
    :type sampler_kwargs: dict
    :param custom_learner_path: Path to module where a custom classifier is
                                defined.
    :type custom_learner_path: str
    """

    def __init__(self, model_type, probability=False, feature_scaling='none',
                 model_kwargs=None, pos_label_str=None, min_feature_count=1,
                 sampler=None, sampler_kwargs=None, custom_learner_path=None):
        """
        Initializes a learner object with the specified settings.
        """
        super(Learner, self).__init__()

        self.feat_vectorizer = None
        self.scaler = None
        self.label_dict = None
        self.label_list = None
        self.pos_label_str = pos_label_str
        self._model = None
        self._feature_scaling = feature_scaling
        self.feat_selector = None
        self._min_feature_count = min_feature_count
        self._model_kwargs = {}
        self._sampler_kwargs = {}

        if model_type not in globals():
            # here, we need to import the custom model and add it
            # to the appropriate lists of models.
            _import_custom_learner(custom_learner_path, model_type)
            model_class = globals()[model_type]

            default_param_grid = (model_class.default_param_grid()
                                  if hasattr(model_class, 'default_param_grid')
                                  else [{}])

            # ewww, globals :-(
            global _REQUIRES_DENSE

            _DEFAULT_PARAM_GRIDS.update({model_class: default_param_grid})
            if hasattr(model_class, 'requires_dense') and \
                    model_class.requires_dense():
                _REQUIRES_DENSE = _REQUIRES_DENSE + (model_class,)

        self._model_type = globals()[model_type]
        self._probability = None
        # Use setter to set self.probability
        self.probability = probability
        self._use_dense_features = \
            (issubclass(self._model_type, _REQUIRES_DENSE) or
             self._feature_scaling in {'with_mean', 'both'})

        # Set default keyword arguments for models that we have some for.
        if issubclass(self._model_type, SVC):
            self._model_kwargs['cache_size'] = 1000
            self._model_kwargs['probability'] = self.probability
            if self.probability:
                logger = logging.getLogger(__name__)
                logger.warning('Because LibSVM does an internal ' +
                               'cross-validation to produce probabilities, ' +
                               'results will not be exactly replicable when ' +
                               'using SVC and probability mode.')
        elif issubclass(self._model_type,
                        (RandomForestClassifier, RandomForestRegressor,
                         GradientBoostingClassifier,
                         GradientBoostingRegressor, AdaBoostClassifier,
                         AdaBoostRegressor)):
            self._model_kwargs['n_estimators'] = 500
        elif issubclass(self._model_type, SVR):
            self._model_kwargs['cache_size'] = 1000
        elif issubclass(self._model_type, SGDClassifier):
            self._model_kwargs['loss'] = 'log'

        if issubclass(self._model_type,
                      (RandomForestClassifier, LinearSVC, LogisticRegression,
                       DecisionTreeClassifier, GradientBoostingClassifier,
                       GradientBoostingRegressor, DecisionTreeRegressor,
                       RandomForestRegressor, SGDClassifier, SGDRegressor,
                       AdaBoostRegressor, AdaBoostClassifier, LinearSVR,
                       Lasso, Ridge, ElasticNet, SVC)):
            self._model_kwargs['random_state'] = 123456789

        if sampler_kwargs:
            self._sampler_kwargs.update(sampler_kwargs)
        if sampler:
            sampler_type = globals()[sampler]
            if issubclass(sampler_type, (Nystroem, RBFSampler,
                                         SkewedChi2Sampler)):
                self._sampler_kwargs['random_state'] = 123456789
            self.sampler = sampler_type(**self._sampler_kwargs)
        else:
            self.sampler = None

        if model_kwargs:
            # if the model is an AdaBoost classifier or regressor, then we
            # need to convert any specified `base_estimator` (a string)
            # into an object before passing it in to the learner constructor.
            # we also need to make sure that if the base estimator is
            # anything other than MultinomialNB, we set the random state
            # to a fixed seed such that results are replicable
            if issubclass(self._model_type,
                          (AdaBoostRegressor, AdaBoostClassifier)) and \
                    'base_estimator' in model_kwargs:
                base_estimator_name = model_kwargs['base_estimator']
                base_estimator_kwargs = ({} if base_estimator_name in
                                         ['MultinomialNB', 'SVR'] else
                                         {'random_state': 123456789})
                base_estimator = globals()[base_estimator_name](**base_estimator_kwargs)
                model_kwargs['base_estimator'] = base_estimator
            self._model_kwargs.update(model_kwargs)

    @classmethod
    def from_file(cls, learner_path):
        """
        :returns: New instance of Learner from the pickle at the specified
                  path.
        """
        skll_version, learner = joblib.load(learner_path)

        # For backward compatibility, convert string model types to classes.
        if isinstance(learner._model_type, string_types):
            learner._model_type = globals()[learner._model_type]

        # Check that we've actually loaded a Learner (or sub-class)
        if not isinstance(learner, cls):
            raise ValueError(('The pickle stored at {} does not contain ' +
                              'a {} object.').format(learner_path, cls))
        # Check that versions are compatible. (Currently, this just checks
        # that the model was created with at least version 0.9.17 of SKLL.)
        elif skll_version >= (0, 9, 17):
            if not hasattr(learner, 'sampler'):
                learner.sampler = None
            # From v0.17.0 onwards, scikit-learn requires all scalers to have
            # the `scale_` instead of the `std_` parameter. So, we need to
            # make all old models adapt to this.
            if hasattr(learner, 'scaler'):
                new_scaler = copy.copy(learner.scaler)
                # We need to use `__dict__` because the `std_` has been
                # overridden to just return the `scale_` value, and we
                # need the original value of `std_`.
                if (not hasattr(new_scaler, 'scale_') and
                        'std_' in new_scaler.__dict__):
                    new_scaler.scale_ = new_scaler.__dict__['std_']
                    learner.scaler = new_scaler
            return learner
        else:
            raise ValueError(("{} stored in pickle file {} was " +
                              "created with version {} of SKLL, which is " +
                              "incompatible with the current version " +
                              "{}").format(cls, learner_path,
                                           '.'.join(skll_version),
                                           '.'.join(VERSION)))
    @property
    def model_type(self):
        """ The model type (i.e., the class) """
        return self._model_type

    @property
    def model_kwargs(self):
        """
        A dictionary of the underlying scikit-learn model's keyword arguments
        """
        return self._model_kwargs

    @property
    def model(self):
        """ The underlying scikit-learn model """
        return self._model
    def load(self, learner_path):
        """
        Replace the current learner instance with a saved learner.

        :param learner_path: The path to the file to load.
        :type learner_path: str
        """
        del self.__dict__
        self.__dict__ = Learner.from_file(learner_path).__dict__
    @property
    def model_params(self):
        """
        Model parameters (i.e., weights) for ``LinearModel`` (e.g., ``Ridge``)
        regression and liblinear models.

        :returns: Labeled weights and (labeled if more than one) intercept
                  value(s)
        :rtype: tuple of (``weights``, ``intercepts``), where ``weights`` and
                ``intercepts`` are both dicts
        """
        res = {}
        intercept = None
        if (isinstance(self._model, LinearModel) or
                (isinstance(self._model, SVR) and
                 self._model.kernel == 'linear') or
                isinstance(self._model, SGDRegressor)):
            # also includes RescaledRidge, RescaledSVR, RescaledSGDRegressor
            coef = self.model.coef_
            intercept = {'_intercept_': self.model.intercept_}

            # convert SVR coefficient format (1 x matrix) to array
            if isinstance(self._model, SVR):
                coef = coef.toarray()[0]

            # inverse transform to get indices for before feature selection
            coef = coef.reshape(1, -1)
            coef = self.feat_selector.inverse_transform(coef)[0]
            for feat, idx in iteritems(self.feat_vectorizer.vocabulary_):
                if coef[idx]:
                    res[feat] = coef[idx]

        elif isinstance(self._model, LinearSVC) or \
                isinstance(self._model, LogisticRegression):
            label_list = self.label_list

            # if there are only two labels, scikit-learn will only have one
            # set of parameters and they will be associated with label 1 (not
            # 0)
            if len(self.label_list) == 2:
                label_list = self.label_list[-1:]

            for i, label in enumerate(label_list):
                coef = self.model.coef_[i]
                coef = self.feat_selector.inverse_transform(coef)[0]
                for feat, idx in iteritems(self.feat_vectorizer.vocabulary_):
                    if coef[idx]:
                        res['{}\t{}'.format(label, feat)] = coef[idx]

            if isinstance(self.model.intercept_, float):
                intercept = {'_intercept_': self.model.intercept_}
            elif self.model.intercept_.any():
                intercept = dict(zip(label_list, self.model.intercept_))

        else:
            # not supported
            raise ValueError(("{} is not supported by" +
                              " model_params with its current settings."
                              ).format(self._model_type))

        return res, intercept

    @property
    def probability(self):
        """
        Should learner return probabilities of all labels (instead of just
        label with highest probability)?
        """
        return self._probability

    @probability.setter
    def probability(self, value):
        # LinearSVC doesn't support predict_proba
        self._probability = value
        if not hasattr(self.model_type, "predict_proba") and value:
            logger = logging.getLogger(__name__)
            logger.warning(("probability was set to True, but {} does not "
                            "have a predict_proba() method.")
                           .format(self.model_type))
            self._probability = False
    def save(self, learner_path):
        """
        Save the learner to a file.

        :param learner_path: The path to where you want to save the learner.
        :type learner_path: str
        """
        # create the directory if it doesn't exist
        learner_dir = os.path.dirname(learner_path)
        if not os.path.exists(learner_dir):
            os.makedirs(learner_dir)
        # write out the files
        joblib.dump((VERSION, self), learner_path)
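
    # Illustrative sketch (not part of the module): save() and from_file()
    # round-trip a trained learner through joblib. The FeatureSet ``train_fs``
    # and the path below are hypothetical.
    #
    #     >>> learner = Learner('LogisticRegression')
    #     >>> _ = learner.train(train_fs, grid_search=False)
    #     >>> learner.save('/tmp/models/LogisticRegression.model')
    #     >>> restored = Learner.from_file('/tmp/models/LogisticRegression.model')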
    def _create_estimator(self):
        """
        :returns: A tuple containing an instantiation of the requested
                  estimator, and a parameter grid to search.
        """
        estimator = None
        default_param_grid = _find_default_param_grid(self._model_type)
        if default_param_grid is None:
            raise ValueError("%s is not a valid learner type." %
                             (self._model_type,))

        estimator = self._model_type(**self._model_kwargs)

        return estimator, default_param_grid

    def _check_input_formatting(self, examples):
        """
        Check that the examples are properly formatted.
        """
        # Make sure the labels for a regression task are not strings.
        if self.model_type._estimator_type == 'regressor':
            for label in examples.labels:
                if isinstance(label, string_types):
                    raise TypeError("You are doing regression with string "
                                    "labels. Convert them to integers or "
                                    "floats.")

        # make sure that feature values are not strings
        for val in examples.features.data:
            if isinstance(val, string_types):
                raise TypeError("You have feature values that are strings. "
                                "Convert them to floats.")

    @staticmethod
    def _check_max_feature_value(feat_array):
        """
        Check if the maximum absolute value of any feature is too large
        """
        max_feat_abs = np.max(np.abs(feat_array.data))
        if max_feat_abs > 1000.0:
            logger = logging.getLogger(__name__)
            logger.warning(("You have a feature with a very large absolute " +
                            "value (%s). That may cause the learning " +
                            "algorithm to crash or perform " +
                            "poorly."), max_feat_abs)

    def _create_label_dict(self, examples):
        """
        Creates a dictionary of labels for classification problems.

        :param examples: The examples to use for training.
        :type examples: FeatureSet
        """
        # We don't need to do this for regression models, so return.
        if self.model_type._estimator_type == 'regressor':
            return

        # extract list of unique labels if we are doing classification
        self.label_list = np.unique(examples.labels).tolist()

        # if one label is specified as the positive class, make sure it's
        # last
        if self.pos_label_str:
            self.label_list = sorted(self.label_list,
                                     key=lambda x: (x == self.pos_label_str,
                                                    x))

        # Given a list of all labels in the dataset and a list of the unique
        # labels in the set, convert the first list to an array of numbers.
        self.label_dict = {label: i for i, label in enumerate(self.label_list)}

    def _train_setup(self, examples):
        """
        Set up the feature vectorizer and the scaler.

        :param examples: The examples to use for training.
        :type examples: FeatureSet
        """
        # Check feature values and labels
        self._check_input_formatting(examples)

        # Create feature name -> value mapping
        self.feat_vectorizer = examples.vectorizer

        # initialize feature selector
        self.feat_selector = SelectByMinCount(
            min_count=self._min_feature_count)

        # Create scaler if we weren't passed one and it's necessary
        if not issubclass(self._model_type, MultinomialNB):
            if self._feature_scaling != 'none':
                scale_with_mean = self._feature_scaling in {'with_mean',
                                                            'both'}
                scale_with_std = self._feature_scaling in {'with_std', 'both'}
                self.scaler = StandardScaler(copy=True,
                                             with_mean=scale_with_mean,
                                             with_std=scale_with_std)
            else:
                # This is done to prevent any modification of feature values
                # using a dummy transformation
                self.scaler = StandardScaler(copy=False,
                                             with_mean=False,
                                             with_std=False)
    def train(self, examples, param_grid=None, grid_search_folds=3,
              grid_search=True, grid_objective='f1_score_micro',
              grid_jobs=None, shuffle=False, create_label_dict=True):
        """
        Train a model on the given examples, optionally tuning its
        hyperparameters via grid search, and return the best grid search
        objective score.

        :param examples: The examples to train the model on.
        :type examples: FeatureSet
        :param param_grid: The parameter grid to search through for grid
                           search. If unspecified, a default parameter grid
                           will be used.
        :type param_grid: list of dicts mapping from strs to
                          lists of parameter values
        :param grid_search_folds: The number of folds to use when doing the
                                  grid search, or a mapping from
                                  example IDs to folds.
        :type grid_search_folds: int or dict
        :param grid_search: Should we do grid search?
        :type grid_search: bool
        :param grid_objective: The objective function to use when doing the
                               grid search.
        :type grid_objective: function
        :param grid_jobs: The number of jobs to run in parallel when doing the
                          grid search. If unspecified or 0, the number of
                          grid search folds will be used.
        :type grid_jobs: int
        :param shuffle: Shuffle examples (e.g., for grid search CV.)
        :type shuffle: bool
        :param create_label_dict: Should we create the label dictionary? This
                                  dictionary is used to map between string
                                  labels and their corresponding numerical
                                  values. This should only be done once per
                                  experiment, so when ``cross_validate`` calls
                                  ``train``, ``create_label_dict`` gets set to
                                  ``False``.
        :type create_label_dict: bool

        :return: The best grid search objective function score, or 0 if we're
                 not doing grid search.
        :rtype: float
        """
        logger = logging.getLogger(__name__)

        # if we are asked to do grid search, check that the grid objective
        # function is valid for the selected learner
        if grid_search:
            if self.model_type._estimator_type == 'regressor':
                # types 2-4 are valid for all regression models
                if grid_objective in _CLASSIFICATION_ONLY_OBJ_FUNCS:
                    raise ValueError("{} is not a valid grid objective "
                                     "function for the {} learner"
                                     .format(grid_objective,
                                             self._model_type))
            elif grid_objective not in _CLASSIFICATION_ONLY_OBJ_FUNCS:
                # This is a classifier. Valid objective functions depend on
                # type of label (int, string, binary)
                if issubclass(examples.labels.dtype.type, int):
                    # If they're ints, class 1 and 2 are valid for classifiers
                    if grid_objective not in _INT_CLASS_OBJ_FUNCS:
                        raise ValueError("{} is not a valid grid objective "
                                         "function for the {} learner with "
                                         "integer labels"
                                         .format(grid_objective,
                                                 self._model_type))
                elif issubclass(examples.labels.dtype.type, str):
                    # if all of the labels are strings, only class 1
                    # objectives are valid (with a classifier).
                    raise ValueError("{} is not a valid grid objective "
                                     "function for the {} learner with string "
                                     "labels".format(grid_objective,
                                                     self._model_type))
                elif len(set(examples.labels)) == 2:
                    # If there are two labels, class 3 objectives are valid
                    # for classifiers regardless of the type of the label.
                    if grid_objective not in _BINARY_CLASS_OBJ_FUNCS:
                        raise ValueError("{} is not a valid grid objective "
                                         "function for the {} learner with "
                                         "binary labels"
                                         .format(grid_objective,
                                                 self._model_type))
                elif grid_objective in _REGRESSION_ONLY_OBJ_FUNCS:
                    # simple backoff check for mixed-type labels
                    raise ValueError("{} is not a valid grid objective "
                                     "function for the {} learner"
                                     .format(grid_objective,
                                             self._model_type))

        # Shuffle so that the folds are random for the inner grid search CV.
        # If grid search is True but shuffle isn't, shuffle anyway.
        # You can't shuffle a scipy sparse matrix in place, so unfortunately
        # we make a copy of everything (and then get rid of the old version)
        if grid_search or shuffle:
            if grid_search and not shuffle:
                logger.warning('Training data will be shuffled to randomize '
                               'grid search folds. Shuffling may yield '
                               'different results compared to scikit-learn.')
            ids, labels, features = sk_shuffle(examples.ids, examples.labels,
                                               examples.features,
                                               random_state=123456789)
            examples = FeatureSet(examples.name, ids, labels=labels,
                                  features=features,
                                  vectorizer=examples.vectorizer)

        # call train setup to set up the vectorizer, the label dict, and the
        # scaler
        if create_label_dict:
            self._create_label_dict(examples)
        self._train_setup(examples)

        # select features
        xtrain = self.feat_selector.fit_transform(examples.features)

        # Convert to dense if necessary
        if self._use_dense_features:
            try:
                xtrain = xtrain.todense()
            except MemoryError:
                if issubclass(self._model_type, _REQUIRES_DENSE):
                    reason = ('{} does not support sparse ' +
                              'matrices.').format(self._model_type)
                else:
                    reason = ('{} feature scaling requires a dense ' +
                              'matrix.').format(self._feature_scaling)
                raise MemoryError('Ran out of memory when converting training '
                                  'data to dense. This was required because ' +
                                  reason)

        if isinstance(self.feat_vectorizer, FeatureHasher) and \
                issubclass(self._model_type, MultinomialNB):
            raise ValueError('Cannot use FeatureHasher with MultinomialNB, '
                             'because MultinomialNB cannot handle negative '
                             'feature values.')

        # Scale features if necessary
        if not issubclass(self._model_type, MultinomialNB):
            xtrain = self.scaler.fit_transform(xtrain)

        # check whether any feature values are too large
        self._check_max_feature_value(xtrain)

        # Sampler
        if self.sampler:
            logger.warning('Sampler converts sparse matrix to dense')
            if isinstance(self.sampler, SkewedChi2Sampler):
                logger.warning('SkewedChi2Sampler uses a dense matrix')
                xtrain = self.sampler.fit_transform(xtrain.todense())
            else:
                xtrain = self.sampler.fit_transform(xtrain)

        # Instantiate an estimator and get the default parameter grid to
        # search
        estimator, default_param_grid = self._create_estimator()

        # use label dict transformed version of examples.labels if doing
        # classification
        if self.model_type._estimator_type == 'classifier':
            labels = np.array([self.label_dict[label] for label in
                               examples.labels])
        else:
            labels = examples.labels

        # set up a grid searcher if we are asked to
        if grid_search:
            # set up grid search folds
            if isinstance(grid_search_folds, int):
                grid_search_folds = \
                    self._compute_num_folds_from_example_counts(
                        grid_search_folds, labels)

                if not grid_jobs:
                    grid_jobs = grid_search_folds
                else:
                    grid_jobs = min(grid_search_folds, grid_jobs)
                folds = grid_search_folds
            else:
                # use the number of unique fold IDs as the number of grid jobs
                if not grid_jobs:
                    grid_jobs = len(np.unique(grid_search_folds))
                else:
                    grid_jobs = min(len(np.unique(grid_search_folds)),
                                    grid_jobs)
                # Only retain IDs within folds if they're in grid_search_folds
                dummy_label = next(itervalues(grid_search_folds))
                fold_groups = [grid_search_folds.get(curr_id, dummy_label)
                               for curr_id in examples.ids]
                folds = FilteredLeaveOneGroupOut(
                    grid_search_folds,
                    examples.ids).split(examples.features,
                                        examples.labels,
                                        fold_groups)

            # Use default parameter grid if we weren't passed one
            if not param_grid:
                param_grid = default_param_grid

            # If we're using a correlation metric for doing binary
            # classification, override the estimator's predict function
            if (grid_objective in _CORRELATION_METRICS and
                    self.model_type._estimator_type == 'classifier'):
                estimator.predict_normal = estimator.predict
                estimator.predict = _predict_binary

            # limit the number of grid_jobs to be no higher than five or the
            # number of cores for the machine, whichever is lower
            grid_jobs = min(grid_jobs, cpu_count(), MAX_CONCURRENT_PROCESSES)

            grid_searcher = GridSearchCV(estimator, param_grid,
                                         scoring=grid_objective, cv=folds,
                                         n_jobs=grid_jobs,
                                         pre_dispatch=grid_jobs)

            # run the grid search for hyperparameters
            grid_searcher.fit(xtrain, labels)
            self._model = grid_searcher.best_estimator_
            grid_score = grid_searcher.best_score_
        else:
            self._model = estimator.fit(xtrain, labels)
            grid_score = 0.0

        return grid_score
    def evaluate(self, examples, prediction_prefix=None, append=False,
                 grid_objective=None):
        """
        Evaluates a given model on a given dev or test example set.

        :param examples: The examples to evaluate the performance of the model
                         on.
        :type examples: FeatureSet
        :param prediction_prefix: If saving the predictions, this is the
                                  prefix that will be used for the filename.
                                  It will be followed by ".predictions"
        :type prediction_prefix: str
        :param append: Should we append the current predictions to the file if
                       it exists?
        :type append: bool
        :param grid_objective: The objective function that was used when doing
                               the grid search.
        :type grid_objective: function

        :return: The confusion matrix, the overall accuracy, the per-label
                 PRFs, the model parameters, and the grid search objective
                 function score.
        :rtype: 5-tuple
        """
        # initialize grid score
        grid_score = None

        # make the prediction on the test data
        yhat = self.predict(examples, prediction_prefix=prediction_prefix,
                            append=append)

        # extract actual labels (transformed for classification tasks)
        if self.model_type._estimator_type == 'classifier':
            test_label_list = np.unique(examples.labels).tolist()

            # identify unseen test labels if any and add a new dictionary for
            # these labels
            unseen_test_label_list = [label for label in test_label_list
                                      if label not in self.label_list]
            unseen_label_dict = {label: i for i, label in
                                 enumerate(unseen_test_label_list,
                                           start=len(self.label_list))}

            # combine the two dictionaries
            train_and_test_label_dict = self.label_dict.copy()
            train_and_test_label_dict.update(unseen_label_dict)
            ytest = np.array([train_and_test_label_dict[label]
                              for label in examples.labels])
        else:
            ytest = examples.labels

        # if run in probability mode, convert yhat to list of labels predicted
        if self.probability:
            # if we're using a correlation grid objective, calculate it here
            if grid_objective and grid_objective in _CORRELATION_METRICS:
                try:
                    grid_score = use_score_func(grid_objective, ytest,
                                                yhat[:, 1])
                except ValueError:
                    grid_score = float('NaN')

            yhat = np.array([max(range(len(row)), key=lambda i: row[i])
                             for row in yhat])

        # calculate grid search objective function score, if specified
        if (grid_objective and (grid_objective not in _CORRELATION_METRICS or
                                not self.probability)):
            try:
                grid_score = use_score_func(grid_objective, ytest, yhat)
            except ValueError:
                grid_score = float('NaN')

        if self.model_type._estimator_type == 'regressor':
            result_dict = {'descriptive': defaultdict(dict)}
            for table_label, y in zip(['actual', 'predicted'], [ytest, yhat]):
                result_dict['descriptive'][table_label]['min'] = min(y)
                result_dict['descriptive'][table_label]['max'] = max(y)
                result_dict['descriptive'][table_label]['avg'] = np.mean(y)
                result_dict['descriptive'][table_label]['std'] = np.std(y)
            result_dict['pearson'] = use_score_func('pearson', ytest, yhat)
            res = (None, None, result_dict, self._model.get_params(),
                   grid_score)
        else:
            # compute the confusion matrix
            num_labels = len(train_and_test_label_dict)
            conf_mat = confusion_matrix(ytest, yhat,
                                        labels=list(range(num_labels)))
            # Calculate metrics
            overall_accuracy = accuracy_score(ytest, yhat)
            result_matrix = precision_recall_fscore_support(
                ytest, yhat, labels=list(range(num_labels)), average=None)

            # Store results
            result_dict = defaultdict(dict)
            for actual_label in sorted(train_and_test_label_dict):
                col = train_and_test_label_dict[actual_label]
                result_dict[actual_label]["Precision"] = result_matrix[0][col]
                result_dict[actual_label]["Recall"] = result_matrix[1][col]
                result_dict[actual_label]["F-measure"] = result_matrix[2][col]

            res = (conf_mat.tolist(), overall_accuracy, result_dict,
                   self._model.get_params(), grid_score)
        return res
    def predict(self, examples, prediction_prefix=None, append=False,
                class_labels=False):
        """
        Uses a given model to generate predictions on a given data set.

        :param examples: The examples to predict the labels for.
        :type examples: FeatureSet
        :param prediction_prefix: If saving the predictions, this is the
                                  prefix that will be used for the filename.
                                  It will be followed by ".predictions"
        :type prediction_prefix: str
        :param append: Should we append the current predictions to the file if
                       it exists?
        :type append: bool
        :param class_labels: For classifier, should we convert class indices
                             to their (str) labels?
        :type class_labels: bool

        :return: The predictions returned by the learner.
        :rtype: array
        """
        logger = logging.getLogger(__name__)
        example_ids = examples.ids

        # Need to do some transformations so the features are in the right
        # columns for the test set. Obviously a bit hacky, but storing things
        # in sparse matrices saves memory over our old list of dicts approach.
        if isinstance(self.feat_vectorizer, FeatureHasher):
            if (self.feat_vectorizer.n_features !=
                    examples.vectorizer.n_features):
                logger.warning("There is mismatch between the training model "
                               "features and the data passed to predict.")
            self_feat_vec_tuple = (self.feat_vectorizer.dtype,
                                   self.feat_vectorizer.input_type,
                                   self.feat_vectorizer.n_features,
                                   self.feat_vectorizer.non_negative)
            example_feat_vec_tuple = (examples.vectorizer.dtype,
                                      examples.vectorizer.input_type,
                                      examples.vectorizer.n_features,
                                      examples.vectorizer.non_negative)

            if self_feat_vec_tuple == example_feat_vec_tuple:
                xtest = examples.features
            else:
                xtest = self.feat_vectorizer.transform(
                    examples.vectorizer.inverse_transform(
                        examples.features))
        else:
            if (set(self.feat_vectorizer.feature_names_) !=
                    set(examples.vectorizer.feature_names_)):
                logger.warning("There is mismatch between the training model "
                               "features and the data passed to predict.")
            if self.feat_vectorizer == examples.vectorizer:
                xtest = examples.features
            else:
                xtest = self.feat_vectorizer.transform(
                    examples.vectorizer.inverse_transform(
                        examples.features))

        # filter features based on those selected from training set
        xtest = self.feat_selector.transform(xtest)

        # Sampler
        if self.sampler:
            logger.warning('Sampler converts sparse matrix to dense')
            if isinstance(self.sampler, SkewedChi2Sampler):
                logger.warning('SkewedChi2Sampler uses a dense matrix')
                xtest = self.sampler.fit_transform(xtest.todense())
            else:
                xtest = self.sampler.fit_transform(xtest)

        # Convert to dense if necessary
        if self._use_dense_features and not isinstance(xtest, np.ndarray):
            try:
                xtest = xtest.todense()
            except MemoryError:
                if issubclass(self._model_type, _REQUIRES_DENSE):
                    reason = ('{} does not support sparse ' +
                              'matrices.').format(self._model_type)
                else:
                    reason = ('{} feature scaling requires a dense ' +
                              'matrix.').format(self._feature_scaling)
                raise MemoryError('Ran out of memory when converting test ' +
                                  'data to dense. This was required because ' +
                                  reason)

        # Scale xtest if necessary
        if not issubclass(self._model_type, MultinomialNB):
            xtest = self.scaler.transform(xtest)

        # make the prediction on the test data
        try:
            yhat = (self._model.predict_proba(xtest)
                    if (self.probability and not class_labels)
                    else self._model.predict(xtest))
        except NotImplementedError as e:
            logger.error("Model type: %s\nModel: %s\nProbability: %s\n",
                         self._model_type, self._model, self.probability)
            raise e

        # write out the predictions if we are asked to
        if prediction_prefix is not None:
            prediction_file = '{}.predictions'.format(prediction_prefix)
            with open(prediction_file,
                      "w" if not append else "a") as predictionfh:
                # header
                if not append:
                    # Output probabilities if we're asked (and able)
                    if self.probability:
                        print('\t'.join(["id"] +
                                        [str(x) for x in self.label_list]),
                              file=predictionfh)
                    else:
                        print('id\tprediction', file=predictionfh)

                if self.probability:
                    for example_id, class_probs in zip(example_ids, yhat):
                        print('\t'.join([str(example_id)] +
                                        [str(x) for x in class_probs]),
                              file=predictionfh)
                else:
                    if self.model_type._estimator_type == 'regressor':
                        for example_id, pred in zip(example_ids, yhat):
                            print('{0}\t{1}'.format(example_id, pred),
                                  file=predictionfh)
                    else:
                        for example_id, pred in zip(example_ids, yhat):
                            print('%s\t%s' % (example_id,
                                              self.label_list[int(pred)]),
                                  file=predictionfh)

        if (class_labels and
                self.model_type._estimator_type == 'classifier'):
            yhat = np.array([self.label_list[int(pred)] for pred in yhat])

        return yhat
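
    # Illustrative sketch (not part of the module): a typical train ->
    # evaluate -> predict flow. The FeatureSets ``train_fs`` and ``test_fs``
    # are hypothetical.
    #
    #     >>> learner = Learner('LinearSVC', min_feature_count=2)
    #     >>> best_score = learner.train(train_fs,
    #     ...                            grid_objective='f1_score_micro')
    #     >>> conf_mat, accuracy, prf, params, score = learner.evaluate(
    #     ...     test_fs, grid_objective='f1_score_micro')
    #     >>> predictions = learner.predict(test_fs, class_labels=True)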
    def _compute_num_folds_from_example_counts(self, cv_folds, labels):
        """
        Calculate the number of folds we should use for cross validation,
        based on the number of examples we have for each label.
        """
        assert isinstance(cv_folds, int)

        # For regression models, we can just return the current cv_folds
        if self.model_type._estimator_type == 'regressor':
            return cv_folds

        min_examples_per_label = min(Counter(labels).values())
        if min_examples_per_label <= 1:
            raise ValueError(('The training set has only {} example for a' +
                              ' label.').format(min_examples_per_label))
        if min_examples_per_label < cv_folds:
            logger = logging.getLogger(__name__)
            logger.warning('The minimum number of examples per label was %s. '
                           'Setting the number of cross-validation folds to '
                           'that value.', min_examples_per_label)
            cv_folds = min_examples_per_label
        return cv_folds
    def cross_validate(self, examples, stratified=True, cv_folds=10,
                       grid_search=False, grid_search_folds=3, grid_jobs=None,
                       grid_objective='f1_score_micro',
                       prediction_prefix=None, param_grid=None, shuffle=False,
                       save_cv_folds=False):
        """
        Cross-validates a given model on the training examples.

        :param examples: The data to cross-validate learner performance on.
        :type examples: FeatureSet
        :param stratified: Should we stratify the folds to ensure an even
                           distribution of labels for each fold?
        :type stratified: bool
        :param cv_folds: The number of folds to use for cross-validation, or
                         a mapping from example IDs to folds.
        :type cv_folds: int or dict
        :param grid_search: Should we do grid search when training each fold?
                            Note: This will make this take *much* longer.
        :type grid_search: bool
        :param grid_search_folds: The number of folds to use when doing the
                                  grid search (ignored if cv_folds is set to
                                  a dictionary mapping examples to folds).
        :type grid_search_folds: int
        :param grid_jobs: The number of jobs to run in parallel when doing the
                          grid search. If unspecified or 0, the number of
                          grid search folds will be used.
        :type grid_jobs: int
        :param grid_objective: The objective function to use when doing the
                               grid search.
        :type grid_objective: function
        :param param_grid: The parameter grid to search through for grid
                           search. If unspecified, a default parameter grid
                           will be used.
        :type param_grid: list of dicts mapping from strs to
                          lists of parameter values
        :param prediction_prefix: If saving the predictions, this is the
                                  prefix that will be used for the filename.
                                  It will be followed by ".predictions"
        :type prediction_prefix: str
        :param shuffle: Shuffle examples before splitting into folds for CV.
        :type shuffle: bool
        :param save_cv_folds: Whether to save the cv fold ids or not
        :type save_cv_folds: bool

        :return: The confusion matrix, overall accuracy, per-label PRFs,
                 model parameters, and objective function score for each fold
                 in one list, and another list with the grid search scores for
                 each fold. Also return a dictionary containing the test-fold
                 number for each id if save_cv_folds is True, otherwise None.
        :rtype: (list of 5-tuples, list of float, dict)
        """
        # Seed the random number generator so that randomized algorithms are
        # replicable.
        random_state = np.random.RandomState(123456789)
        # Set up logger.
        logger = logging.getLogger(__name__)

        # Shuffle so that the folds are random for the inner grid search CV.
        # If grid search is True but shuffle isn't, shuffle anyway.
        # You can't shuffle a scipy sparse matrix in place, so unfortunately
        # we make a copy of everything (and then get rid of the old version)
        if grid_search or shuffle:
            if grid_search and not shuffle:
                logger.warning('Training data will be shuffled to randomize '
                               'grid search folds. Shuffling may yield '
                               'different results compared to scikit-learn.')
            ids, labels, features = sk_shuffle(examples.ids, examples.labels,
                                               examples.features,
                                               random_state=random_state)
            examples = FeatureSet(examples.name, ids, labels=labels,
                                  features=features,
                                  vectorizer=examples.vectorizer)

        # call train setup
        self._create_label_dict(examples)
        self._train_setup(examples)

        # Set up the cross-validation iterator.
        if isinstance(cv_folds, int):
            cv_folds = self._compute_num_folds_from_example_counts(
                cv_folds, examples.labels)

            stratified = (stratified and
                          self.model_type._estimator_type == 'classifier')
            if stratified:
                kfold = StratifiedKFold(n_splits=cv_folds)
                cv_groups = None
            else:
                kfold = KFold(n_splits=cv_folds, random_state=random_state)
                cv_groups = None
        # Otherwise cv_folds is a dict
        else:
            # if we have a mapping from IDs to folds, use it for the overall
            # cross-validation as well as the grid search within each
            # training fold. Note that this means that the grid search
            # will use K-1 folds because the Kth will be the test fold for
            # the outer cross-validation.
            # Only retain IDs within folds if they're in grid_search_folds
            dummy_label = next(itervalues(cv_folds))
            fold_groups = [cv_folds.get(curr_id, dummy_label)
                           for curr_id in examples.ids]
            # Only retain IDs within folds if they're in cv_folds
            kfold = FilteredLeaveOneGroupOut(cv_folds, examples.ids)
            cv_groups = fold_groups
            grid_search_folds = cv_folds

        # Save the cross-validation fold information, if required
        # The format is that the test-fold that each id appears in is stored
        skll_fold_ids = None
        if save_cv_folds:
            skll_fold_ids = {}
            for fold_num, (_, test_indices) in \
                    enumerate(kfold.split(examples.features,
                                          examples.labels,
                                          cv_groups)):
                for index in test_indices:
                    skll_fold_ids[examples.ids[index]] = str(fold_num)

        # handle each fold separately and accumulate the predictions and the
        # numbers
        results = []
        grid_search_scores = []
        append_predictions = False
        for train_index, test_index in kfold.split(examples.features,
                                                   examples.labels,
                                                   cv_groups):
            # Train model
            self._model = None  # prevent feature vectorizer from being reset.
            train_set = FeatureSet(examples.name,
                                   examples.ids[train_index],
                                   labels=examples.labels[train_index],
                                   features=examples.features[train_index],
                                   vectorizer=examples.vectorizer)
            # Set create_label_dict to False since we already created the
            # label dictionary for the whole dataset above.
            grid_search_score = self.train(train_set,
                                           grid_search_folds=grid_search_folds,
                                           grid_search=grid_search,
                                           grid_objective=grid_objective,
                                           param_grid=param_grid,
                                           grid_jobs=grid_jobs,
                                           shuffle=grid_search,
                                           create_label_dict=False)
            grid_search_scores.append(grid_search_score)
            # note: there is no need to shuffle again within each fold,
            # regardless of what the shuffle keyword argument is set to.

            # Evaluate model
            test_tuple = FeatureSet(examples.name,
                                    examples.ids[test_index],
                                    labels=examples.labels[test_index],
                                    features=examples.features[test_index],
                                    vectorizer=examples.vectorizer)
            results.append(self.evaluate(test_tuple,
                                         prediction_prefix=prediction_prefix,
                                         append=append_predictions,
                                         grid_objective=grid_objective))
            append_predictions = True

        # return list of results for all folds
        return results, grid_search_scores, skll_fold_ids
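
    # Illustrative sketch (not part of the module): cross-validating with a
    # fixed ID-to-fold assignment. ``learner``, ``fs``, and the mapping below
    # are hypothetical.
    #
    #     >>> folds = {'ex1': '0', 'ex2': '1', 'ex3': '0', 'ex4': '1'}
    #     >>> results, grid_scores, fold_ids = learner.cross_validate(
    #     ...     fs, cv_folds=folds, grid_search=False, save_cv_folds=True)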
    def learning_curve(self, examples, cv_folds=10,
                       train_sizes=np.linspace(0.1, 1.0, 5),
                       objective='f1_score_micro'):
        """
        Generates learning curves for a given model on the training examples
        via cross-validation. Adapted from the scikit-learn code for learning
        curve generation (cf. ``sklearn.model_selection.learning_curve``).

        :param examples: The data to generate the learning curve on.
        :type examples: skll.data.FeatureSet
        :param cv_folds: The number of folds to use for cross-validation with
                         each training size
        :type cv_folds: int
        :param train_sizes: Relative or absolute numbers of training examples
                            that will be used to generate the learning curve.
                            If the type is float, it is regarded as a fraction
                            of the maximum size of the training set (that is
                            determined by the selected validation method),
                            i.e. it has to be within (0, 1]. Otherwise it is
                            interpreted as absolute sizes of the training
                            sets. Note that for classification the number of
                            samples usually has to be big enough to contain at
                            least one sample from each class.
                            (default: ``np.linspace(0.1, 1.0, 5)``)
        :type train_sizes: list of float or int
        :param objective: The name of the objective function to use when
                          computing the train and test scores for the learning
                          curve. (default: 'f1_score_micro')
        :type objective: string

        :return: The scores on the training sets, the scores on the test set,
                 and the numbers of training examples used to generate the
                 curve.
        :rtype: (list of float, list of float, list of int)
        """
        # Seed the random number generator so that randomized algorithms are
        # replicable.
        random_state = np.random.RandomState(123456789)

        # Set up logger.
        logger = logging.getLogger(__name__)

        # Call train setup before since we need to train
        # the learner eventually
        self._create_label_dict(examples)
        self._train_setup(examples)

        # Set up the cross-validation iterator with 20% of the data
        # always reserved for testing
        cv = ShuffleSplit(n_splits=cv_folds, test_size=0.2,
                          random_state=random_state)
        cv_iter = list(cv.split(examples.features, examples.labels, None))
        n_max_training_samples = len(cv_iter[0][0])

        # Get the _translate_train_sizes() function from scikit-learn
        # since we need it to get the right list of sizes after
        # cross-validation
        _module = import_module('sklearn.model_selection._validation')
        _translate_train_sizes = getattr(_module, '_translate_train_sizes')
        train_sizes_abs = _translate_train_sizes(train_sizes,
                                                 n_max_training_samples)
        n_unique_ticks = train_sizes_abs.shape[0]

        # Create an iterator over train/test featuresets based on the
        # cross-validation index iterator
        featureset_iter = (FeatureSet.split_by_ids(examples, train, test)
                           for train, test in cv_iter)

        # Limit the number of parallel jobs for this
        # to be no higher than five or the number of cores
        # for the machine, whichever is lower
        n_jobs = min(cpu_count(), MAX_CONCURRENT_PROCESSES)

        # Run jobs in parallel that train the model on each subset
        # of the training data and compute train and test scores
        parallel = joblib.Parallel(n_jobs=n_jobs, pre_dispatch=n_jobs)
        out = parallel(joblib.delayed(_train_and_score)(self,
                                                        train_fs[:n_train_samples],
                                                        test_fs,
                                                        objective)
                       for train_fs, test_fs in featureset_iter
                       for n_train_samples in train_sizes_abs)

        # Reshape the outputs
        out = np.array(out)
        n_cv_folds = out.shape[0] // n_unique_ticks
        out = out.reshape(n_cv_folds, n_unique_ticks, 2)
        out = np.asarray(out).transpose((2, 1, 0))

        return list(out[0]), list(out[1]), list(train_sizes_abs)
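
    # Illustrative sketch (not part of the module): generating a learning
    # curve over five training-set sizes. ``learner`` and the FeatureSet
    # ``fs`` are hypothetical.
    #
    #     >>> train_scores, test_scores, sizes = learner.learning_curve(
    #     ...     fs, cv_folds=5, train_sizes=np.linspace(0.1, 1.0, 5),
    #     ...     objective='accuracy')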