'''
The :mod:`model_selection.split<surprise.model_selection.split>` module
contains various cross-validation iterators. Design and tools are inspired from
the mighty scikit learn.
The available iterators are:
.. autosummary::
:nosignatures:
KFold
RepeatedKFold
ShuffleSplit
LeaveOneOut
PredefinedKFold
This module also contains a function for splitting datasets into trainset and
testset:
.. autosummary::
:nosignatures:
train_test_split
'''
from __future__ import (absolute_import, division, print_function,
unicode_literals)
from itertools import chain
from math import ceil, floor
import numbers
from collections import defaultdict
from six import iteritems
from six import string_types
import numpy as np
from ..utils import get_rng
def get_cv(cv):
'''Return a 'validated' CV iterator.'''
if cv is None:
return KFold(n_splits=5)
if isinstance(cv, numbers.Integral):
return KFold(n_splits=cv)
if hasattr(cv, 'split') and not isinstance(cv, string_types):
return cv # str have split
raise ValueError('Wrong CV object. Expecting None, an int or CV iterator, '
'got a {}'.format(type(cv)))
[docs]class KFold():
'''A basic cross-validation iterator.
Each fold is used once as a testset while the k - 1 remaining folds are
used for training.
See an example in the :ref:`User Guide <use_cross_validation_iterators>`.
Args:
n_splits(int): The number of folds.
random_state(int, RandomState instance from numpy, or ``None``):
Determines the RNG that will be used for determining the folds. If
int, ``random_state`` will be used as a seed for a new RNG. This is
useful to get the same splits over multiple calls to ``split()``.
If RandomState instance, this same instance is used as RNG. If
``None``, the current RNG from numpy is used. ``random_state`` is
only used if ``shuffle`` is ``True``. Default is ``None``.
shuffle(bool): Whether to shuffle the ratings in the ``data`` parameter
of the ``split()`` method. Shuffling is not done in-place. Default
is ``True``.
'''
def __init__(self, n_splits=5, random_state=None, shuffle=True):
self.n_splits = n_splits
self.shuffle = shuffle
self.random_state = random_state
[docs] def split(self, data):
'''Generator function to iterate over trainsets and testsets.
Args:
data(:obj:`Dataset<surprise.dataset.Dataset>`): The data containing
ratings that will be devided into trainsets and testsets.
Yields:
tuple of (trainset, testset)
'''
if self.n_splits > len(data.raw_ratings) or self.n_splits < 2:
raise ValueError('Incorrect value for n_splits={0}. '
'Must be >=2 and less than the number '
'of ratings'.format(len(data.raw_ratings)))
# We use indices to avoid shuffling the original data.raw_ratings list.
indices = np.arange(len(data.raw_ratings))
if self.shuffle:
get_rng(self.random_state).shuffle(indices)
start, stop = 0, 0
for fold_i in range(self.n_splits):
start = stop
stop += len(indices) // self.n_splits
if fold_i < len(indices) % self.n_splits:
stop += 1
raw_trainset = [data.raw_ratings[i] for i in chain(indices[:start],
indices[stop:])]
raw_testset = [data.raw_ratings[i] for i in indices[start:stop]]
trainset = data.construct_trainset(raw_trainset)
testset = data.construct_testset(raw_testset)
yield trainset, testset
def get_n_folds(self):
return self.n_splits
[docs]class RepeatedKFold():
'''
Repeated :class:`KFold` cross validator.
Repeats :class:`KFold` n times with different randomization in each
repetition.
See an example in the :ref:`User Guide <use_cross_validation_iterators>`.
Args:
n_splits(int): The number of folds.
n_repeats(int): The number of repetitions.
random_state(int, RandomState instance from numpy, or ``None``):
Determines the RNG that will be used for determining the folds. If
int, ``random_state`` will be used as a seed for a new RNG. This is
useful to get the same splits over multiple calls to ``split()``.
If RandomState instance, this same instance is used as RNG. If
``None``, the current RNG from numpy is used. ``random_state`` is
only used if ``shuffle`` is ``True``. Default is ``None``.
shuffle(bool): Whether to shuffle the ratings in the ``data`` parameter
of the ``split()`` method. Shuffling is not done in-place. Default
is ``True``.
'''
def __init__(self, n_splits=5, n_repeats=10, random_state=None):
self.n_repeats = n_repeats
self.random_state = random_state
self.n_splits = n_splits
[docs] def split(self, data):
'''Generator function to iterate over trainsets and testsets.
Args:
data(:obj:`Dataset<surprise.dataset.Dataset>`): The data containing
ratings that will be devided into trainsets and testsets.
Yields:
tuple of (trainset, testset)
'''
rng = get_rng(self.random_state)
for _ in range(self.n_repeats):
cv = KFold(n_splits=self.n_splits, random_state=rng, shuffle=True)
for trainset, testset in cv.split(data):
yield trainset, testset
def get_n_folds(self):
return self.n_repeats * self.n_splits
[docs]class ShuffleSplit():
'''A basic cross-validation iterator with random trainsets and testsets.
Contrary to other cross-validation strategies, random splits do not
guarantee that all folds will be different, although this is still very
likely for sizeable datasets.
See an example in the :ref:`User Guide <use_cross_validation_iterators>`.
Args:
n_splits(int): The number of folds.
test_size(float or int ``None``): If float, it represents the
proportion of ratings to include in the testset. If int,
represents the absolute number of ratings in the testset. If
``None``, the value is set to the complement of the trainset size.
Default is ``.2``.
train_size(float or int or ``None``): If float, it represents the
proportion of ratings to include in the trainset. If int,
represents the absolute number of ratings in the trainset. If
``None``, the value is set to the complement of the testset size.
Default is ``None``.
random_state(int, RandomState instance from numpy, or ``None``):
Determines the RNG that will be used for determining the folds. If
int, ``random_state`` will be used as a seed for a new RNG. This is
useful to get the same splits over multiple calls to ``split()``.
If RandomState instance, this same instance is used as RNG. If
``None``, the current RNG from numpy is used. ``random_state`` is
only used if ``shuffle`` is ``True``. Default is ``None``.
shuffle(bool): Whether to shuffle the ratings in the ``data`` parameter
of the ``split()`` method. Shuffling is not done in-place. Setting
this to `False` defeats the purpose of this iterator, but it's
useful for the implementation of :func:`train_test_split`. Default
is ``True``.
'''
def __init__(self, n_splits=5, test_size=.2, train_size=None,
random_state=None, shuffle=True):
if n_splits <= 0:
raise ValueError('n_splits = {0} should be strictly greater than '
'0.'.format(n_splits))
if test_size is not None and test_size <= 0:
raise ValueError('test_size={0} should be strictly greater than '
'0'.format(test_size))
if train_size is not None and train_size <= 0:
raise ValueError('train_size={0} should be strictly greater than '
'0'.format(train_size))
self.n_splits = n_splits
self.test_size = test_size
self.train_size = train_size
self.random_state = random_state
self.shuffle = shuffle
def validate_train_test_sizes(self, test_size, train_size, n_ratings):
if test_size is not None and test_size >= n_ratings:
raise ValueError('test_size={0} should be less than the number of '
'ratings {1}'.format(test_size, n_ratings))
if train_size is not None and train_size >= n_ratings:
raise ValueError('train_size={0} should be less than the number of'
' ratings {1}'.format(train_size, n_ratings))
if np.asarray(test_size).dtype.kind == 'f':
test_size = ceil(test_size * n_ratings)
if train_size is None:
train_size = n_ratings - test_size
elif np.asarray(train_size).dtype.kind == 'f':
train_size = floor(train_size * n_ratings)
if test_size is None:
test_size = n_ratings - train_size
if train_size + test_size > n_ratings:
raise ValueError('The sum of train_size and test_size ({0}) '
'should be smaller than the number of '
'ratings {1}.'.format(train_size + test_size,
n_ratings))
return int(train_size), int(test_size)
[docs] def split(self, data):
'''Generator function to iterate over trainsets and testsets.
Args:
data(:obj:`Dataset<surprise.dataset.Dataset>`): The data containing
ratings that will be devided into trainsets and testsets.
Yields:
tuple of (trainset, testset)
'''
test_size, train_size = self.validate_train_test_sizes(
self.test_size, self.train_size, len(data.raw_ratings))
rng = get_rng(self.random_state)
for _ in range(self.n_splits):
if self.shuffle:
permutation = rng.permutation(len(data.raw_ratings))
else:
permutation = np.arange(len(data.raw_ratings))
raw_trainset = [data.raw_ratings[i] for i in
permutation[:test_size]]
raw_testset = [data.raw_ratings[i] for i in
permutation[test_size:(test_size + train_size)]]
trainset = data.construct_trainset(raw_trainset)
testset = data.construct_testset(raw_testset)
yield trainset, testset
def get_n_folds(self):
return self.n_splits
[docs]def train_test_split(data, test_size=.2, train_size=None, random_state=None,
shuffle=True):
'''Split a dataset into trainset and testset.
See an example in the :ref:`User Guide <train_test_split_example>`.
Note: this function cannot be used as a cross-validation iterator.
Args:
data(:obj:`Dataset <surprise.dataset.Dataset>`): The dataset to split
into trainset and testset.
test_size(float or int ``None``): If float, it represents the
proportion of ratings to include in the testset. If int,
represents the absolute number of ratings in the testset. If
``None``, the value is set to the complement of the trainset size.
Default is ``.2``.
train_size(float or int or ``None``): If float, it represents the
proportion of ratings to include in the trainset. If int,
represents the absolute number of ratings in the trainset. If
``None``, the value is set to the complement of the testset size.
Default is ``None``.
random_state(int, RandomState instance from numpy, or ``None``):
Determines the RNG that will be used for determining the folds. If
int, ``random_state`` will be used as a seed for a new RNG. This is
useful to get the same splits over multiple calls to ``split()``.
If RandomState instance, this same instance is used as RNG. If
``None``, the current RNG from numpy is used. ``random_state`` is
only used if ``shuffle`` is ``True``. Default is ``None``.
shuffle(bool): Whether to shuffle the ratings in the ``data``
parameter. Shuffling is not done in-place. Default is ``True``.
'''
ss = ShuffleSplit(n_splits=1, test_size=test_size, train_size=train_size,
random_state=random_state, shuffle=shuffle)
return next(ss.split(data))
[docs]class LeaveOneOut():
'''Cross-validation iterator where each user has exactly one rating in the
testset.
Contrary to other cross-validation strategies, ``LeaveOneOut`` does not
guarantee that all folds will be different, although this is still very
likely for sizeable datasets.
See an example in the :ref:`User Guide <use_cross_validation_iterators>`.
Args:
n_splits(int): The number of folds.
random_state(int, RandomState instance from numpy, or ``None``):
Determines the RNG that will be used for determining the folds. If
int, ``random_state`` will be used as a seed for a new RNG. This is
useful to get the same splits over multiple calls to ``split()``.
If RandomState instance, this same instance is used as RNG. If
``None``, the current RNG from numpy is used. ``random_state`` is
only used if ``shuffle`` is ``True``. Default is ``None``.
min_n_ratings(int): Minimum number of ratings for each user in the
trainset. E.g. if ``min_n_ratings`` is ``2``, we are sure each user
has at least ``2`` ratings in the trainset (and ``1`` in the
testset). Other users are discarded. Default is ``0``, so some
users (having only one rating) may be in the testset and not in the
trainset.
'''
def __init__(self, n_splits=5, random_state=None, min_n_ratings=0):
self.n_splits = n_splits
self.random_state = random_state
self.min_n_ratings = min_n_ratings
[docs] def split(self, data):
'''Generator function to iterate over trainsets and testsets.
Args:
data(:obj:`Dataset<surprise.dataset.Dataset>`): The data containing
ratings that will be devided into trainsets and testsets.
Yields:
tuple of (trainset, testset)
'''
# map ratings to the users ids
user_ratings = defaultdict(list)
for uid, iid, r_ui, _ in data.raw_ratings:
user_ratings[uid].append((uid, iid, r_ui, None))
rng = get_rng(self.random_state)
for _ in range(self.n_splits):
# for each user, randomly choose a rating and put it in the
# testset.
raw_trainset, raw_testset = [], []
for uid, ratings in iteritems(user_ratings):
if len(ratings) > self.min_n_ratings:
i = rng.randint(0, len(ratings))
raw_testset.append(ratings[i])
raw_trainset += [rating for (j, rating)
in enumerate(ratings) if j != i]
if not raw_trainset:
raise ValueError('Could not build any trainset. Maybe '
'min_n_ratings is too high?')
trainset = data.construct_trainset(raw_trainset)
testset = data.construct_testset(raw_testset)
yield trainset, testset
def get_n_folds(self):
return self.n_splits
[docs]class PredefinedKFold():
'''A cross-validation iterator to when a dataset has been loaded with the
:meth:`load_from_folds <surprise.dataset.Dataset.load_from_folds>`
method.
See an example in the :ref:`User Guide <load_from_folds_example>`.
'''
[docs] def split(self, data):
'''Generator function to iterate over trainsets and testsets.
Args:
data(:obj:`Dataset<surprise.dataset.Dataset>`): The data containing
ratings that will be devided into trainsets and testsets.
Yields:
tuple of (trainset, testset)
'''
self.n_splits = len(data.folds_files)
for train_file, test_file in data.folds_files:
raw_trainset = data.read_ratings(train_file)
raw_testset = data.read_ratings(test_file)
trainset = data.construct_trainset(raw_trainset)
testset = data.construct_testset(raw_testset)
yield trainset, testset
def get_n_folds(self):
return self.n_splits