# Source code for skll.data.featureset

# License: BSD 3 clause
"""
Classes related to storing/merging feature sets.

:author: Dan Blanchard (dblanchard@ets.org)
:organization: ETS
"""

from __future__ import absolute_import, print_function, unicode_literals

from copy import deepcopy

import numpy as np
import scipy.sparse as sp
from six import iteritems
from six.moves import zip
from sklearn.feature_extraction import DictVectorizer, FeatureHasher

from skll.data.dict_vectorizer import DictVectorizer as NewDictVectorizer


class FeatureSet(object):
    """
    Encapsulation of all of the features, values, and metadata about a given
    set of data.

    .. warning::
       FeatureSets can only be equal if the order of the instances is
       identical because these are stored as lists/arrays.

    This replaces ``ExamplesTuple`` from older versions.

    :param name: The name of this feature set.
    :type name: str
    :param ids: Example IDs for this set.
    :type ids: np.array
    :param labels: labels for this set.
    :type labels: np.array
    :param features: The features for each instance represented as either a
                     list of dictionaries or an array-like (if `vectorizer` is
                     also specified).
    :type features: list of dict or array-like
    :param vectorizer: Vectorizer which will be used to generate the feature
                       matrix.
    :type vectorizer: DictVectorizer or FeatureHasher

    .. note::
       If ids, labels, and/or features are not None, the number of rows in
       each array must be equal.
    """

    def __init__(self, name, ids, labels=None, features=None,
                 vectorizer=None):
        super(FeatureSet, self).__init__()
        self.name = name
        # Plain lists are converted so that ``.shape`` and fancy indexing
        # are available everywhere else in the class.
        if isinstance(ids, list):
            ids = np.array(ids)
        self.ids = ids
        if isinstance(labels, list):
            labels = np.array(labels)
        self.labels = labels
        self.features = features
        self.vectorizer = vectorizer

        # Convert list of dicts to a feature matrix, creating a vectorizer
        # if the caller did not supply one.
        if isinstance(self.features, list):
            if self.vectorizer is None:
                self.vectorizer = NewDictVectorizer(sparse=True)
            self.features = self.vectorizer.fit_transform(self.features)

        # Row-count consistency is only checked when there are features.
        if self.features is not None:
            num_feats = self.features.shape[0]
            if self.ids is None:
                raise ValueError('A list of IDs is required')
            num_ids = self.ids.shape[0]
            if num_feats != num_ids:
                raise ValueError(('Number of IDs (%s) does not equal '
                                  'number of feature rows (%s)') %
                                 (num_ids, num_feats))
            if self.labels is None:
                # Missing labels are represented by an all-NaN float array;
                # ``has_labels`` relies on this.
                self.labels = np.full(num_feats, np.nan)
            num_labels = self.labels.shape[0]
            if num_feats != num_labels:
                raise ValueError(('Number of labels (%s) does not equal '
                                  'number of feature rows (%s)') %
                                 (num_labels, num_feats))

    def __contains__(self, value):
        """
        Check if example ID is in set
        """
        return value in self.ids

    def __eq__(self, other):
        """
        Check whether two featuresets are the same.

        .. note::
           We consider feature values to be equal if any differences are in
           the sixth decimal place or higher.
        """
        # We need to sort the indices for the underlying feature sparse
        # matrix in case we haven't done so already; otherwise the
        # ``indices``/``data`` comparisons below are order-dependent.
        if not self.features.has_sorted_indices:
            self.features.sort_indices()
        if not other.features.has_sorted_indices:
            other.features.sort_indices()

        return (self.ids.shape == other.ids.shape and
                self.labels.shape == other.labels.shape and
                self.features.shape == other.features.shape and
                (self.ids == other.ids).all() and
                (self.labels == other.labels).all() and
                np.allclose(self.features.data, other.features.data,
                            rtol=1e-6) and
                (self.features.indices == other.features.indices).all() and
                (self.features.indptr == other.features.indptr).all() and
                self.vectorizer == other.vectorizer)

    def __iter__(self):
        """
        Iterate through (ID, label, feature_dict) tuples in feature set.

        Yields nothing when the feature set has no features.

        :raises ValueError: if features are present but the vectorizer is
                            not a ``DictVectorizer``.
        """
        if self.features is None:
            return
        if not isinstance(self.vectorizer, DictVectorizer):
            raise ValueError('FeatureSets can only be iterated through if '
                             'they use a DictVectorizer for their feature '
                             'vectorizer.')
        for id_, label_, feats in zip(self.ids, self.labels, self.features):
            # When calling inverse_transform we have to add [0] to get the
            # results for the current instance because it always returns a
            # 2D array
            yield (id_, label_,
                   self.vectorizer.inverse_transform(feats)[0])

    def __len__(self):
        """
        :returns: the number of rows in the feature matrix.
        """
        return self.features.shape[0]

    def __add__(self, other):
        """
        Combine two feature sets to create a new one.  This is done assuming
        they both have the same instances with the same IDs; rows of
        ``other`` are re-ordered to match the ID order of ``self``.

        :raises ValueError: if the ID sets differ, the vectorizer types
                            differ, FeatureHasher sizes differ, feature
                            names overlap, or labels conflict.
        """
        # Check that the sets of IDs are equal.
        # NOTE(review): this comparison is order-insensitive although the
        # message mentions order; the message is kept unchanged in case
        # callers match on it.
        if set(self.ids) != set(other.ids):
            raise ValueError('IDs are not in the same order in each '
                             'feature set')
        # Compute the relative ordering of IDs for merging the features
        # and labels.
        ids_indices = {id_: position
                       for position, id_ in enumerate(other.ids)}
        relative_order = [ids_indices[self_id] for self_id in self.ids]

        # Initialize the new feature set with a name and the IDs.
        new_set = FeatureSet('+'.join(sorted([self.name, other.name])),
                             deepcopy(self.ids))

        # Combine feature matrices and vectorizers.
        if not isinstance(self.vectorizer, type(other.vectorizer)):
            raise ValueError('Cannot combine FeatureSets because they are '
                             'not both using the same type of feature '
                             'vectorizer (e.g., DictVectorizer, '
                             'FeatureHasher)')
        uses_feature_hasher = isinstance(self.vectorizer, FeatureHasher)
        if uses_feature_hasher:
            if (self.vectorizer.n_features !=
                    other.vectorizer.n_features):
                raise ValueError('Cannot combine FeatureSets that uses '
                                 'FeatureHashers with different values of '
                                 'n_features setting.')
        else:
            # Check for duplicate feature names.
            if (set(self.vectorizer.feature_names_) &
                    set(other.vectorizer.feature_names_)):
                raise ValueError('Cannot combine FeatureSets because they '
                                 'have duplicate feature names.')
        num_feats = self.features.shape[1]

        new_set.features = sp.hstack([self.features,
                                      other.features[relative_order]],
                                     'csr')
        new_set.vectorizer = deepcopy(self.vectorizer)
        if not uses_feature_hasher:
            # Columns of ``other`` come after those of ``self``, so their
            # vocabulary indices are shifted by the width of ``self``.
            for feat_name, index in other.vectorizer.vocabulary_.items():
                new_set.vectorizer.vocabulary_[feat_name] = (index +
                                                             num_feats)
            other_names = other.vectorizer.feature_names_
            new_set.vectorizer.feature_names_.extend(other_names)

        # If either set has labels, check that they don't conflict.
        if self.has_labels:
            # labels should be the same for each FeatureSet, so store once.
            if other.has_labels and \
                    not np.all(self.labels == other.labels[relative_order]):
                raise ValueError('Feature sets have conflicting labels for '
                                 'examples with the same ID.')
            new_set.labels = deepcopy(self.labels)
        else:
            new_set.labels = deepcopy(other.labels[relative_order])

        return new_set

    def filter(self, ids=None, labels=None, features=None, inverse=False):
        """
        Removes or keeps features and/or examples from the Featureset
        depending on the passed in parameters.  The FeatureSet is modified
        in place.

        :param ids: Examples to keep in the FeatureSet. If `None`, no ID
                    filtering takes place.
        :type ids: list of str/float
        :param labels: labels that we want to retain examples for. If
                       `None`, no label filtering takes place.
        :type labels: list of str/float
        :param features: Features to keep in the FeatureSet. To help with
                         filtering string-valued features that were
                         converted to sequences of boolean features when
                         read in, any features in the FeatureSet that
                         contain a `=` will be split on the first
                         occurrence and the prefix will be checked to see
                         if it is in `features`.
                         If `None`, no feature filtering takes place.
                         Cannot be used if FeatureSet uses a FeatureHasher
                         for vectorization.
        :type features: list of str
        :param inverse: Instead of keeping features and/or examples in
                        lists, remove them.
        :type inverse: bool

        :raises ValueError: if `features` is given and the vectorizer is a
                            ``FeatureHasher``.
        """
        # Construct mask that indicates which examples to keep
        mask = np.ones(len(self), dtype=bool)
        if ids is not None:
            mask = np.logical_and(mask, np.isin(self.ids, ids))
        if labels is not None:
            mask = np.logical_and(mask, np.isin(self.labels, labels))

        # The example mask is only inverted when an example filter was
        # actually given; otherwise inverse would drop every example.
        if inverse and (labels is not None or ids is not None):
            mask = np.logical_not(mask)

        # Remove examples not in mask
        self.ids = self.ids[mask]
        self.labels = self.labels[mask]
        self.features = self.features[mask, :]

        # Filter features
        if features is not None:
            if isinstance(self.vectorizer, FeatureHasher):
                raise ValueError('FeatureSets with FeatureHasher vectorizers'
                                 ' cannot be filtered by feature.')
            matching = sorted({feat_num for feat_name, feat_num in
                               iteritems(self.vectorizer.vocabulary_)
                               if (feat_name in features or
                                   feat_name.split('=', 1)[0] in features)})
            # An explicit integer dtype keeps the array usable as a column
            # index even when no feature matched (an empty array would
            # otherwise default to float).
            columns = np.array(matching, dtype=int)
            if inverse:
                all_columns = np.arange(self.features.shape[1])
                columns = all_columns[np.logical_not(np.isin(all_columns,
                                                             columns))]
            self.features = self.features[:, columns]
            self.vectorizer.restrict(columns, indices=True)

    def filtered_iter(self, ids=None, labels=None, features=None,
                      inverse=False):
        """
        A version of ``__iter__`` that retains only the specified features
        and/or examples from the output.

        :param ids: Examples in the FeatureSet to keep. If `None`, no ID
                    filtering takes place.
        :type ids: list of str/float
        :param labels: labels that we want to retain examples for. If
                       `None`, no label filtering takes place.
        :type labels: list of str/float
        :param features: Features in the FeatureSet to keep. To help with
                         filtering string-valued features that were
                         converted to sequences of boolean features when
                         read in, any features in the FeatureSet that
                         contain a `=` will be split on the first
                         occurrence and the prefix will be checked to see
                         if it is in `features`.
                         Cannot be used if FeatureSet uses a FeatureHasher
                         for vectorization.
        :type features: list of str
        :param inverse: Instead of keeping features and/or examples in
                        lists, remove them.
        :type inverse: bool

        .. note::
           When `features` is `None` and `inverse` is false, every yielded
           feature dictionary is empty; when `features` is `None` and
           `inverse` is true, feature dictionaries are yielded unfiltered.

        :raises ValueError: if features are present but the vectorizer is
                            not a ``DictVectorizer``.
        """
        if self.features is not None and not isinstance(self.vectorizer,
                                                        DictVectorizer):
            raise ValueError('FeatureSets can only be iterated through if they'
                             ' use a DictVectorizer for their feature '
                             'vectorizer.')
        for id_, label_, feats in zip(self.ids, self.labels, self.features):
            # Skip instances with IDs not in filter
            if ids is not None and (id_ in ids) == inverse:
                continue
            # Skip instances with labels not in filter
            if labels is not None and (label_ in labels) == inverse:
                continue
            feat_dict = self.vectorizer.inverse_transform(feats)[0]
            if features is not None:
                feat_dict = {name: value for name, value in
                             iteritems(feat_dict) if
                             (inverse != (name in features or
                                          name.split('=', 1)[0] in features))}
            elif not inverse:
                # NOTE(review): with no feature filter and inverse=False all
                # features are dropped; this looks unintended but is kept
                # as-is because callers may depend on it -- confirm.
                feat_dict = {}
            yield id_, label_, feat_dict

    def __sub__(self, other):
        """
        :returns: a copy of ``self`` with all features in ``other`` removed.
        """
        new_set = deepcopy(self)
        new_set.filter(features=other.vectorizer.feature_names_,
                       inverse=True)
        return new_set

    @property
    def has_labels(self):
        """
        :returns: ``False`` if there are no labels or if the labels are a
                  floating-point array whose minimum is NaN (i.e. the
                  placeholder created when no labels were given);
                  ``True`` otherwise.
        """
        if self.labels is None:
            return False
        # ``np.floating`` covers every float width; the builtin ``float``
        # only means float64 on current NumPy releases.
        is_float = np.issubdtype(self.labels.dtype, np.floating)
        return not (is_float and np.isnan(np.min(self.labels)))

    def __str__(self):
        """
        :returns: a string representation of FeatureSet
        """
        return str(self.__dict__)

    def __repr__(self):
        """
        :returns: a string representation of FeatureSet
        """
        return repr(self.__dict__)

    def __getitem__(self, value):
        """
        :returns: A specific example by row number, or if given a slice,
                  a new FeatureSet containing a subset of the data.
        """
        # Check if we're slicing
        if isinstance(value, slice):
            sliced_ids = self.ids[value]
            sliced_feats = (self.features[value] if self.features is not None
                            else None)
            sliced_labels = (self.labels[value] if self.labels is not None
                             else None)
            return FeatureSet('{}_{}'.format(self.name, value), sliced_ids,
                              features=sliced_feats, labels=sliced_labels,
                              vectorizer=self.vectorizer)

        label = self.labels[value] if self.labels is not None else None
        # Only touch the feature matrix when there is one, so that a
        # feature-less set yields an empty feature dict instead of failing.
        if self.features is not None:
            feats = self.features[value, :]
            features = self.vectorizer.inverse_transform(feats)[0]
        else:
            features = {}
        return self.ids[value], label, features

    @staticmethod
    def split_by_ids(fs, ids_for_split1, ids_for_split2=None):
        """
        Split the given FeatureSet into two new FeatureSet instances based
        on the given IDs for the two splits.

        :param fs: The FeatureSet instance to split.
        :type fs: FeatureSet
        :param ids_for_split1: A list of example IDs (used as row positions
                               into ``fs``) which will be split out into the
                               first FeatureSet instance. Note that the
                               FeatureSet instance will respect the order of
                               the specified IDs.
        :type ids_for_split1: list of int
        :param ids_for_split2: An optional list of example IDs (used as row
                               positions into ``fs``) which will be split
                               out into the second FeatureSet instance.
                               Note that the FeatureSet instance will
                               respect the order of the specified IDs. If
                               this is not specified, then the second
                               FeatureSet instance will contain the
                               complement of the first set of IDs sorted
                               in ascending order.
        :type ids_for_split2: list of int, optional

        :returns: a tuple of the two new FeatureSet instances, named
                  ``<name>_1`` and ``<name>_2``.
        """
        # Note: an alternative way to implement this is to make copies
        # of the given FeatureSet instance and then use the `filter()`
        # method but that wastes too much memory since it requires making
        # two copies of the original FeatureSet which may be huge. With
        # the current implementation, we are creating new objects but
        # they should be much smaller than the original FeatureSet.
        ids1 = fs.ids[ids_for_split1]
        labels1 = fs.labels[ids_for_split1]
        features1 = fs.features[ids_for_split1]
        if ids_for_split2 is None:
            # The complement is computed over row positions, consistently
            # with how ``ids_for_split1`` is used above, and only once.
            remaining = np.ones(fs.ids.shape[0], dtype=bool)
            remaining[ids_for_split1] = False
            ids2 = fs.ids[remaining]
            labels2 = fs.labels[remaining]
            features2 = fs.features[remaining]
        else:
            ids2 = fs.ids[ids_for_split2]
            labels2 = fs.labels[ids_for_split2]
            features2 = fs.features[ids_for_split2]

        fs1 = FeatureSet('{}_1'.format(fs.name),
                         ids1,
                         labels=labels1,
                         features=features1,
                         vectorizer=fs.vectorizer)
        fs2 = FeatureSet('{}_2'.format(fs.name),
                         ids2,
                         labels=labels2,
                         features=features2,
                         vectorizer=fs.vectorizer)
        return fs1, fs2

    @staticmethod
    def from_data_frame(df, name, labels_column=None, vectorizer=None):
        """
        Helper function to create a FeatureSet object from a
        `pandas.DataFrame`.  Will raise an Exception if pandas is not
        installed in your environment.  `FeatureSet` `ids` will be the
        index on `df`.

        :param df: The pandas.DataFrame object you'd like to use as a
                   feature set.
        :type df: pandas.DataFrame
        :param name: The name of this feature set.
        :type name: str
        :param labels_column: The name of the column containing the labels
                              (data to predict).
        :type labels_column: str or None
        :param vectorizer: Vectorizer which will be used to generate the
                           feature matrix.
        :type vectorizer: DictVectorizer or FeatureHasher

        :returns: a new FeatureSet built from the rows of `df`.
        """
        if labels_column:
            feature_columns = [column for column in df.columns
                               if column != labels_column]
            labels = df[labels_column].tolist()
        else:
            feature_columns = df.columns
            labels = None

        # One dict per row; the constructor vectorizes the list of dicts.
        features = df[feature_columns].to_dict(orient='records')
        return FeatureSet(name,
                          ids=df.index.tolist(),
                          labels=labels,
                          features=features,
                          vectorizer=vectorizer)