# Source code for rsmtool.container

"""
Classes for storing any kind of data contained
in a pd.DataFrame object.

:author: Jeremy Biggs (jbiggs@ets.org)
:author: Anastassia Loukina (aloukina@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)

:date: 10/25/2017
:organization: ETS
"""


class DataContainer:
    """
    A class to encapsulate datasets.

    Each dataset is a named ``pd.DataFrame`` with an optional path
    describing where the data came from. Datasets are kept in insertion
    order and are also exposed as instance attributes under their names.
    """

    def __init__(self, datasets=None):
        """
        Initialize ``DataContainer`` object.

        Parameters
        ----------
        datasets : list of dicts, optional
            A list of dataset dicts. Each dict should be
            in the following format:
            {'name': 'name_of_dataset', 'frame': <pd.DataFrame object>}
            An optional 'path' key may also be given.

        Raises
        ------
        KeyError
            If two datasets in ``datasets`` share the same name.
        """
        # `_names` preserves insertion order; the two dicts are keyed by name
        # and are always kept in sync with it by `add_dataset()`.
        self._names = []
        self._dataframes = {}
        self._data_paths = {}

        if datasets is not None:
            for dataset_dict in datasets:
                self.add_dataset(dataset_dict, update=False)

    @staticmethod
    def to_datasets(data_container):
        """
        Convert a DataContainer object to a list of dataset dictionaries
        with keys {`name`, `path`, `frame`}.

        Parameters
        ----------
        data_container : DataContainer
            A DataContainer object.

        Returns
        -------
        datasets_dict : list of dicts
            A list of dataset dictionaries.
        """
        return [{'name': name,
                 'path': data_container.get_path(name),
                 'frame': data_container.get_frame(name)}
                for name in data_container.keys()]

    def add_dataset(self, dataset_dict, update=False):
        """
        Update or add a new DataFrame to the instance.

        Parameters
        ----------
        dataset_dict : dict
            The dataset dictionary to add. It must contain the keys
            'name' and 'frame' and may contain the key 'path'.
        update : bool, optional
            Update an existing DataFrame, if True.
            Defaults to False.

        Raises
        ------
        KeyError
            If ``update`` is False and a dataset with the same name
            already exists in the container.
        """
        name = dataset_dict['name']
        data_frame = dataset_dict['frame']
        path = dataset_dict.get('path')

        already_present = name in self._names
        if already_present and not update:
            raise KeyError('The name {} already exists in the '
                           'DataContainer dictionary.'.format(name))

        if not already_present:
            self._names.append(name)

        # On update, the stored path is replaced as well (with None if the
        # new dataset dict carries no 'path').
        self._dataframes[name] = data_frame
        self._data_paths[name] = path

        # Also expose the frame as `container.<name>`.
        setattr(self, name, data_frame)

    def get_path(self, key, default=None):
        """
        Get path, given key.

        Parameters
        ----------
        key : str
            Name for the data.
        default
            The value to return if the key does not exist.
            Defaults to None.

        Returns
        -------
        path : str
            Path to the data. This is None if the dataset
            was added without a path.
        """
        if key not in self._names:
            return default
        return self._data_paths[key]

    def get_frame(self, key, default=None):
        """
        Get frame, given key.

        Parameters
        ----------
        key : str
            Name for the data.
        default
            The default argument, if the frame does not exist.
            Defaults to None.

        Returns
        -------
        frame : pd.DataFrame
            The DataFrame.
        """
        if key not in self._names:
            return default
        return self._dataframes[key]

    def keys(self):
        """
        Return keys as a list.

        Returns
        -------
        keys : list
            A list of keys in the DataContainer object, in insertion
            order. The list is a copy, so modifying it does not affect
            the container.
        """
        return list(self._names)

    def values(self):
        """
        Return values as a list.

        Returns
        -------
        values : list
            A list of values (frames) in the DataContainer object,
            in insertion order.
        """
        return [self._dataframes[name] for name in self._names]

    def items(self):
        """
        Return items as a list of tuples.

        Returns
        -------
        items : list of tuples
            A list of (key, value) tuples in the DataContainer object,
            in insertion order.
        """
        return [(name, self._dataframes[name]) for name in self._names]

    def __contains__(self, key):
        """
        Check if DataContainer object contains a given key.

        Parameters
        ----------
        key : str
            A key to check in the DataContainer object

        Returns
        -------
        key_check : bool
            True if key in DataContainer object, else False
        """
        return key in self._names

    def __getitem__(self, key):
        """
        Get frame, given key.

        Parameters
        ----------
        key : str
            Name for the data.

        Returns
        -------
        frame : pd.DataFrame
            The DataFrame, or None if the key does not exist.
            No exception is raised for a missing key because the
            lookup is delegated to ``get_frame()``.
        """
        return self.get_frame(key)

    def __len__(self):
        """
        Return the length of the DataContainer names.

        Returns
        -------
        length : int
            The length of the container (i.e. number of frames)
        """
        return len(self._names)

    def __str__(self):
        """
        String representation of the object.

        Returns
        -------
        container_names : str
            A comma-separated list of names from the container.
        """
        return ', '.join(self._names)

    def __add__(self, other):
        """
        Add two DataContainer objects together and return a new
        DataContainer object containing the datasets from both
        DataContainers.

        Parameters
        ----------
        other : DataContainer
            The container whose datasets are appended after
            the datasets of this container.

        Returns
        -------
        container : DataContainer
            A new container; neither operand is modified.

        Raises
        ------
        ValueError
            If the object being added is not a DataContainer
        KeyError
            If there are duplicate keys in the two DataContainers.
        """
        if not isinstance(other, DataContainer):
            raise ValueError('Object must be `DataContainer`, '
                             'not {}.'.format(type(other)))

        # Make sure there are no duplicate keys; sort them so that the
        # error message does not depend on set iteration order.
        common_keys = sorted(set(other._names).intersection(self._names))
        if common_keys:
            raise KeyError('The key(s) `{}` already exist '
                           'in the DataContainer.'.format(', '.join(common_keys)))

        dicts = DataContainer.to_datasets(self)
        dicts.extend(DataContainer.to_datasets(other))
        return DataContainer(dicts)

    def __iter__(self):
        """
        Iterate through the DataContainer keys.

        Yields
        ------
        key
            A key in the container dictionary
        """
        yield from self.keys()

    def copy(self, deep=True):
        """
        Create a copy of the DataContainer object

        Parameters
        ----------
        deep : bool, optional
            If True, create a deep copy of each frame.
            Defaults to True.

        Returns
        -------
        container : DataContainer
            A new container holding a copy of every frame
            together with the path recorded for it.
        """
        return DataContainer([{'name': name,
                               'path': self._data_paths[name],
                               'frame': self._dataframes[name].copy(deep=deep)}
                              for name in self._names])