"""
Classes for preprocessing input data in various contexts.
:author: Jeremy Biggs (jbiggs@ets.org)
:author: Anastassia Loukina (aloukina@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)
:date: 10/25/2017
:organization: ETS
"""
import logging
import re
import warnings
import numpy as np
import pandas as pd
from collections import defaultdict
from os.path import dirname, abspath
from numpy.random import RandomState
from rsmtool.configuration_parser import Configuration
from rsmtool.reader import DataReader
from rsmtool.container import DataContainer
from rsmtool.reporter import Reporter
from rsmtool.transformer import FeatureTransformer
from rsmtool.utils import convert_to_float
from rsmtool.utils import is_built_in_model, is_skll_model
class FeatureSubsetProcessor:
    """
    Encapsulate feature sub-setting methods.
    """

    @classmethod
    def select_by_subset(cls, feature_columns, feature_subset_specs, subset):
        """
        Select feature columns using feature subset specs.

        Parameters
        ----------
        feature_columns : list
            A list of feature columns
        feature_subset_specs : pd.DataFrame
            The feature subset spec DataFrame. It must contain a
            ``Feature`` column and a column named after ``subset``.
        subset : str
            The column to subset.

        Returns
        -------
        feature_names : list
            A list of feature names to include, in the same order
            as they appear in ``feature_columns``.
        """
        # features flagged with 1 in the requested subset column
        feature_subset = feature_subset_specs[feature_subset_specs[subset] == 1]['Feature']

        # build the lookup set once instead of scanning the array per feature
        subset_members = set(feature_subset)
        feature_names = [feature for feature in feature_columns
                         if feature in subset_members]

        # warn about data columns that have no subset information at all
        if len(feature_columns) != len(feature_names):
            feature_subset_specs_set = set(feature_subset_specs['Feature'])
            extra_columns = set(feature_columns).difference(feature_subset_specs_set)
            if extra_columns:
                logging.warning("No subset information was available for the "
                                "following columns in the input file. These "
                                "columns will not be used in the model: "
                                "{}".format(', '.join(sorted(extra_columns))))

        # warn about subset features that are absent from the data
        if len(feature_subset) != len(feature_names):
            extra_subset_features = subset_members.difference(feature_names)
            if extra_subset_features:
                logging.warning("The following features were included into the {} "
                                "subset in the feature_subset_file but were not "
                                "specified in the input data: "
                                "{}".format(subset, ', '.join(sorted(extra_subset_features))))

        return feature_names

    @classmethod
    def check_feature_subset_file(cls, df, subset=None, sign=None):
        """
        Check that the file is in the correct format and contains all
        the requested values. Raises an exception if it finds any errors
        but otherwise returns nothing.

        Parameters
        ----------
        df : pd.DataFrame
            The feature subset file DataFrame.
        subset : str, optional
            Name of a pre-defined feature subset.
            Defaults to None.
        sign : str, optional
            Value of the sign
            Defaults to None.

        Raises
        ------
        ValueError
            If any columns are missing from the subset file
            or if any of the columns contain invalid values.
        """
        # we want to allow title-cased names of columns for historical reasons
        # e.g., `Feature` instead of `feature` etc.
        if 'feature' not in df and 'Feature' not in df:
            raise ValueError("The feature_subset_file must contain "
                             "a column named 'feature' "
                             "containing the feature names.")

        if subset:
            if subset not in df:
                raise ValueError("Unknown value for feature_subset: {}".format(subset))

            if not df[subset].isin([0, 1]).all():
                raise ValueError("The subset columns in feature "
                                 "file can only contain 0 or 1")

        if sign:
            candidate_sign_columns = ('sign_{}'.format(sign),
                                      'Sign_{}'.format(sign))
            sign_columns = [name for name in candidate_sign_columns if name in df]
            if not sign_columns:
                raise ValueError("The feature_subset_file must "
                                 "contain the requested "
                                 "sign column 'sign_{}'".format(sign))

            # validate the sign column(s) themselves, not the subset column
            for sign_column in sign_columns:
                if not df[sign_column].isin(['-', '+']).all():
                    raise ValueError("The sign columns in feature "
                                     "file can only contain - or +")
class FeatureSpecsProcessor:
    """
    Encapsulate feature file processing methods.
    """

    @classmethod
    def normalize_and_validate_json(cls, feature_json):
        """
        Normalize the field names in `feature_json` in order to maintain
        backwards compatibility with old config files.

        Parameters
        ----------
        feature_json : dict
            JSON object containing the information
            specified in the feature file, possibly
            containing the old-style names for feature
            fields.

        Returns
        -------
        new_feature_json : dict
            JSON object with all old style names normalized to
            new style names.

        Raises
        ------
        KeyError
            If required fields are missing in the feature JSON file.
        """
        # stacklevel=2 attributes the warning to the calling code
        warnings.warn("The ``normalize_and_validate_json`` method is deprecated "
                      "and will be removed in the next release. Users can no longer "
                      "specify JSON feature files for the RSMTool experiments.",
                      category=DeprecationWarning,
                      stacklevel=2)

        # old-style field name -> new-style field name
        field_mapping = {'wt': 'sign',
                         'featN': 'feature',
                         'trans': 'transform'}
        required_fields = {'feature', 'sign', 'transform'}

        new_feature_json = defaultdict(list)

        # the list of features may be stored under either key
        feature_list = (feature_json['features'] if 'features' in feature_json
                        else feature_json['feats'])

        for feature_dict in feature_list:
            new_feature_dict = {field_mapping.get(field, field): value
                                for field, value in feature_dict.items()}

            missing_fields = required_fields.difference(new_feature_dict)
            if missing_fields:
                raise KeyError("The feature file does not "
                               "contain the following fields: "
                               "{}".format(','.join(missing_fields)))

            new_feature_json['features'].append(new_feature_dict)

        return new_feature_json

    @classmethod
    def generate_default_specs(cls, feature_names):
        """
        Generate default feature "specifications" for the features
        with the given names. The specifications are stored as a data frame with
        three columns "feature", "transform", and "sign".

        Parameters
        ----------
        feature_names: list
            List of feature names for which to generate specifications.

        Returns
        -------
        feature_specs: pandas DataFrame
            A dataframe with feature specifications that can be saved as a
            :ref:`feature list file <example_feature_csv>`.

        Note
        ----
        Since these are default specifications, the values for the
        `transform` column for each feature will be `"raw"` and the value
        for the `sign` column will be `1`.
        """
        df_feature_specs = pd.DataFrame({'feature': feature_names})
        df_feature_specs['transform'] = 'raw'
        df_feature_specs['sign'] = 1.0
        return df_feature_specs

    @classmethod
    def find_feature_sign(cls, feature, sign_dict):
        """
        Get the sign from the feature.csv file

        Parameters
        ----------
        feature : str
            The name of the feature
        sign_dict : dict
            A dictionary of feature signs.

        Returns
        -------
        feature_sign_numeric : float
            ``-1.0`` if the sign recorded for the feature is ``'-'``,
            ``1.0`` otherwise (including when the feature is unknown).
        """
        if feature not in sign_dict:
            logging.warning("No information about sign is available "
                            "for feature {}. The feature will be assigned "
                            "the default positive weight.".format(feature))
            return 1.0

        return -1.0 if sign_dict[feature] == '-' else 1.0

    @classmethod
    def validate_feature_specs(cls, df):
        """
        Check the supplied feature specs to make sure that there are no duplicate
        feature names and that all columns are in the right format. Add the default values
        for `transform` and `sign` if none is supplied

        Parameters
        ----------
        df : pd.DataFrame
            The feature specification DataFrame to validate.

        Returns
        ------
        df_specs_new : pandas DataFrame
            A data frame with normalized values, restricted to the
            columns ``feature``, ``sign`` and ``transform``.

        Raises
        ------
        KeyError :
            If the data frame does not have a ``feature`` column.
        ValueError:
            If there are duplicate values in the ``feature`` column
            or if the ``sign`` column contains invalid values.
        """
        df_specs_new = df.copy()

        # we allow internally the use of 'Feature' since
        # this is the column name in subset_feature_file.
        if 'Feature' in df:
            df_specs_new['feature'] = df['Feature']

        # check that we have a column named `feature`
        if 'feature' not in df_specs_new:
            raise KeyError("The feature file must contain a "
                           "column named 'feature'")

        # check to make sure that there are no duplicate feature names
        feature_name_count = df_specs_new['feature'].value_counts()
        duplicate_features = feature_name_count[feature_name_count > 1]
        if len(duplicate_features) > 0:
            duplicate_names = ', '.join(str(name) for name in duplicate_features.index)
            raise ValueError("The following feature names "
                             "are duplicated in the feature "
                             "file: {}".format(duplicate_names))

        # if we have `sign` column, check that it can be converted to float
        # and only holds +/-1. Explicit checks are used instead of `assert`
        # so the validation still runs under `python -O`.
        if 'sign' in df_specs_new:
            sign_error_message = ("The `sign` column in the feature "
                                  "file can only contain '1' or '-1'")
            try:
                df_specs_new['sign'] = df_specs_new['sign'].astype(float)
            except (ValueError, TypeError) as error:
                raise ValueError(sign_error_message) from error

            if not df_specs_new['sign'].isin([-1, 1]).all():
                raise ValueError(sign_error_message)
        else:
            df_specs_new['sign'] = 1

        if 'transform' not in df_specs_new:
            df_specs_new['transform'] = 'raw'

        df_specs_new = df_specs_new[['feature', 'sign', 'transform']]
        return df_specs_new

    @classmethod
    def generate_specs(cls,
                       df,
                       feature_names,
                       train_label,
                       feature_subset=None,
                       feature_sign=None):
        """
        Generate feature specifications using the features.csv
        for sign and the correlation with score to identify
        the best transformation.

        Parameters
        ----------
        df : pd.DataFrame
            The DataFrame from which to generate specs.
        feature_names : list
            A list of feature names.
        train_label : str
            The label column for the training data
        feature_subset : pd.DataFrame, optional
            A feature_subset_specs DataFrame. It is only read when
            ``feature_sign`` is given, in which case it must provide the
            columns ``Feature`` and ``Sign_<feature_sign>``.
        feature_sign : int, optional
            The sign of the feature.

        Returns
        -------
        df_feature_specs : pd.DataFrame
            A feature specifications DataFrame
        """
        # get feature sign info if available as a {feature: sign} dictionary,
        # otherwise fall back to an empty dictionary
        if feature_sign:
            sign_dict = dict(zip(feature_subset.Feature,
                                 feature_subset['Sign_{}'.format(feature_sign)]))
        else:
            sign_dict = {}

        feature_specs = []
        for feature in feature_names:
            transform = FeatureTransformer.find_feature_transform(feature,
                                                                  df[feature],
                                                                  df[train_label])
            sign = cls.find_feature_sign(feature, sign_dict)

            # Change the sign for inverse and addOneInv transformations
            if transform in ['inv', 'addOneInv']:
                sign = sign * -1

            feature_specs.append({'feature': feature,
                                  'transform': transform,
                                  'sign': sign})

        df_feature_specs = pd.DataFrame(feature_specs)
        return df_feature_specs
[docs]class FeaturePreprocessor:
"""
A class to pre-process training and testing features.
"""
@staticmethod
def check_model_name(model_name):
    """
    Validate a model name and report which family it belongs to.

    Parameters
    ----------
    model_name : str
        Name of the model.

    Returns
    -------
    model_type: str
        `BUILTIN` if ``is_built_in_model`` accepts the name,
        `SKLL` if ``is_skll_model`` accepts it.

    Raises
    ------
    ValueError
        If neither check recognizes the model name.
    """
    # built-in models take precedence over SKLL models
    if is_built_in_model(model_name):
        return 'BUILTIN'

    if is_skll_model(model_name):
        return 'SKLL'

    raise ValueError("The specified model {} "
                     "was not found. Please "
                     "check the spelling.".format(model_name))
@staticmethod
def trim(values,
         trim_min,
         trim_max,
         tolerance=0.49998):
    """
    Clamp the given values to the closed interval
    [`trim_min` - `tolerance`, `trim_max` + `tolerance`].

    Parameters
    ----------
    values : list or np.array
        The values to trim. A list is converted to a numpy array;
        any other input is used as is and must support ``.copy()``
        and boolean-mask assignment.
    trim_min : float
        The lowest score on the score point, used for
        trimming the raw regression predictions.
    trim_max : float
        The highest score on the score point, used for
        trimming the raw regression predictions.
    tolerance : float, optional
        The tolerance that will be used to compute the
        trim interval. Defaults to 0.49998.

    Returns
    -------
    trimmed_values : np.array
        A trimmed copy of the values; the input is not modified.
    """
    lower_bound = trim_min - tolerance
    upper_bound = trim_max + tolerance

    source = np.array(values) if isinstance(values, list) else values

    # work on a copy so that the caller's data stays untouched
    clamped = source.copy()
    clamped[clamped > upper_bound] = upper_bound
    clamped[clamped < lower_bound] = lower_bound
    return clamped
@staticmethod
def remove_outliers(values,
                    mean=None,
                    sd=None,
                    sd_multiplier=4):
    """
    Clamp any values in the given numpy array that are
    +/- `sd_multiplier` (:math:`m`) standard deviations (:math:`\\sigma`)
    away from the mean (:math:`\\mu`). Use given `mean` and `sd` instead
    of computing :math:`\\sigma` and :math:`\\mu`, if specified.
    The values are clamped to the interval:

    .. math::

        [\\mu - m * \\sigma, \\mu + m * \\sigma]

    Parameters
    ----------
    values : np.array
        The values from which to remove outliers.
    mean : int or float, optional
        Use the given mean value when computing outliers
        instead of the mean from the data. A value of 0 is
        honored; only ``None`` triggers computation from the data.
        Defaults to None
    sd : int or float, optional
        Use the given std. dev. value when computing
        outliers instead of the std. dev. from the
        data. A value of 0 is honored; only ``None`` triggers
        computation from the data.
        Defaults to None.
    sd_multiplier : int, optional
        Use the given multiplier for the std. dev. when
        computing the outliers. Defaults to 4.

    Returns
    -------
    new_values : np.array
        Numpy float array with the outliers clamped.
    """
    # convert data to a numpy float array before doing any clamping;
    # the builtin `float` is used since the `np.float` alias no longer
    # exists in recent numpy releases
    new_values = np.array(values, dtype=float)

    # compare against None explicitly so that a legitimate
    # mean or std. dev. of 0 is not silently replaced
    if mean is None:
        mean = new_values.mean()
    if sd is None:
        sd = new_values.std()

    floor = mean - sd_multiplier * sd
    ceiling = mean + sd_multiplier * sd

    new_values[new_values > ceiling] = ceiling
    new_values[new_values < floor] = floor
    return new_values
@staticmethod
def select_candidates(df,
                      N,
                      candidate_col='candidate'):
    """
    Split responses by whether their candidate has at least N responses.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame from which to select candidates with N or more items.
    N: int
        minimal number of items per candidate
    candidate_col : str, optional
        name of the column which contains candidate ids.
        Defaults to 'candidate'.

    Returns
    -------
    df_included: pandas DataFrame
        Data frame with responses from candidates with responses to N
        or more items
    df_excluded: pandas DataFrame
        Data frame with responses from candidates with responses to
        less than N items

    Note
    ----
    Both returned frames are copies with a fresh 0-based index.
    """
    # number of rows (responses) available for each candidate id
    response_counts = df[candidate_col].value_counts()
    eligible_ids = response_counts[response_counts >= N].index

    # one boolean mask is enough to carve out both partitions
    is_eligible = df[candidate_col].isin(eligible_ids)

    df_included = df[is_eligible].copy().reset_index(drop=True)
    df_excluded = df[~is_eligible].copy().reset_index(drop=True)

    return (df_included, df_excluded)
@staticmethod
def check_subgroups(df, subgroups):
    """
    Verify that every requested subgroup is a column of the data frame
    and replace empty or whitespace-only subgroup values with 'No info'.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with subgroups to check. The subgroup columns of
        this frame are overwritten in place.
    subgroups : list of str
        List of column names that contain grouping
        information.

    Returns
    -------
    df : pandas DataFrame
        The same data frame object that was passed in, with the
        blank subgroup values replaced.

    Raises
    ------
    KeyError
        If the data does not contain columns for all subgroups

    Note
    ----
    NOTE(review): the replacement pattern only matches strings, so
    genuine NaN cells appear to be left untouched -- confirm against
    how the data reader encodes missing values.
    """
    absent_columns = set(subgroups).difference(df.columns)
    if absent_columns:
        raise KeyError("The data does not contain columns "
                       "for all subgroups specified in the "
                       "configuration file. Please check for "
                       "capitalization and other spelling "
                       "errors and make sure the subgroup "
                       "names do not contain hyphens. "
                       "The data does not have columns "
                       "for the following "
                       "subgroups: {}".format(', '.join(absent_columns)))

    # a value counts as blank if it is empty or consists of whitespace only
    blank_pattern = re.compile(r"^\s*$")
    cleaned_subgroups = df[subgroups].replace(to_replace=blank_pattern,
                                              value='No info')
    df[subgroups] = cleaned_subgroups
    return df
@staticmethod
def rename_default_columns(df,
                           requested_feature_names,
                           id_column,
                           first_human_score_column,
                           second_human_score_column,
                           length_column,
                           system_score_column,
                           candidate_column):
    """
    Rename the user-specified columns to their standard internal names
    and move any unrelated column that happens to carry one of those
    reserved names out of the way by renaming it to ##NAME##.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame whose columns to rename.
    requested_feature_names : list
        List of feature column names that we want
        to include in the scoring model.
    id_column : str
        Column name containing the response IDs.
    first_human_score_column : str or None
        Column name containing the H1 scores.
    second_human_score_column : str or None
        Column name containing the H2 scores.
        Should be None if no H2 scores are available.
    length_column : str or None
        Column name containing response lengths.
        Should be None if lengths are not available.
    system_score_column : str
        Column name containing the score predicted
        by the system. This is only used for RSMEval.
    candidate_column : str or None
        Column name containing identifying information
        at the candidate level. Should be None if such
        information is not available.

    Returns
    -------
    df : pandas DataFrame
        A renamed copy of the input data frame; the input
        itself is left unchanged.
    """
    renamed = df.copy()

    # reserved internal names, listed in the same order as the
    # user-supplied column arguments they correspond to
    reserved_names = ['spkitemid', 'sc1', 'sc2', 'length', 'raw', 'candidate']
    supplied_names = [id_column,
                      first_human_score_column,
                      second_human_score_column,
                      length_column,
                      system_score_column,
                      candidate_column]

    # supplied name -> reserved name, skipping the columns not in use
    name_mapping = {supplied: reserved
                    for supplied, reserved in zip(supplied_names, reserved_names)
                    if supplied is not None}

    # columns that already carry the reserved name they map to
    already_standard = [supplied for supplied, reserved in name_mapping.items()
                        if supplied == reserved]

    # data columns that occupy a reserved name without being entitled
    # to it and that are not used as features in the model
    clashing = [name for name in renamed.columns
                if (name in reserved_names and
                    name not in already_standard and
                    name not in requested_feature_names)]

    # park the clashing columns under ##NAME##
    if clashing:
        parked_names = {name: '##{}##'.format(name) for name in clashing}
        renamed = renamed.rename(columns=parked_names)

    # now give every custom-named column its reserved name, one at a time
    for supplied, reserved in name_mapping.items():
        if supplied in already_standard:
            continue

        # a custom-named column may itself have been parked above
        # if its name was reserved for a different column
        current_name = '##{}##'.format(supplied) if supplied in clashing else supplied
        renamed = renamed.rename(columns={current_name: reserved})

    return renamed
@staticmethod
def filter_on_column(df,
                     column,
                     id_column,
                     exclude_zeros=False,
                     exclude_zero_sd=False):
    """
    Filter out the rows in the given data frame that contain non-numeric
    (or zero, if specified) values in the specified column. Additionally,
    it may exclude any columns if they have a standard deviation
    (:math:`\\sigma`) of 0.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to filter on.
    column : str
        Name of the column from which to filter out values.
    id_column : str
        Name of the column containing the unique response IDs.
        It is accepted for interface compatibility but is not
        used by the filtering itself.
    exclude_zeros : bool, optional
        Whether to exclude responses containing zeros
        in the specified column. Defaults to `False`.
    exclude_zero_sd : bool, optional
        Whether to perform the additional filtering step of removing
        columns that have :math:`\\sigma = 0`. Defaults to `False`.

    Returns
    -------
    df_filtered : pandas DataFrame
        Data frame containing the responses that were *not* filtered out.
    df_excluded : pandas DataFrame
        Data frame containing the non-numeric or zero responses that
        were filtered out.

    Note
    ----
    The columns with :math:`\\sigma=0` are removed from both output
    data frames.

    If ``column`` is not present in ``df`` at all, a single data frame
    (a copy of ``df``) is returned instead of a tuple. This historical
    behavior is kept so that existing callers are not affected.
    """
    # create a copy of the original data frame
    df_filter = df.copy()

    # return a copy of the original data frame if
    # the given column does not exist at all
    if column not in df.columns:
        return df_filter

    # Force convert the label column to numeric and
    # convert whatever can't be converted to a NaN
    df_filter[column] = pd.to_numeric(df_filter[column],
                                      errors='coerce').astype(float)

    # Rows whose value could not be converted (NaN) or is infinite are
    # saved as a separate data frame. We want to keep them as NaNs
    # to do more analyses later. Since inf values can only be generated
    # during transformations we treat them like NaNs for consistency.
    is_bad = df_filter[column].isnull() | np.isinf(df_filter[column])
    bad_rows = df_filter[is_bad]
    df_filter = df_filter[~is_bad]

    # exclude zeros if specified; the excluded rows are taken from
    # the original frame so they keep their unconverted values
    if exclude_zeros:
        zero_indices = df_filter[df_filter[column] == 0].index.values
        zero_rows = df.loc[zero_indices]
        df_filter = df_filter[df_filter[column] != 0]
    else:
        zero_rows = pd.DataFrame()

    # combine all the filtered rows into a single data frame
    df_exclude = pd.concat([bad_rows, zero_rows])

    # reset the index so that the indexing works correctly
    # for the next feature with missing values
    df_filter.reset_index(drop=True, inplace=True)
    df_exclude.reset_index(drop=True, inplace=True)

    # Drop this column if the standard deviation equals zero:
    # for training set sd == 0 will break normalization.
    # We use an absolute tolerance of 1e-07 to account for
    # the possibility that the exact value computed by std is not 0
    if exclude_zero_sd:
        feature_sd = df_filter[column].std()
        if np.isclose(feature_sd, 0, atol=1e-07):
            logging.info("Feature {} was excluded from the model"
                         " because its standard deviation in the "
                         "training set is equal to 0.".format(column))
            # `axis` must be passed by keyword: recent pandas releases
            # no longer accept it as a positional argument
            df_filter = df_filter.drop(column, axis=1)
            df_exclude = df_exclude.drop(column, axis=1)

    # return the filtered rows and the new data frame
    return (df_filter, df_exclude)
@staticmethod
def process_predictions(df_test_predictions,
                        train_predictions_mean,
                        train_predictions_sd,
                        human_labels_mean,
                        human_labels_sd,
                        trim_min,
                        trim_max):
    """
    Derive the scaled, trimmed and rounded variants of the raw
    predictions stored in the ``raw`` column.

    Parameters
    ----------
    df_test_predictions : pd.DataFrame
        Data frame containing the test set predictions.
    train_predictions_mean : float
        The mean of the predictions on the training set.
    train_predictions_sd : float
        The std. dev. of the predictions on the training
        set.
    human_labels_mean : float
        The mean of the human scores used to train the
        model.
    human_labels_sd : float
        The std. dev. of the human scores used to train
        the model.
    trim_min : float
        The lowest score on the score point, used for
        trimming the raw regression predictions.
    trim_max : float
        The highest score on the score point, used for
        trimming the raw regression predictions.

    Returns
    -------
    df_pred_processed : pd.DataFrame
        A copy of the input with the added columns ``scale``,
        ``raw_trim``, ``raw_trim_round``, ``scale_trim`` and
        ``scale_trim_round``.
    """
    # standardize the raw predictions with the training-set statistics,
    # then re-express them on the scale of the human scores
    standardized = ((df_test_predictions['raw'] - train_predictions_mean) /
                    train_predictions_sd)
    rescaled = standardized * human_labels_sd + human_labels_mean

    df_pred_process = df_test_predictions.copy()
    df_pred_process['scale'] = rescaled

    # trim and round both prediction types before running the analyses
    for prediction_type in ('raw', 'scale'):
        trim_column = prediction_type + '_trim'
        round_column = prediction_type + '_trim_round'

        df_pred_process[trim_column] = FeaturePreprocessor.trim(df_pred_process[prediction_type],
                                                                trim_min,
                                                                trim_max)
        rounded = np.rint(df_pred_process[trim_column])
        df_pred_process[round_column] = rounded.astype('int64')

    return df_pred_process
def filter_on_flag_columns(self,
                           df,
                           flag_column_dict):
    """
    Keep only the responses whose flag column values are all among
    the accepted values listed in `flag_column_dict`. Flag values are
    compared after being passed through ``convert_to_float``.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to filter on.
    flag_column_dict : dict
        Dictionary containing the flag column
        information: each key is a column name and each value
        is the collection of accepted values for that column.

    Returns
    -------
    df_responses_with_requested_flags : pandas DataFrame
        Data frame containing the responses remaining
        after filtering using the specified flag
        columns.
    df_responses_with_excluded_flags : pandas DataFrame
        Data frame containing the responses filtered
        out using the specified flag columns.

    Raises
    ------
    KeyError
        If the columns listed in the dictionary are
        not actually present in the data frame.
    ValueError
        If no responses remain after filtering based
        on the flag column information.
    """
    df = df.copy()
    flag_columns = list(flag_column_dict.keys())

    # nothing to filter on: everything is kept, nothing is excluded
    if not flag_columns:
        return df.copy(), pd.DataFrame(columns=df.columns)

    # check that all columns are present
    absent_flag_columns = set(flag_columns).difference(df.columns)
    if absent_flag_columns:
        raise KeyError("The data does not contain columns "
                       "for all flag columns specified in the "
                       "configuration file. Please check for "
                       "capitalization and other spelling "
                       "errors and make sure the flag column "
                       "names do not contain hyphens. "
                       "The data does not have the following columns: "
                       "{}".format(', '.join(absent_flag_columns)))

    # since a flag column may be a mix of strings and numeric values,
    # both the accepted values and the data are normalized with
    # `convert_to_float` so that, for example, "1", 1, and "1.0"
    # are all compared as 1.0
    accepted_values = {}
    for flag_name, raw_values in flag_column_dict.items():
        accepted_values[flag_name] = list(map(convert_to_float, raw_values))

    normalized_flags = df[flag_columns].copy().applymap(convert_to_float)

    # a response is kept only if every one of its flag values is accepted
    cellwise_match = normalized_flags.isin(accepted_values)
    keep_mask = cellwise_match[list(accepted_values.keys())].all(axis=1)

    # slice the original frame that was passed in so that
    # all data types remain the same and are not changed
    df_responses_with_requested_flags = df[keep_mask].copy()
    df_responses_with_excluded_flags = df[~keep_mask].copy()

    # make sure that the remaining data frame is not empty
    if len(df_responses_with_requested_flags) == 0:
        raise ValueError("No responses remaining after filtering "
                         "on flag columns. No further analysis can "
                         "be run.")

    # reset the index
    df_responses_with_requested_flags = df_responses_with_requested_flags.reset_index(drop=True)
    df_responses_with_excluded_flags = df_responses_with_excluded_flags.reset_index(drop=True)

    return (df_responses_with_requested_flags,
            df_responses_with_excluded_flags)
def generate_feature_names(self,
                           df,
                           reserved_column_names,
                           feature_subset_specs,
                           feature_subset):
    """
    Generate the feature names from the column
    names of the given data frame and select the
    specified subset of features.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame from which to generate feature names.
        Only its column labels are read.
    reserved_column_names : list
        Names of reserved columns.
    feature_subset_specs : pd.DataFrame
        Feature subset specs. Only used when ``feature_subset``
        is not None.
    feature_subset : str
        Feature subset column. If None, every non-reserved
        column is treated as a feature.

    Returns
    -------
    feature_names : list
        A list of features names.
    """
    # Exclude the reserved names. The frame is only inspected for its
    # column labels, so there is no need to copy its data first.
    possible_feature_names = [cname for cname in df.columns
                              if cname not in reserved_column_names]

    if feature_subset is None:
        return possible_feature_names

    # Select the features by subset.
    # In the future, we may add option to select
    # by other methods, if needed.
    return FeatureSubsetProcessor.select_by_subset(possible_feature_names,
                                                   feature_subset_specs,
                                                   feature_subset)
def preprocess_feature(self,
                       values,
                       feature_name,
                       feature_transform,
                       feature_mean,
                       feature_sd,
                       exclude_zero_sd=False,
                       raise_error=True):
    """
    Clamp the outliers in the given feature values and then apply the
    requested transformation to the clamped values.

    Parameters
    ----------
    values : np.array
        The feature values to preprocess
    feature_name : str
        Name of the feature being pre-processed.
    feature_transform : str
        Name of the transformation function to apply.
    feature_mean : float
        Mean value to use for outlier detection instead
        of the mean of the given feature values.
    feature_sd : float
        Std. dev. value to use for outlier detection instead
        of the std. dev. of the given feature values.
    exclude_zero_sd : bool, optional
        Whether to check that the transformed values do not
        have a zero std. dev.
        Defaults to False.
    raise_error : bool, optional
        Passed through to ``FeatureTransformer.transform_feature``:
        raise error if any of the transformations lead to inf values
        or may change the ranking of feature values.
        Defaults to True

    Returns
    -------
    transformed_feature : numpy array
        Numpy array containing the transformed and clamped
        feature values.

    Raises
    ------
    ValueError
        If the given values have zero standard deviation and
        `exclude_zero_sd` is set to `True`.
    """
    # clamp any outlier values using the default multiplier
    # of `remove_outliers` and the supplied mean and std. dev.
    clamped_values = self.remove_outliers(values,
                                          mean=feature_mean,
                                          sd=feature_sd)

    # apply the requested transformation to the feature
    transformed_feature = FeatureTransformer.transform_feature(clamped_values,
                                                               feature_name,
                                                               feature_transform,
                                                               raise_error=raise_error)

    if not exclude_zero_sd:
        return transformed_feature

    # check the standard deviation of the transformed feature:
    # ddof is set to 1 so that np.std gives the same result as pandas .std
    # and a tolerance is used to account for cases where the std. dev.
    # is computed as a very low decimal rather than 0
    transformed_sd = np.std(transformed_feature, ddof=1)
    if np.isclose(transformed_sd, 0, atol=1e-07):
        raise ValueError("The standard deviation for "
                         "feature {} is 0 after pre-processing. "
                         "Please exclude this feature and re-run "
                         "the experiment.".format(feature_name))

    return transformed_feature
def preprocess_features(self,
                        df_train,
                        df_test,
                        df_feature_specs,
                        standardize_features=True):
    """
    Pre-process the features in the given training and test data
    frames whose specifications are contained in `df_feature_specs`.

    Every feature listed in `df_feature_specs` is passed through
    `preprocess_feature()` using the mean and standard deviation of
    the raw *training* values, optionally standardized using the mean
    and standard deviation of the pre-processed *training* values, and
    finally multiplied by its sign. A third data frame describing the
    parameters used for each feature is returned as well.

    Parameters
    ----------
    df_train : pandas DataFrame
        Data frame containing the raw feature values
        for the training set.
    df_test : pandas DataFrame
        Data frame containing the raw feature values
        for the test set.
    df_feature_specs : pandas DataFrame
        Data frame containing the various specifications
        from the feature file. The columns `feature`, `transform`
        and `sign` are read. Note that the index of this frame is
        set to its `feature` column in place.
    standardize_features : bool
        Whether to standardize the features
        Defaults to True.

    Returns
    -------
    df_train_preprocessed : pd.DataFrame
        DataFrame with preprocessed training data
    df_test_preprocessed : pd.DataFrame
        DataFrame with preprocessed test data
    df_feature_info : pd.DataFrame
        DataFrame with one row per feature and the columns
        `feature`, `transform`, `sign`, `train_mean`, `train_sd`,
        `train_transformed_mean` and `train_transformed_sd`.

    Raises
    ------
    ValueError
        Propagated from `preprocess_feature()`, which is called with
        `exclude_zero_sd=True` for the training values.
    """
    # work on copies so that the data frames passed in by the
    # caller keep their raw feature values
    df_train_preprocessed = df_train.copy()
    df_test_preprocessed = df_test.copy()

    # make feature the index of df_feature_specs so that the `.at`
    # lookups below work by feature name
    # NOTE: this re-indexes the caller's frame in place; kept as-is
    # because callers may rely on it
    df_feature_specs.index = df_feature_specs['feature']

    # the per-feature metadata is collected as plain dictionaries and
    # turned into a data frame once at the end; growing a data frame
    # row by row is quadratic and relies on `DataFrame.append()`,
    # which no longer exists in pandas 2.0
    feature_info_columns = ['feature',
                            'transform',
                            'sign',
                            'train_mean',
                            'train_sd',
                            'train_transformed_mean',
                            'train_transformed_sd']
    feature_info_rows = []

    # now iterate over each feature
    for feature_name in df_feature_specs['feature']:

        feature_transformation = df_feature_specs.at[feature_name, 'transform']
        feature_sign = df_feature_specs.at[feature_name, 'sign']

        # the raw training mean and sd are used for both
        # the training and the test values
        train_feature_mean = df_train[feature_name].mean()
        train_feature_sd = df_train[feature_name].std()

        training_feature_values = df_train[feature_name].values
        df_train_preprocessed[feature_name] = self.preprocess_feature(training_feature_values,
                                                                      feature_name,
                                                                      feature_transformation,
                                                                      train_feature_mean,
                                                                      train_feature_sd,
                                                                      exclude_zero_sd=True)

        testing_feature_values = df_test[feature_name].values
        df_test_preprocessed[feature_name] = self.preprocess_feature(testing_feature_values,
                                                                     feature_name,
                                                                     feature_transformation,
                                                                     train_feature_mean,
                                                                     train_feature_sd)

        # Standardize the features using the mean and sd computed on the
        # training set. These are computed separately because we need to
        # get the mean of transformed feature before standardization.
        train_transformed_mean = df_train_preprocessed[feature_name].mean()
        train_transformed_sd = df_train_preprocessed[feature_name].std()

        if standardize_features:
            df_train_without_mean = (df_train_preprocessed[feature_name] -
                                     train_transformed_mean)
            df_train_preprocessed[feature_name] = df_train_without_mean / train_transformed_sd

            df_test_without_mean = (df_test_preprocessed[feature_name] -
                                    train_transformed_mean)
            df_test_preprocessed[feature_name] = df_test_without_mean / train_transformed_sd

        # Multiply both train and test feature by sign.
        df_train_preprocessed[feature_name] = (df_train_preprocessed[feature_name] *
                                               feature_sign)
        df_test_preprocessed[feature_name] = (df_test_preprocessed[feature_name] *
                                              feature_sign)

        # remember the preprocessing metadata for this feature
        feature_info_rows.append({"feature": feature_name,
                                  "transform": feature_transformation,
                                  "sign": feature_sign,
                                  "train_mean": train_feature_mean,
                                  "train_sd": train_feature_sd,
                                  "train_transformed_mean": train_transformed_mean,
                                  "train_transformed_sd": train_transformed_sd})

    # build the metadata frame in one go; it gets a fresh 0..n-1 index
    df_feature_info = pd.DataFrame(feature_info_rows,
                                   columns=feature_info_columns)

    # return the three data frames
    return (df_train_preprocessed,
            df_test_preprocessed,
            df_feature_info)
def filter_data(self,
                df,
                label_column,
                id_column,
                length_column,
                second_human_score_column,
                candidate_column,
                requested_feature_names,
                reserved_column_names,
                given_trim_min,
                given_trim_max,
                flag_column_dict,
                subgroups,
                exclude_zero_scores=True,
                exclude_zero_sd=False,
                feature_subset_specs=None,
                feature_subset=None,
                min_candidate_items=None,
                use_fake_labels=False):
    """
    Filter the data to remove rows that have zero/non-numeric values
    for `label_column`. If `requested_feature_names` are specified,
    check whether any of the requested features are missing from the
    data. If no feature names are requested, these are generated based
    on column names and subset information, if available. The function
    then excludes non-numeric values for any feature. If the user
    requested to exclude candidates with less than
    `min_candidate_items` responses, such candidates are excluded.
    It also generates fake labels (between 1 and 10 unless trim values
    are given) if `use_fake_labels` is set to True. Finally, it renames
    the id and label column and splits the data into the data frame
    with feature values and score label, the data frame with
    information about subgroup and candidate (metadata) and the data
    frame with all other columns.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to filter. If `id_column` and `candidate_column`
        name the same column, a duplicate column is added to this
        frame in place.
    label_column : str
        The label column in the data.
    id_column : str
        The ID column in the data.
    length_column : str
        The length column in the data.
    second_human_score_column : str
        The second human score column in the data.
    candidate_column : str
        The candidate column in the data.
    requested_feature_names : list
        A list of requested feature names.
    reserved_column_names : list
        A list of reserved column names.
    given_trim_min : int
        The minimum trim value.
    given_trim_max : int
        The maximum trim value.
    flag_column_dict : dict
        A dictionary of flag columns.
    subgroups : list, optional
        A list of subgroups, if any.
    exclude_zero_scores : bool
        Whether to exclude zero scores.
        Defaults to True.
    exclude_zero_sd : bool, optional
        Whether to exclude zero standard deviation.
        Defaults to False.
    feature_subset_specs : pd.DataFrame, optional
        The feature_subset_specs DataFrame
        Defaults to None.
    feature_subset : str, optional
        The feature subset group (e.g. 'A').
        Defaults to None.
    min_candidate_items : int, optional
        The minimum number of items needed to include candidate.
        Defaults to None
    use_fake_labels : bool, optional
        Whether to use fake labels.
        Defaults to False.

    Returns
    -------
    df_filtered_features : pd.DataFrame
        DataFrame with filtered features
    df_filtered_metadata : pd.DataFrame
        DataFrame with filtered metadata
    df_filtered_other_columns : pd.DataFrame
        DataFrame with other columns filtered
    df_excluded : pd.DataFrame
        DataFrame with excluded records
    df_filtered_length : pd.DataFrame
        DataFrame with length column(s) filtered
    df_filtered_human_scores : pd.DataFrame
        DataFrame with human scores filtered
    df_responses_with_excluded_flags : pd.DataFrame
        A DataFrame containing responses with excluded flags
    trim_min : float
        The minimum trim value
    trim_max : float
        The maximum trim value
    feature_names : list
        A list of feature names

    Raises
    ------
    KeyError
        If columns named in the configuration or features named in
        the feature file do not exist in the data.
    ValueError
        If the response IDs are not unique, if a feature uses a
        reserved column name, or if no responses (or candidates)
        remain after filtering.
    """
    # make sure that the columns specified in the
    # config file actually exist
    columns_to_check = [id_column, label_column]
    for optional_column in (length_column,
                            second_human_score_column,
                            candidate_column):
        if optional_column:
            columns_to_check.append(optional_column)

    missing_columns = set(columns_to_check).difference(df.columns)
    if missing_columns:
        raise KeyError("Columns {} from the config file "
                       "do not exist in the data.".format(missing_columns))

    # it is possible for the `id_column` and `candidate_column` to be
    # set to the same column name in the CSV file, e.g., if there is
    # only one response per candidate. If this happens, we need to
    # create a duplicate column for candidates or id for the downstream
    # processing to work as usual.
    if id_column == candidate_column:

        # if the name for both columns is `candidate`, we need to
        # create a separate id_column name
        if id_column == 'candidate':
            df['spkitemid'] = df['candidate'].copy()
            id_column = 'spkitemid'

        # else we create a separate `candidate` column
        else:
            df['candidate'] = df[id_column].copy()
            candidate_column = 'candidate'

    df = self.rename_default_columns(df,
                                     requested_feature_names,
                                     id_column,
                                     label_column,
                                     second_human_score_column,
                                     length_column,
                                     None,
                                     candidate_column)

    # check that the id_column contains unique values
    if df['spkitemid'].size != df['spkitemid'].unique().size:
        raise ValueError("The data contains duplicate response IDs in "
                         "'{}'. Please make sure all response IDs are "
                         "unique and re-run the tool.".format(id_column))

    # Generate feature names if no specific features were requested by the user
    if len(requested_feature_names) == 0:
        feature_names = self.generate_feature_names(df,
                                                    reserved_column_names,
                                                    feature_subset_specs=feature_subset_specs,
                                                    feature_subset=feature_subset)
    else:
        feature_names = requested_feature_names

    # make sure that feature names do not contain reserved column names
    illegal_feature_names = set(feature_names).intersection(reserved_column_names)
    if illegal_feature_names:
        raise ValueError("The following reserved "
                         "column names cannot be "
                         "used as feature names: '{}'. "
                         "Please rename these columns "
                         "and re-run the "
                         "experiment.".format(', '.join(illegal_feature_names)))

    # check to make sure that the subgroup columns are all present
    df = self.check_subgroups(df, subgroups)

    # filter out the responses based on flag columns
    (df_responses_with_requested_flags,
     df_responses_with_excluded_flags) = self.filter_on_flag_columns(df, flag_column_dict)

    # NOTE(review): the trim values below are tested for truthiness,
    # so an explicitly given value of 0 is treated as "not given";
    # confirm against the callers before switching to `is not None`

    # filter out the rows that have non-numeric or zero labels
    # unless we are going to generate fake labels in the first place
    if not use_fake_labels:
        (df_filtered,
         df_excluded) = self.filter_on_column(df_responses_with_requested_flags,
                                              'sc1',
                                              'spkitemid',
                                              exclude_zeros=exclude_zero_scores)

        # make sure that the remaining data frame is not empty
        if len(df_filtered) == 0:
            raise ValueError("No responses remaining after filtering out "
                             "non-numeric human scores. No further analysis "
                             "can be run. ")

        trim_min = given_trim_min if given_trim_min else df_filtered['sc1'].min()
        trim_max = given_trim_max if given_trim_max else df_filtered['sc1'].max()
    else:
        df_filtered = df_responses_with_requested_flags.copy()

        # nothing is excluded on the basis of the labels here, but the
        # code below still needs a frame to accumulate exclusions in;
        # an empty slice keeps the column names and dtypes
        df_excluded = df_responses_with_requested_flags.iloc[0:0].copy()

        trim_min = given_trim_min if given_trim_min else 1
        trim_max = given_trim_max if given_trim_max else 10
        logging.info("Generating labels randomly "
                     "from [{}, {}]".format(trim_min, trim_max))
        randgen = RandomState(seed=1234567890)

        # the label column has already been renamed to 'sc1' above and
        # that is the column used from here on, so the fake labels
        # must be stored there; `randint` excludes its upper bound,
        # hence the `+ 1` to keep `trim_max` a possible label
        df_filtered['sc1'] = randgen.randint(int(trim_min),
                                             int(trim_max) + 1,
                                             size=len(df_filtered))

    # make sure there are no missing features in the data
    missing_features = set(feature_names).difference(df_filtered.columns)
    if missing_features:
        raise KeyError("DataFrame does not contain "
                       "columns for all features specified in "
                       "the feature file. Please check for "
                       "capitalization and other spelling "
                       "errors and make sure the feature "
                       "names do not contain hyphens. "
                       "The data does not have columns "
                       "for the following features: "
                       "{}".format(', '.join(missing_features)))

    # make sure all features selected for model building are numeric
    # and also replace any non-numeric feature values in already
    # excluded data with NaNs for consistency
    for feat in feature_names:
        df_excluded[feat] = pd.to_numeric(df_excluded[feat],
                                          errors='coerce').astype(float)
        df_filtered, newdf_excluded = self.filter_on_column(df_filtered,
                                                            feat,
                                                            'spkitemid',
                                                            exclude_zeros=False,
                                                            exclude_zero_sd=exclude_zero_sd)
        with np.errstate(divide='ignore'):
            df_excluded = pd.merge(df_excluded, newdf_excluded, how='outer')

    # make sure that the remaining data frame is not empty
    if len(df_filtered) == 0:
        raise ValueError("No responses remaining after filtering "
                         "out non-numeric feature values. No further "
                         "analysis can be run.")

    # Raise warning if we excluded features that were
    # specified in the .json file because sd == 0.
    omitted_features = set(requested_feature_names).difference(df_filtered.columns)
    if omitted_features:
        logging.warning("The following requested features "
                        "were excluded because their standard "
                        "deviation on the training set was 0: {}.\n"
                        "Please edit the feature file to exclude "
                        "these features and re-run the "
                        "tool".format(', '.join(omitted_features)))

    # Update the feature names
    feature_names = [feature for feature in feature_names
                     if feature in df_filtered]

    # if ``length_column`` exists, make sure it's converted to numeric;
    # values that cannot be coerced to numeric will be set to ``np.nan``
    if length_column:
        df_filtered['length'] = pd.to_numeric(df_filtered['length'], errors='coerce')

    # check the values for length column. We do this after filtering
    # to make sure we have removed responses that have not been
    # processed correctly. If the column cannot be used, rename it to
    # ##ORIGINAL_NAME## so that it ends up with the other columns.
    if (length_column and
            (len(df_filtered[df_filtered['length'].isnull()]) != 0 or
             df_filtered['length'].std() <= 0)):
        logging.warning("The {} column either has missing values or a standard"
                        " deviation <= 0. No length-based analysis will be"
                        " provided. The column will be renamed as ##{}## and"
                        " saved in *train_other_columns.csv.".format(length_column,
                                                                     length_column))
        df_filtered.rename(columns={'length': '##{}##'.format(length_column)},
                           inplace=True)

    # if requested, exclude the candidates with less than X responses
    # left after filtering
    if min_candidate_items:
        (df_filtered_candidates,
         df_excluded_candidates) = self.select_candidates(df_filtered,
                                                          min_candidate_items)
        # check that there are still responses left for analysis
        if len(df_filtered_candidates) == 0:
            raise ValueError("After filtering non-numeric scores and "
                             "non-numeric feature values there were "
                             "no candidates with {} or more responses "
                             "left for analysis".format(min_candidate_items))

        # redefine df_filtered
        df_filtered = df_filtered_candidates.copy()

        # update df_excluded
        df_excluded = pd.concat([df_excluded, df_excluded_candidates])

    # create separate data-frames for features and sc1, all other
    # information, and responses excluded during filtering
    not_other_columns = set()

    feature_columns = ['spkitemid', 'sc1'] + feature_names
    df_filtered_features = df_filtered[feature_columns]
    not_other_columns.update(feature_columns)

    metadata_columns = ['spkitemid'] + subgroups
    if candidate_column:
        metadata_columns.append('candidate')
    df_filtered_metadata = df_filtered[metadata_columns]
    not_other_columns.update(metadata_columns)

    # the length frame stays empty if the length column
    # was not requested or was renamed above
    df_filtered_length = pd.DataFrame()
    length_columns = ['spkitemid', 'length']
    if length_column and 'length' in df_filtered:
        df_filtered_length = df_filtered[length_columns]
        not_other_columns.update(length_columns)

    df_filtered_human_scores = pd.DataFrame()
    human_score_columns = ['spkitemid', 'sc1', 'sc2']
    if second_human_score_column and 'sc2' in df_filtered:
        df_filtered_human_scores = df_filtered[human_score_columns].copy()
        not_other_columns.update(['sc2'])

        # filter out any non-numeric value rows
        # as well as zeros, if we were asked to
        df_filtered_human_scores['sc2'] = pd.to_numeric(df_filtered_human_scores['sc2'],
                                                        errors='coerce').astype(float)
        if exclude_zero_scores:
            df_filtered_human_scores['sc2'] = df_filtered_human_scores['sc2'].replace(0,
                                                                                      np.nan)

    # now extract all other columns and add 'spkitemid'
    other_columns = ['spkitemid'] + [column for column in df_filtered.columns
                                     if column not in not_other_columns]
    df_filtered_other_columns = df_filtered[other_columns]

    return (df_filtered_features,
            df_filtered_metadata,
            df_filtered_other_columns,
            df_excluded,
            df_filtered_length,
            df_filtered_human_scores,
            df_responses_with_excluded_flags,
            trim_min,
            trim_max,
            feature_names)
def process_data_rsmeval(self, config_obj, data_container_obj):
    """
    The main function that sets up an rsmeval experiment by reading
    the predictions frame from the data container and preprocessing
    it according to the configuration. Raises appropriate exceptions.

    Parameters
    ----------
    config_obj : configuration_parser.Configuration
        A configuration object.
    data_container_obj : container.DataContainer
        A data container object. Its `predictions` frame is
        processed and its `scale` frame, if present, provides the
        values used for scaling the predictions.

    Returns
    -------
    config_obj : configuration_parser.Configuration
        A new configuration object.
    data_container : container.DataContainer
        A new data container object.

    Raises
    ------
    KeyError
        If columns named in the configuration do not exist in the
        predictions, or if the frame used for scaling does not have
        both the `prediction` and the `sc1` columns.
    ValueError
        If the human score column has the same name as the system
        score column or the second human score column, if the
        response IDs are not unique, or if no responses (or
        candidates) remain after filtering.
    """
    # get the directory where the config file lives
    # if this is the 'expm' directory, then go
    # up one level.
    configpath = dirname(abspath(config_obj.filepath))

    pred_file_location = DataReader.locate_files(config_obj['predictions_file'],
                                                 configpath)

    # get the experiment ID
    experiment_id = config_obj['experiment_id']

    # get the description
    description = config_obj['description']

    # get the column names for the human and the system scores
    human_score_column = config_obj['human_score_column']
    system_score_column = config_obj['system_score_column']

    # if the human score column is the same as the
    # system score column, raise an error
    if human_score_column == system_score_column:
        raise ValueError("'human_score_column' and "
                         "'system_score_column' "
                         "cannot have the same value.")

    # get the name of the optional column that
    # contains the second human score
    second_human_score_column = config_obj['second_human_score_column']

    # if the human score column is the same as the
    # second human score column, raise an error
    if human_score_column == second_human_score_column:
        raise ValueError("'human_score_column' and "
                         "'second_human_score_column' "
                         "cannot have the same value.")

    # get the column name that holds the response ID
    id_column = config_obj['id_column']

    # get the specified trim min and max, if any
    spec_trim_min, spec_trim_max = config_obj.get_trim_min_max()

    # get the subgroups if any
    subgroups = config_obj.get('subgroups')

    # get the candidate column if any
    candidate_column = config_obj['candidate_column']

    # check if we are excluding candidates based on number of responses
    exclude_listwise = config_obj.check_exclude_listwise()
    min_items_per_candidate = config_obj['min_items_per_candidate']

    general_report_sections = config_obj['general_sections']

    # get any special sections that the user might have specified
    special_report_sections = config_obj['special_sections']

    # get any custom sections and locate them to make sure
    # that they exist, otherwise raise an exception
    custom_report_section_paths = config_obj['custom_sections']
    if custom_report_section_paths:
        logging.info('Locating custom report sections')
        custom_report_sections = Reporter.locate_custom_sections(custom_report_section_paths,
                                                                 configpath)
    else:
        custom_report_sections = []

    section_order = config_obj['section_order']

    # check all sections values and order and get the
    # ordered list of notebook files
    chosen_notebook_files = Reporter().get_ordered_notebook_files(general_report_sections,
                                                                  special_report_sections,
                                                                  custom_report_sections,
                                                                  section_order,
                                                                  subgroups,
                                                                  model_type=None,
                                                                  context='rsmeval')

    # are we excluding zero scores?
    exclude_zero_scores = config_obj['exclude_zero_scores']

    # if we are excluding zero scores but trim_min
    # is set to 0, then we need to warn the user
    if exclude_zero_scores and spec_trim_min == 0:
        logging.warning("'exclude_zero_scores' is set to True but "
                        " 'trim_min' is set to 0. This may cause "
                        " unexpected behavior.")

    # are we filtering on any other columns?
    flag_column_dict = config_obj.check_flag_column()

    # were we told how to scale the predictions?
    scale_with = config_obj.get('scale_with')

    # use scaled predictions for the analyses unless
    # we were told not to
    use_scaled_predictions = (scale_with is not None)

    # log an appropriate message
    if scale_with is None:
        message = ('Assuming given system predictions '
                   'are unscaled and will be used as such.')
    elif scale_with == 'asis':
        message = ('Assuming given system predictions '
                   'are already scaled and will be used as such.')
    else:
        message = ('Assuming given system predictions '
                   'are unscaled and will be scaled before use.')
    logging.info(message)

    df_pred = data_container_obj.predictions

    # make sure that the columns specified in the config file actually exist
    columns_to_check = [id_column, human_score_column, system_score_column]

    if second_human_score_column:
        columns_to_check.append(second_human_score_column)

    if candidate_column:
        columns_to_check.append(candidate_column)

    missing_columns = set(columns_to_check).difference(df_pred.columns)
    if missing_columns:
        raise KeyError('Columns {} from the config file do not exist '
                       'in the predictions file.'.format(missing_columns))

    df_pred = self.rename_default_columns(df_pred,
                                          [],
                                          id_column,
                                          human_score_column,
                                          second_human_score_column,
                                          None,
                                          system_score_column,
                                          candidate_column)

    # check that the id_column contains unique values
    if df_pred['spkitemid'].size != df_pred['spkitemid'].unique().size:
        raise ValueError("The data contains duplicate response IDs "
                         "in '{}'. Please make sure all response IDs "
                         "are unique and re-run the tool.".format(id_column))

    df_pred = self.check_subgroups(df_pred, subgroups)

    # filter out the responses based on flag columns
    (df_responses_with_requested_flags,
     df_responses_with_excluded_flags) = self.filter_on_flag_columns(df_pred,
                                                                     flag_column_dict)

    # filter out rows that have non-numeric or zero human scores
    df_filtered, df_excluded = self.filter_on_column(df_responses_with_requested_flags,
                                                     'sc1',
                                                     'spkitemid',
                                                     exclude_zeros=exclude_zero_scores)

    # make sure that the remaining data frame is not empty
    if len(df_filtered) == 0:
        raise ValueError("No responses remaining after filtering out "
                         "non-numeric human scores. No further analysis "
                         "can be run. ")

    # Change all non-numeric machine scores in excluded
    # data to NaNs for consistency with rsmtool.
    # NOTE: This will *not* work if *all* of the values
    # in column are non-numeric. This is a known bug in
    # pandas: https://github.com/pydata/pandas/issues/9589
    # Therefore, we need add an additional check after this.
    df_excluded['raw'] = pd.to_numeric(df_excluded['raw'], errors='coerce').astype(float)

    # filter out the non-numeric machine scores from the rest of the data
    df_filtered_pred, newdf_excluded = self.filter_on_column(df_filtered,
                                                             'raw',
                                                             'spkitemid',
                                                             exclude_zeros=False)

    # make sure that the remaining data frame is not empty
    if len(df_filtered_pred) == 0:
        raise ValueError("No responses remaining after filtering out "
                         "non-numeric machine scores. No further analysis "
                         "can be run. ")

    with np.errstate(divide='ignore'):
        df_excluded = pd.merge(df_excluded, newdf_excluded, how='outer')

    # if requested, exclude the candidates with less than X responses
    # left after filtering
    if exclude_listwise:
        (df_filtered_candidates,
         df_excluded_candidates) = self.select_candidates(df_filtered_pred,
                                                          min_items_per_candidate)

        # check that there are still responses left for analysis
        if len(df_filtered_candidates) == 0:
            raise ValueError("After filtering non-numeric human and system scores "
                             "there were "
                             "no candidates with {} or more responses "
                             "left for analysis".format(str(min_items_per_candidate)))

        # redefine df_filtered_pred
        df_filtered_pred = df_filtered_candidates.copy()

        # update df_excluded
        df_excluded = pd.concat([df_excluded, df_excluded_candidates])

    # set default values for scaling
    scale_pred_mean = 0
    scale_pred_sd = 1
    scale_human_mean = 0
    scale_human_sd = 1

    if data_container_obj.get_frame('scale') is not None:
        df_scale = data_container_obj.scale

        # both columns are needed below, so complain
        # if *either* of them is missing
        if ('sc1' not in df_scale.columns or
                'prediction' not in df_scale.columns):
            raise KeyError('The CSV file specified for scaling '
                           'must have the "prediction" and the "sc1" '
                           'columns.')

        scale_pred_mean = df_scale['prediction'].mean()
        scale_pred_sd = df_scale['prediction'].std()
        scale_human_mean = df_scale['sc1'].mean()
        scale_human_sd = df_scale['sc1'].std()

    logging.info('Processing predictions')
    df_pred_processed = self.process_predictions(df_filtered_pred,
                                                 scale_pred_mean,
                                                 scale_pred_sd,
                                                 scale_human_mean,
                                                 scale_human_sd,
                                                 spec_trim_min,
                                                 spec_trim_max)

    # the score types we keep depend on how the
    # predictions relate to the scaled scores
    if not scale_with:
        expected_score_types = ['raw', 'raw_trim', 'raw_trim_round']
    elif scale_with == 'asis':
        expected_score_types = ['scale', 'scale_trim', 'scale_trim_round']
    else:
        expected_score_types = ['raw', 'raw_trim', 'raw_trim_round',
                                'scale', 'scale_trim', 'scale_trim_round']

    # extract separated data frames that we will write out
    # as separate files
    not_other_columns = set()

    prediction_columns = ['spkitemid', 'sc1'] + expected_score_types
    df_predictions_only = df_pred_processed[prediction_columns]
    not_other_columns.update(prediction_columns)

    metadata_columns = ['spkitemid'] + subgroups
    if candidate_column:
        metadata_columns.append('candidate')
    df_test_metadata = df_filtered_pred[metadata_columns]
    not_other_columns.update(metadata_columns)

    df_test_human_scores = pd.DataFrame()
    human_score_columns = ['spkitemid', 'sc1', 'sc2']
    if second_human_score_column and 'sc2' in df_filtered_pred:
        df_test_human_scores = df_filtered_pred[human_score_columns].copy()
        not_other_columns.update(['sc2'])

        # filter out any non-numeric values now
        # as well as zeros, if we were asked to
        df_test_human_scores['sc2'] = pd.to_numeric(df_test_human_scores['sc2'],
                                                    errors='coerce').astype(float)
        if exclude_zero_scores:
            df_test_human_scores['sc2'] = df_test_human_scores['sc2'].replace(0, np.nan)

    # remove 'spkitemid' from `not_other_columns`
    # because we want that in the other columns
    # data frame
    not_other_columns.remove('spkitemid')

    # extract all of the other columns in the predictions file
    other_columns = [column for column in df_filtered_pred.columns
                     if column not in not_other_columns]
    df_pred_other_columns = df_filtered_pred[other_columns]

    # the values computed above override
    # those in the original configuration
    new_config_dict = {'experiment_id': experiment_id,
                       'subgroups': subgroups,
                       'description': description,
                       'pred_file_location': pred_file_location,
                       'id_column': id_column,
                       'second_human_score_column': second_human_score_column,
                       'candidate_column': candidate_column,
                       'use_scaled_predictions': use_scaled_predictions,
                       'exclude_zero_scores': exclude_zero_scores,
                       'exclude_listwise': exclude_listwise,
                       'chosen_notebook_files': chosen_notebook_files}

    config_as_dict = config_obj.to_dict()
    config_as_dict.update(new_config_dict)

    new_config = Configuration(config_as_dict, config_obj.filepath)

    frames = [df_predictions_only,
              df_test_metadata,
              df_pred_other_columns,
              df_test_human_scores,
              df_excluded,
              df_responses_with_excluded_flags]

    names = ['pred_test',
             'test_metadata',
             'test_other_columns',
             'test_human_scores',
             'test_excluded',
             'test_responses_with_excluded_flags']

    new_container = [{'name': name, 'frame': frame}
                     for frame, name in zip(frames, names)]

    new_container = DataContainer(new_container)

    return new_config, new_container
def process_data_rsmpredict(self, config_obj, data_container_obj):
    """
    Process data for RSM predict.

    The input features are renamed, checked and pre-processed, the
    model stored in the configuration is used to generate predictions
    and the predictions are post-processed using the stored
    post-processing parameters.

    Parameters
    ----------
    config_obj : configuration_parser.Configuration
        A configuration object.
    data_container_obj : container.DataContainer
        A data container object providing the `input_features`,
        `feature_info` and `postprocessing_params` frames.

    Returns
    -------
    config_obj : configuration_parser.Configuration
        The configuration object that was passed in.
    data_container : container.DataContainer
        A new data container object with the frames
        `features_processed`, `excluded`,
        `predictions_with_metadata` and `predictions`.

    Raises
    ------
    KeyError
        If columns in the config file do not exist in the data
    ValueError
        If data contains duplicate response IDs
    """
    # the three frames this method works with
    df_input = data_container_obj.input_features
    df_feature_info = data_container_obj.feature_info
    df_postproc_params = data_container_obj.postprocessing_params

    # settings read from the configuration: the ID column, the
    # (optional) human score columns, the subgroups, the model,
    # whether to standardize features and predict expected scores,
    # the flag columns and the (optional) candidate column
    id_column = config_obj['id_column']
    human_score_column = config_obj['human_score_column']
    second_human_score_column = config_obj['second_human_score_column']
    subgroups = config_obj['subgroups']
    model = config_obj['model']
    standardize_features = config_obj.get('standardize_features', True)
    predict_expected_scores = config_obj['predict_expected_scores']
    flag_column_dict = config_obj.check_flag_column()
    candidate_column = config_obj['candidate_column']

    flag_columns = list(flag_column_dict.keys())

    # `columns_to_check` must exist in the input under the names given
    # in the config file; `columns_to_copy` are carried over into the
    # final predictions under their default (renamed) names
    columns_to_check = [id_column] + subgroups + flag_columns
    columns_to_copy = subgroups + flag_columns

    # human_score_column will be set to sc1 by default and
    # we only raise an error if it's set to something else.
    # However, since we cannot distinguish whether the column was set
    # to sc1 by default or specified as such in the config file
    # we append it to output anyway as long as
    # it is in the input file
    human_score_expected = (human_score_column != 'sc1' or
                            'sc1' in df_input.columns)
    if human_score_expected:
        columns_to_check.append(human_score_column)
        columns_to_copy.append('sc1')

    # the remaining optional columns are handled the same way
    for configured_name, default_name in ((candidate_column, 'candidate'),
                                          (second_human_score_column, 'sc2')):
        if configured_name:
            columns_to_check.append(configured_name)
            columns_to_copy.append(default_name)

    missing_columns = set(columns_to_check).difference(df_input.columns)
    if missing_columns:
        raise KeyError("Columns {} from the config file "
                       "do not exist in the data.".format(missing_columns))

    # rename all columns
    df_input = self.rename_default_columns(df_input,
                                           [],
                                           id_column,
                                           human_score_column,
                                           second_human_score_column,
                                           None,
                                           None,
                                           candidate_column=candidate_column)

    # check that the id_column contains unique values
    response_ids = df_input['spkitemid']
    if response_ids.size != response_ids.unique().size:
        raise ValueError("The data contains repeated response IDs in {}. "
                         "Please make sure all response IDs are unique and "
                         "re-run the tool.".format(id_column))

    preprocessed = self.preprocess_new_data(df_input,
                                            df_feature_info,
                                            standardize_features)
    df_features_preprocessed, df_excluded = preprocessed

    def first_value(column):
        # each post-processing parameter is read
        # from the first row of its column
        return df_postproc_params[column].values[0]

    trim_min = first_value('trim_min')
    trim_max = first_value('trim_max')
    h1_mean = first_value('h1_mean')
    h1_sd = first_value('h1_sd')

    # now generate the predictions for the features using this model
    if predict_expected_scores:
        logged_str = 'Generating predictions' + ' (expected scores).'
    else:
        logged_str = 'Generating predictions' + '.'
    logging.info(logged_str)
    df_predictions = model.predict(df_features_preprocessed,
                                   int(trim_min),
                                   int(trim_max),
                                   predict_expected=predict_expected_scores)

    train_predictions_mean = first_value('train_predictions_mean')
    train_predictions_sd = first_value('train_predictions_sd')

    df_predictions = self.process_predictions(df_predictions,
                                              train_predictions_mean,
                                              train_predictions_sd,
                                              h1_mean,
                                              h1_sd,
                                              trim_min, trim_max)

    # add back the columns that we were requested to copy if any
    if len(columns_to_copy) > 0:
        df_metadata = df_input[['spkitemid'] + columns_to_copy]
        df_predictions_with_metadata = pd.merge(df_predictions, df_metadata)
        assert len(df_predictions) == len(df_predictions_with_metadata)
    else:
        df_predictions_with_metadata = df_predictions.copy()

    # package the output frames for the new container
    output_names = ['features_processed',
                    'excluded',
                    'predictions_with_metadata',
                    'predictions']
    output_frames = [df_features_preprocessed,
                     df_excluded,
                     df_predictions_with_metadata,
                     df_predictions]
    datasets = [{'name': name, 'frame': frame}
                for name, frame in zip(output_names, output_frames)]

    return config_obj, DataContainer(datasets)
def process_data(self, config_obj, data_container_obj, context='rsmtool'):
    """
    Process the data for a given context by delegating to
    `process_data_rsmtool()`, `process_data_rsmeval()` or
    `process_data_rsmpredict()`.

    Parameters
    ----------
    config_obj : configuration_parser.Configuration
        A configuration object.
    data_container_obj : container.DataContainer
        A data container object.
    context : {'rsmtool', 'rsmeval', 'rsmpredict'}
        The context of the tool.

    Returns
    -------
    config_obj : configuration_parser.Configuration
        A new configuration object.
    data_container : container.DataContainer
        A new data container object.

    Raises
    ------
    ValueError
        If the context is not in {'rsmtool', 'rsmeval', 'rsmpredict'}
    """
    if context == 'rsmtool':
        return self.process_data_rsmtool(config_obj, data_container_obj)
    if context == 'rsmeval':
        return self.process_data_rsmeval(config_obj, data_container_obj)
    if context == 'rsmpredict':
        return self.process_data_rsmpredict(config_obj, data_container_obj)

    # the literal braces around the set of valid contexts must be
    # doubled; otherwise `str.format()` treats them as a replacement
    # field and fails with a KeyError before the ValueError is raised
    raise ValueError("The `context` argument must be in the set: "
                     "{{'rsmtool', 'rsmeval', 'rsmpredict'}}. "
                     "You passed `{}`.".format(context))
def preprocess_new_data(self,
                        df_input,
                        df_feature_info,
                        standardize_features=True):
    """
    Process a data frame with feature values by applying
    :ref:`preprocessing parameters <preprocessing_parameters>`
    stored in `df_feature_info`.

    Parameters
    ----------
    df_input : pandas DataFrame
        Data frame with raw feature values that will be used to generate
        the scores. Each feature is stored in a separate column. Each row
        corresponds to one response. There should also be a column named
        `spkitemid` containing a unique ID for each response. If a column
        named `sc1` is present, it is carried through to the output.
    df_feature_info : pandas DataFrame
        Data frame with preprocessing parameters. The feature names are
        read from the *index* of this data frame and should match the
        feature names in `df_input`. The following columns are used ::

            - `sign` : `1` or `-1`. Indicates whether the feature value needs to
              be multiplied by -1.
            - `transform` : :ref:`transformation <json_transformation>` that needs
              to be applied to this feature
            - `train_mean`, `train_sd` : mean and standard deviation for outlier
              truncation.
            - `train_transformed_mean`,`train_transformed_sd` : mean and standard
              deviation for computing `z`-scores.
    standardize_features : bool, optional
        Whether the features should be standardized prior to prediction.
        Defaults to True.

    Returns
    -------
    df_features_preprocessed : pd.DataFrame
        Data frame with processed feature values
    df_excluded: pd.DataFrame
        Data frame with responses excluded from further analysis
        due to non-numeric feature values in the original file
        or after applying transformations. The data frame always contains the
        original feature values.

    Raises
    ------
    KeyError
        if some of the features specified in `df_feature_info` are not present
        in `df_input`
    ValueError
        if all responses have at least one non-numeric feature value and therefore
        no score can be generated for any of the responses.
    """
    # get the list of required features
    required_features = df_feature_info.index.tolist()

    # ensure that all the features that are needed by the model
    # are present in the input file
    input_feature_columns = [c for c in df_input if c != 'spkitemid']
    missing_features = set(required_features).difference(input_feature_columns)
    if missing_features:
        raise KeyError('The input feature file is missing the '
                       'following features: {}'.format(missing_features))

    # `sc1` is retained in the output (see below), so it must not be
    # reported as a column that will be ignored
    extra_features = set(input_feature_columns).difference(required_features + ['sc1'])
    if extra_features:
        logging.warning('The following extraneous features '
                        'will be ignored: {}'.format(extra_features))

    # keep the required features plus the id
    features_to_keep = ['spkitemid'] + required_features

    # check if actually have the human scores for this data and add
    # sc1 to preprocessed features for consistency with other tools
    has_human_scores = 'sc1' in df_input
    if has_human_scores:
        features_to_keep.append('sc1')

    df_features = df_input[features_to_keep]

    # preprocess the feature values
    logging.info('Pre-processing input features')

    # first we need to filter out NaNs and any other
    # weird features, the same way we did for rsmtool.
    df_filtered = df_features.copy()
    df_excluded = pd.DataFrame(columns=df_filtered.columns)

    for feature_name in required_features:
        df_filtered, newdf_excluded = self.filter_on_column(df_filtered,
                                                            feature_name,
                                                            'spkitemid',
                                                            exclude_zeros=False,
                                                            exclude_zero_sd=False)

        with np.errstate(divide='ignore'):
            df_excluded = pd.merge(df_excluded, newdf_excluded, how='outer')

    # make sure that the remaining data frame is not empty
    if len(df_filtered) == 0:
        raise ValueError("There are no responses left after "
                         "filtering out non-numeric feature values. No analysis "
                         "will be run")

    # `df_features` keeps the original (filtered) values so that excluded
    # responses can be reported with their raw feature values
    df_features = df_filtered.copy()
    df_features_preprocess = df_features.copy()

    for feature_name in required_features:

        feature_values = df_features_preprocess[feature_name].values

        # look up the preprocessing parameters for this feature once
        feature_params = df_feature_info.loc[feature_name]
        feature_transformation = feature_params['transform']
        feature_sign = feature_params['sign']
        train_feature_mean = feature_params['train_mean']
        train_feature_sd = feature_params['train_sd']
        train_transformed_mean = feature_params['train_transformed_mean']
        train_transformed_sd = feature_params['train_transformed_sd']

        # transform the feature values and remove outliers
        df_features_preprocess[feature_name] = self.preprocess_feature(feature_values,
                                                                       feature_name,
                                                                       feature_transformation,
                                                                       train_feature_mean,
                                                                       train_feature_sd,
                                                                       exclude_zero_sd=False,
                                                                       raise_error=False)

        # filter the feature values once again to remove possible NaN and inf values that
        # might have emerged when applying transformations.
        # We do not need to do that if no transformation was applied.
        if feature_transformation not in ['raw', 'org']:

            # check that there are indeed inf or NaN values
            if np.isnan(df_features_preprocess[feature_name]).any() or \
                    np.isinf(df_features_preprocess[feature_name]).any():
                (df_features_preprocess,
                 newdf_excluded) = self.filter_on_column(df_features_preprocess,
                                                         feature_name,
                                                         'spkitemid',
                                                         exclude_zeros=False,
                                                         exclude_zero_sd=False)

                # add the response(s) with missing values to the excluded responses
                # but make sure we are adding the original values, not the
                # preprocessed ones
                missing_values = df_features['spkitemid'].isin(newdf_excluded['spkitemid'])
                df_excluded_original = df_features[missing_values].copy()
                df_excluded = pd.merge(df_excluded, df_excluded_original, how='outer')

        if standardize_features:

            # now standardize the feature values
            df_feature_minus_mean = (df_features_preprocess[feature_name] -
                                     train_transformed_mean)
            df_features_preprocess[feature_name] = (df_feature_minus_mean /
                                                    train_transformed_sd)

        # Multiply features by sign.
        df_features_preprocess[feature_name] = (df_features_preprocess[feature_name] *
                                                feature_sign)

    return (df_features_preprocess, df_excluded)