Source code for rsmtool.transformer

"""
Class for transforming features.

:author: Jeremy Biggs (jbiggs@ets.org)
:author: Anastassia Loukina (aloukina@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)

:date: 10/25/2017
:organization: ETS
"""

import logging
import numpy as np

from scipy.stats.stats import pearsonr


class FeatureTransformer:
    """
    Encapsulate feature transformation methods.

    Every ``apply_*`` method takes the feature name (used only in
    messages), a numpy array of values, and a ``raise_error`` flag.
    When ``raise_error`` is true, problematic input raises a
    ``ValueError``; otherwise a warning is logged and the transform
    is applied anyway.
    """

    @classmethod
    def apply_sqrt_transform(cls, name, values, raise_error=True):
        """
        Apply the `sqrt` transform to `values`.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : numpy array
            Numpy array containing the feature values.
        raise_error : bool, optional
            When set to true, raises an error if the transform is applied to
            a feature that has negative values.

        Returns
        -------
        new_data : numpy array
            Numpy array containing the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that has negative
            values and `raise_error` is set to true.
        """
        # check if the feature has any negative values
        if np.any(values < 0):
            if raise_error:
                raise ValueError("The sqrt transformation should not be "
                                 "applied to feature {} which can have "
                                 "negative values".format(name))
            logging.warning("The sqrt transformation was "
                            "applied to feature {} which has "
                            "negative values for some responses. "
                            "No system score will be generated "
                            "for such responses".format(name))

        # the problem was already reported above, so silence numpy's
        # own "invalid value" runtime warning
        with np.errstate(invalid='ignore'):
            new_data = np.sqrt(values)
        return new_data

    @classmethod
    def apply_log_transform(cls, name, values, raise_error=True):
        """
        Apply the `log` transform to `values`.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : numpy array
            Numpy array containing the feature values.
        raise_error : bool, optional
            When set to true, raises an error if the transform is applied to
            a feature that has zero or negative values.

        Returns
        -------
        new_data : numpy array
            Numpy array containing the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that has zero or
            negative values and `raise_error` is set to true.
        """
        # check if the feature has any zeros
        if np.any(values == 0):
            if raise_error:
                raise ValueError("The log transformation should not be "
                                 "applied to feature {} which can have a "
                                 "value of 0".format(name))
            logging.warning("The log transformation was "
                            "applied to feature {} which has a "
                            "value of 0 for some responses. No system "
                            "score will "
                            "be generated for such responses".format(name))

        # check if the feature has any negative values
        if np.any(values < 0):
            if raise_error:
                raise ValueError("The log transformation should not be "
                                 "applied to feature {} which can have "
                                 "negative values".format(name))
            logging.warning("The log transformation was "
                            "applied to feature {} which has "
                            "negative values for some responses. No system "
                            "score will "
                            "be generated for such responses".format(name))

        # zeros and negatives were already reported above; suppress the
        # duplicate numpy runtime warnings, as the sqrt transform does
        with np.errstate(divide='ignore', invalid='ignore'):
            new_data = np.log(values)
        return new_data

    @classmethod
    def apply_inverse_transform(cls, name, values, raise_error=True,
                                sd_multiplier=4):
        """
        Apply the inverse transform to `values`.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : numpy array
            Numpy array containing the feature values.
        raise_error : bool, optional
            When set to true, raises an error if the transform is applied to
            a feature that can be zero or to a feature that can have
            different signs.
        sd_multiplier : int, optional
            Use this std. dev. multiplier to compute the ceiling
            and floor for outlier removal and check that these
            are not equal to zero.

        Returns
        -------
        new_data : numpy array
            Numpy array containing the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that can be zero
            or to a feature that can have different signs and
            `raise_error` is set to true.
        """
        if np.any(values == 0):
            if raise_error:
                raise ValueError("The inverse transformation should not be "
                                 "applied to feature {} which can have a "
                                 "value of 0".format(name))
            logging.warning("The inverse transformation was applied to "
                            "feature {} which has a value of 0 for "
                            "some responses. No system score will be "
                            "generated for such responses".format(name))

        # check if the floor or ceiling are zero; this only ever warns,
        # regardless of `raise_error`
        data_mean = np.mean(values)
        data_sd = np.std(values, ddof=1)
        floor = data_mean - sd_multiplier * data_sd
        ceiling = data_mean + sd_multiplier * data_sd
        if floor == 0 or ceiling == 0:
            logging.warning("The floor/ceiling for feature {} "
                            "is zero after applying the inverse "
                            "transformation".format(name))

        # check if the feature can be both positive and negative
        magnitudes = np.abs(values)
        all_positive = np.all(magnitudes == values)
        all_negative = np.all(magnitudes == -values)
        if not (all_positive or all_negative):
            if raise_error:
                raise ValueError("The inverse transformation should not be "
                                 "applied to feature {} where the values can "
                                 "have different signs".format(name))
            logging.warning("The inverse transformation was "
                            "applied to feature {} where the values can "
                            "have different signs. This can change "
                            "the ranking of the responses".format(name))

        # zeros were already reported above
        with np.errstate(divide='ignore'):
            new_data = 1 / values
        return new_data

    @classmethod
    def apply_add_one_inverse_transform(cls, name, values, raise_error=True):
        """
        Apply the add one and invert transform to `values`.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : np.array
            Numpy array containing the feature values.
        raise_error : bool, optional
            When set to true, raises an error if the transform is applied to
            a feature that has negative values.

        Returns
        -------
        new_data : np.array
            Numpy array containing the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that has negative
            values and `raise_error` is set to true.
        """
        # check if the feature has any negative values
        if np.any(values < 0):
            if raise_error:
                raise ValueError("The addOneInv transformation should not "
                                 "be applied to feature {} which can have "
                                 "negative values".format(name))
            logging.warning("The addOneInv transformation was "
                            "applied to feature {} which has "
                            "negative values for some responses. "
                            "This can change the ranking of the "
                            "responses".format(name))

        # a value of exactly -1 divides by zero; negatives were already
        # reported above, so do not emit a second (numpy) warning
        with np.errstate(divide='ignore'):
            new_data = 1 / (values + 1)
        return new_data

    @classmethod
    def apply_add_one_log_transform(cls, name, values, raise_error=True):
        """
        Apply the add one and log transform to `values`.

        Parameters
        ----------
        name : str
            Name of the feature to transform.
        values : numpy array
            Numpy array containing the feature values.
        raise_error : bool, optional
            When set to true, raises an error if the transform is applied to
            a feature that has negative values.

        Returns
        -------
        new_data : numpy array
            Numpy array that contains the transformed feature values.

        Raises
        ------
        ValueError
            If the transform is applied to a feature that has negative
            values and `raise_error` is set to true.
        """
        # check if the feature has any negative values
        if np.any(values < 0):
            if raise_error:
                raise ValueError("The addOneLn transformation should not "
                                 "be applied to feature {} which can have "
                                 "negative values".format(name))
            logging.warning("The addOneLn transformation was "
                            "applied to feature {} which has "
                            "negative values for some responses. "
                            "If the feature value remains negative "
                            "after adding one, no score will "
                            "be generated for such responses".format(name))

        # values <= -1 yield -inf/NaN; negatives were already reported
        # above, so suppress the duplicate numpy runtime warnings
        with np.errstate(divide='ignore', invalid='ignore'):
            new_data = np.log(values + 1)
        return new_data

    @classmethod
    def transform_feature(cls, values, column_name, transform,
                          raise_error=True):
        """
        Apply the given transform to all of the values in the given
        numpy array. The values are assumed to be for the feature with
        the given name.

        Parameters
        ----------
        values : numpy array
            Numpy array containing the feature values.
        column_name : str
            Name of the feature to transform.
        transform : str
            Name of the transform to apply.
            Valid options include ::

                {'inv', 'sqrt', 'log', 'addOneInv', 'addOneLn', 'raw', 'org'}

        raise_error : bool, optional
            Passed through to the selected transform: when true, the
            transform raises a ValueError for problematic values instead
            of only logging a warning.

        Returns
        -------
        new_data : np.array
            Numpy array containing the transformed feature values.
            For 'raw' and 'org' the input array itself is returned.

        Raises
        ------
        ValueError
            If the given transform is not recognized, or if the selected
            transform rejects the values and `raise_error` is true.

        Note
        ----
        Many of these transformations may be meaningless for features
        which span both negative and positive values. Some
        transformations may throw errors for negative feature values.
        """
        # dispatch through `cls` so that subclasses overriding one of
        # the `apply_*` methods are honoured
        transforms = {'inv': cls.apply_inverse_transform,
                      'sqrt': cls.apply_sqrt_transform,
                      'log': cls.apply_log_transform,
                      'addOneInv': cls.apply_add_one_inverse_transform,
                      'addOneLn': cls.apply_add_one_log_transform,
                      'raw': lambda column_name, data, raise_error: data,
                      'org': lambda column_name, data, raise_error: data}

        # make sure we have a valid transform function
        if transform is None or transform not in transforms:
            raise ValueError('Unrecognized feature transformation: '
                             ' {}'.format(transform))

        transformer = transforms[transform]
        return transformer(column_name, values, raise_error)

    @classmethod
    def find_feature_transform(cls, feature_name, feature_value, scores):
        """
        Identify the best transformation based on the
        highest absolute Pearson correlation with human score.

        Parameters
        ----------
        feature_name : str
            Name of feature for which to find the transformation.
        feature_value : pandas Series
            Series containing feature values.
        scores : pandas Series
            Numeric human scores.

        Returns
        -------
        best_transformation : str
            The name of the transformation which gives the highest
            correlation between the feature values and the human scores.
            See :ref:`documentation <select_transformations_rsmtool>`
            for the full list of transformations.
        """
        # Do not use sqrt and ln for potential negative features.
        # Do not use inv for positive features.
        if any(feature_value < 0):
            applicable_transformations = ['org', 'inv']
        else:
            applicable_transformations = ['org', 'sqrt',
                                          'addOneInv', 'addOneLn']

        correlations = []
        for trans in applicable_transformations:
            try:
                transformed_value = cls.transform_feature(feature_value,
                                                          feature_name,
                                                          trans)
                correlations.append(abs(pearsonr(transformed_value,
                                                 scores)[0]))
            except ValueError:
                # If the transformation returns an error, append 0.
                correlations.append(0)

        # np.argmax returns the first index on ties, so earlier entries
        # in `applicable_transformations` win when correlations are equal
        best = np.argmax(correlations)
        return applicable_transformations[best]