Source code for psychopy_ext.stats

#!/usr/bin/env python

# Part of the psychopy_ext library
# Copyright 2010-2014 Jonas Kubilius
# The program is distributed under the terms of the GNU General Public License,
# either version 3 of the License, or (at your option) any later version.

"""
A collection of useful descriptive and basic statistical functions for
psychology research. Some functions are meant to improve `scipy.stats`
functionality and integrate seamlessly with `pandas`.
"""

import itertools

import numpy as np
import scipy.stats
import pandas

import plot

try:
    from collections import OrderedDict
except ImportError:
    from exp import OrderedDict


def aggregate(df, rows=None, cols=None, values=None,
              subplots=None, yerr=None, aggfunc='mean', unstacked=False,
              order='natural'):
    """
    Aggregates data over specified columns.

    :Args:
        df (pandas.DataFrame)
            A DataFrame with your data

    :Kwargs:
        - rows (str or list of str, default: None)
            Name(s) of column(s) that will be aggregated and plotted on the
            x-axis
        - cols (str or list of str, default: None)
            Name(s) of column(s) that will be shown in the legend
        - values (str or list of str, default: None)
            Name(s) of the column(s) that are aggregated
        - yerr (str, default: None)
            Name of the column for the y-errorbar calculation. Typically,
            this is the column with participant IDs.
        - aggfunc (str or a 1d function)
            A function to use for aggregation. If a string, it is
            interpreted as a `numpy` function.
        - unstacked (bool, default: False)
            If True, returns an unstacked version of the aggregated data
            (i.e., rows in rows and columns in columns). Useful for
            printing and other non-plotting tasks.
        - order (str, {'natural', 'sorted'}, default: 'natural')
            If 'natural', attempts to keep the original order of the data
            (as in the file); this only works sometimes. If 'sorted', the
            output is sorted, which is the pandas default.

    :Returns:
        A `pandas.DataFrame` where data has been aggregated in the
        following MultiIndex format:
            - columns:
                - level 0: subplots
                - levels 1 to n-2: rows
                - level n-1: cols
            - rows: yerr
        This format makes it easy to do further computations such as a mean
        over `yerr`: a simple `df.mean()` will do the trick.

    :See also:
        :func:`accuracy`
    """
    df = pandas.DataFrame(df)  # make sure it's a DataFrame
    _splits = {}
    if isinstance(subplots, str):
        subplots = [subplots]
        _splits['subplots'] = subplots
    elif subplots is None:
        subplots = []
    if isinstance(rows, str):
        rows = [rows]
        _splits['rows'] = rows
    elif rows is None:
        rows = []
    if isinstance(cols, str):
        cols = [cols]
        _splits['cols'] = cols
    elif cols is None:
        cols = []
    if isinstance(yerr, str):
        yerr = [yerr]
        _splits['yerr'] = yerr
    elif yerr is None:
        yerr = []

    if values is None:
        raise Exception('You must provide the name(s) of the column(s) '
                        'that is/are aggregated.')
    allconds = subplots + rows + cols + yerr

    if isinstance(aggfunc, str):
        aggfunc = getattr(np, aggfunc)
    agg = df.groupby(allconds)[values].aggregate(aggfunc)

    if len(yerr) > 0:
        # if yerr is present, it goes in rows and everything else in cols
        if isinstance(values, list):
            agg = agg[values[0]]
        for yr in yerr:
            agg = unstack(agg, level=yr)
        agg = agg.T
    else:
        agg = pandas.DataFrame(agg).T

    # sort data nicely
    if isinstance(order, dict):
        order_full = order
    else:
        order_full = dict([(col, order) for col in agg.columns.names])
    for level, level_ord in order_full.items():
        if isinstance(level_ord, str):
            if level_ord == 'natural':
                thisord = df[level].unique()  # preserves order
            elif level_ord == 'sorted':
                thisord = np.unique(df[level])  # does not preserve order
            else:
                raise Exception('Ordering %s not recognized.' % level_ord)
        else:
            thisord = level_ord
        order_full[level] = thisord
    agg = reorder(agg, order=order_full)

    # NOTE: the `unstacked` option is documented but not implemented yet

    if isinstance(values, list):
        agg.names = values[0]
    else:
        agg.names = values
    agg._splits = _splits
    return agg


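# A minimal usage sketch (not from the original module; the column names
# 'subjid', 'cond' and 'rt' and the simulated data are made up purely for
# illustration):
def _example_aggregate():
    df = df_fromdict([('subjid', ['s1', 's2', 's3']),
                      ('cond', ['easy', 'hard'])], repeat=10)
    df['rt'] = np.random.rand(len(df))  # fake response times
    # mean RT per condition; participants ('subjid') end up in rows so
    # that error bars can later be computed across them
    agg = aggregate(df, rows='cond', values='rt', yerr='subjid')
    print agg
    print agg.mean()  # mean over participants

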
def accuracy(df, values=None, correct='correct', incorrect='incorrect',
             **kwargs):
    """
    Computes accuracy given correct and incorrect data labels.

    :Args:
        df (pandas.DataFrame)
            Your data

    :Kwargs:
        - values (str or list of str, default: None)
            Name(s) of the column(s) that are aggregated
        - correct (str or a number, or a list thereof, default: 'correct')
            Labels that are treated as correct responses.
        - incorrect (str or a number, or a list thereof,
          default: 'incorrect')
            Labels that are treated as incorrect responses.
        - kwargs
            Anything else you want to pass to :func:`aggregate`. Note that
            ``aggfunc`` is set to ``np.size`` and you cannot change that.

    :Returns:
        A pandas.DataFrame in the format of :func:`aggregate` where the
        reported values are the fraction correct / (correct + incorrect).

    :See also:
        :func:`aggregate`
    """
    if isinstance(correct, str):
        correct = [correct]
    if isinstance(incorrect, str):
        incorrect = [incorrect]
    if isinstance(values, list):
        values = values[0]
    corr = df[df[values].isin(correct)]
    if len(corr) == 0:
        raise Exception('There are no %s responses' % correct[0])
    agg_corr = aggregate(corr, aggfunc=np.size, values=values, **kwargs)
    df_all = df[df[values].isin(correct + incorrect)]
    agg_all = aggregate(df_all, aggfunc=np.size, values=values, **kwargs)
    agg = agg_corr.astype(float) / agg_all
    agg.names = values
    agg._splits = agg_corr._splits
    return agg


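# A sketch of computing accuracy (again with made-up column names and
# labels; 'acc' holds the string labels that `accuracy` counts):
def _example_accuracy():
    df = df_fromdict([('subjid', ['s1', 's2']),
                      ('cond', ['easy', 'hard'])], repeat=20)
    df['acc'] = np.random.choice(['correct', 'incorrect'], size=len(df))
    # fraction correct per condition and participant
    agg = accuracy(df, values='acc', rows='cond', yerr='subjid')
    print agg

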
def confidence(agg, kind='sem', within=None, alpha=.05, nsamples=None,
               skipna=True):
    """
    Computes the confidence of measurements, such as the standard error of
    the mean (SEM) or confidence intervals (CI).

    :Args:
        agg (pandas.DataFrame)
            Aggregated data (e.g., the output of :func:`aggregate`)

    :Kwargs:
        - kind ('sem', 'ci', or 'binomial', default: 'sem')
            .. warning:: Binomial not tested thoroughly
        - within (str or list, default: None)
            For repeated-measures designs, between-subject error bars are
            too large. Specify which dimensions come from repeated measures
            (rows, cols, and/or subplots). Within-subject confidence
            intervals are then computed using the method of Loftus & Masson
            (1994) as simplified by Cousineau (2005), with Morey's (2008)
            correction. Based on `Denis A. Engemann's gist
            <https://gist.github.com/dengemann/6030453>`_
        - alpha (float, default: .05)
            For CI and binomial. Computed for a single tail, so effectively
            alpha/2.
        - nsamples (int, default: None)
            For binomial confidence intervals when there is a single sample
            only (which presumably reflects the number of correct
            responses, i.e., successes). See `Wikipedia
            <http://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Normal_approximation_interval>`_
        - skipna (bool, default: True)
            Whether to skip NA / null values or not.

            .. warning:: Not tested thoroughly

    :Returns:
        (mean, p_yerr)
    """
    if isinstance(agg, pandas.DataFrame):
        mean = agg.mean(skipna=skipna)  # mean across participants
        if within is not None:
            if not isinstance(within, list):
                within = [within]
            levels = []
            for dim in within:
                tmp = [r for r in agg.columns.names
                       if not r.startswith(dim + '.')]
                levels += tmp
            if len(levels) > 0:
                try:
                    # mean per subject
                    subj_mean = agg.mean(level=levels, skipna=skipna,
                                         axis=1)
                except Exception:
                    subj_mean = agg.mean(skipna=skipna)
            else:
                subj_mean = agg.mean(skipna=skipna)
            grand_mean = mean.mean(skipna=skipna)
            center = subj_mean - grand_mean
            aggc = pandas.DataFrame(index=agg.index, columns=agg.columns)
            for colname, value in center.iteritems():
                # touching the column first works around a pandas bug;
                # the assignment below crashes without it
                aggc[colname]
                aggc[colname] = (agg[colname].T - value.T).T
            ddof = 0  # Morey's correction
        else:
            aggc = agg.copy()
            ddof = 1

        if skipna:
            count = agg.count()  # skips NA by default
        else:
            count = len(agg)
        confint = 1 - alpha / 2.
        if kind == 'sem':
            p_yerr = aggc.std(skipna=skipna, ddof=ddof) / np.sqrt(count)
        elif kind == 'ci':
            p_yerr = aggc.std(skipna=skipna, ddof=ddof) / np.sqrt(count)
            p_yerr *= scipy.stats.t.ppf(confint, count - 1)
        elif kind == 'binomial':
            z = scipy.stats.norm.ppf(confint)
            p_yerr = z * np.sqrt(mean * (1 - mean) / nsamples)
    else:  # if there's only one condition? Not sure  # FIXME
        mean = agg
        p_yerr = np.zeros((len(agg), 1))
    return mean, p_yerr


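# A sketch of how `confidence` consumes the output of `aggregate`
# (hypothetical data as in the examples above):
def _example_confidence():
    df = df_fromdict([('subjid', ['s1', 's2', 's3']),
                      ('cond', ['easy', 'hard'])], repeat=10)
    df['rt'] = np.random.rand(len(df))
    agg = aggregate(df, rows='cond', values='rt', yerr='subjid')
    # between-subject standard error of the mean;
    # for repeated-measures designs, pass e.g. within='rows' to get
    # Cousineau-Morey within-subject intervals instead
    mean, p_yerr = confidence(agg, kind='sem')
    print mean
    print p_yerr

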
def get_star(p):
    """Returns the standard significance stars for a given p-value."""
    if p < .001:
        star = '***'
    elif p < .01:
        star = '**'
    elif p < .05:
        star = '*'
    else:
        star = ''
    return star


def reorder(agg, order, level=None, dim='columns'):
    """
    Reorders rows or columns in a pandas.DataFrame.

    It relies on for loops, so it will be slow for large data frames.

    :Args:
        - agg (pandas.DataFrame)
            Your (usually aggregated) data
        - order (list or dict)
            Order of entries. A list is only accepted if `level` is given.
            Otherwise, order should be a dict with level names (str) as
            keys and lists of the desired order as values.

    :Kwargs:
        - level (str, default: None)
            Which level needs to be reordered. Levels must be str.
        - dim (str, {'rows', 'index', 'columns'}, default: 'columns')
            Whether to reorder rows (or index) or columns.

    :Returns:
        Reordered pandas.DataFrame
    """
    if dim in ['index', 'rows'] or isinstance(agg, pandas.Series):
        orig_idx = agg.index
    else:
        orig_idx = agg.columns

    if not isinstance(order, dict):
        order = {level: order}

    for levelno, levelname in enumerate(orig_idx.names):
        if levelname not in order:  # retain the existing order
            this_order = pandas.DataFrame(orig_idx.tolist())
            # preserves order, unlike np.unique
            this_order = this_order.iloc[:, levelno].unique()
        else:
            this_order = order[levelname]
        # convert to a list
        order[levelname] = [i for i in this_order]

    data = np.zeros((len(orig_idx), len(orig_idx.names)))
    ind_sort = pandas.DataFrame(index=orig_idx, data=data,
                                columns=orig_idx.names)
    for rowno, (rowidx, row) in enumerate(ind_sort.iterrows()):
        if not isinstance(rowidx, tuple):
            levelname = row.index[0]
            row[levelname] = order[levelname].index(rowidx)
        else:
            for i, item in enumerate(rowidx):
                levelname = row.index[i]
                row[levelname] = order[levelname].index(item)
    ind_sort.sort(columns=ind_sort.columns.tolist(), inplace=True)
    multidx = ind_sort.index

    if dim in ['index', 'rows'] or isinstance(agg, pandas.Series):
        agg_new = agg.reindex(index=multidx)
    else:
        agg_new = agg.reindex(columns=multidx)
    return agg_new


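# A sketch of forcing a custom column order on aggregated data (made-up
# condition names; `aggregate` would otherwise keep file order):
def _example_reorder():
    df = df_fromdict([('subjid', ['s1', 's2']),
                      ('cond', ['hard', 'easy', 'medium'])], repeat=5)
    df['rt'] = np.random.rand(len(df))
    agg = aggregate(df, rows='cond', values='rt', yerr='subjid')
    # put conditions in increasing difficulty instead
    agg = reorder(agg, order={'cond': ['easy', 'medium', 'hard']})
    print agg

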
def unstack(data, level=-1):
    """
    Unstacking that preserves label order, unlike in pandas.
    """
    if isinstance(level, (str, int)):
        levels = [level]
    else:
        levels = level
    unstacked = data.copy()
    # reorder the newly unstacked levels
    for lev in levels:
        order = data.index.get_level_values(lev).unique()
        unstacked = unstacked.unstack(level=lev)
        unstacked = reorder(unstacked, order=order, level=lev,
                            dim='columns')
    return unstacked


def stack(data, level=-1):
    """
    Stacking that preserves label order, unlike in pandas.
    """
    if not isinstance(level, (list, tuple)):
        levels = [level]
    else:
        levels = level
    stacked = data.copy()
    for lev in levels:
        order = data.columns.get_level_values(lev).unique()
        stacked = stacked.stack(level=lev)
        stacked = reorder(stacked, order=order, level=lev, dim='index')
    return stacked


def df_fromdict(data, repeat=1):
    """
    Produces a factorial DataFrame from a dict or a list of tuples.

    For example, suppose you want to generate a DataFrame like this::

               a  b
        0    one  0
        1    one  1
        2    two  0
        3    two  1

    This function generates such output simply by providing the following::

        df_fromdict([('a', ['one', 'two']), ('b', [0, 1])])

    :Args:
        data: dict or a list of tuples
            Data used to produce a DataFrame. Keys specify column names,
            and values specify possible (unique) values.

    :Kwargs:
        repeat: int (default: 1)
            How many times everything should be repeated. Useful if you
            want to simulate multiple samples of each condition, for
            example.

    :Returns:
        A pandas.DataFrame with the keys of `data` as column names
    """
    data = OrderedDict(data)
    count = map(len, data.values())
    df = {}
    for i, (key, vals) in enumerate(data.items()):
        rep = np.repeat(vals, np.product(count[i + 1:]))
        tile = np.tile(rep, np.product(count[:i]))
        df[key] = np.repeat(tile, repeat)
    df = pandas.DataFrame(df, columns=data.keys())
    return df


def _aggregate_panel(df, rows=None, cols=None, values=None,
                     value_filter=None, yerr=None, func='mean'):
    """
    Aggregates a `pandas.DataFrame` over specified dimensions into a
    `pandas.Panel`.

    Forces output to always be a Panel.

    Shape:
        - items: `yerr`
        - major_xs: `rows`
        - minor_xs: `cols`

    TODO: convert between DataFrame/Panel upon request
    TODO: use `reindexing
    <http://pandas.pydata.org/pandas-docs/dev/indexing.html#advanced-reindexing-and-alignment-with-hierarchical-index>`_
    to keep a stable order

    :Args:
        df (pandas.DataFrame): A DataFrame with your data

    :Kwargs:
        - rows (str or list of str): Name(s) of column(s) that will be
            aggregated and plotted on the x-axis
        - cols (str or list of str): Name(s) of column(s) that will be
            aggregated and plotted in the legend
        - values (str): Name of the column that is aggregated
        - yerr (str): Name of the column to group responses by. Typically,
            this is the column with subject IDs to remove outliers for
            each participant separately (based on their mean and std)

    :Returns:
        A `pandas.Panel`
    """
    if type(rows) != list:
        rows = [rows]
    if yerr is not None:
        if type(yerr) in [list, tuple]:
            if len(yerr) > 1:
                raise ValueError('Only one field can be used for '
                                 'calculating error bars.')
            else:
                yerr = yerr[0]

    if cols is None:
        if yerr is None:
            allconds = rows
        else:
            allconds = rows + [yerr]
    else:
        if type(cols) != list:
            cols = [cols]
        if yerr is None:
            allconds = rows + cols
        else:
            allconds = rows + cols + [yerr]

    if yerr is None:
        panel = _agg_df(df, rows=rows, cols=cols, values=values,
                        value_filter=value_filter, func=func)
    else:
        if df[values].dtype in [str, object]:  # calculate accuracy
            size = df.groupby(allconds)[values].size()
            if value_filter is not None:
                dff = df[df[values] == value_filter]
            else:
                raise Exception('value_filter must be defined when '
                                'aggregating over str or object types')
            size_filter = dff.groupby(allconds)[values].size()
            agg = size_filter / size.astype(float)
        else:
            if func == 'mean':
                agg = df.groupby(allconds)[values].mean()
            elif func == 'median':
                agg = df.groupby(allconds)[values].median()

        agg = unstack(agg, level=yerr)
        columns = agg.columns
        panel = {}

        for col in columns:
            if cols is None:
                df_col = pandas.DataFrame({'data': agg[col]})
                panel[col] = df_col
            else:
                df_col = agg[col].reset_index()
                panel[col] = pandas.pivot_table(df_col, rows=rows,
                                                cols=cols, values=col)

    return pandas.Panel(panel)


def _agg_df(df, rows=None, cols=None, values=None,
            value_filter=None, func='mean'):
    if df[values].dtype in [str, object]:  # calculate accuracy
        size = pandas.pivot_table(df, rows=rows, cols=cols, values=values,
                                  aggfunc=np.size)
        if value_filter is not None:
            dff = df[df[values] == value_filter]
        else:
            raise Exception('value_filter must be defined when aggregating '
                            'over str or object types')
        size_filter = pandas.pivot_table(dff, rows=rows, cols=cols,
                                         values=values, aggfunc=np.size)
        agg = size_filter / size.astype(float)
    else:
        if func == 'mean':
            aggfunc = np.mean
        elif func == 'median':
            aggfunc = np.median
        agg = pandas.pivot_table(df, rows=rows, cols=cols, values=values,
                                 aggfunc=aggfunc)
    return {'column': agg}


def nan_outliers(df, values=None, group=None):
    """
    Removes outliers more than 3 standard deviations away from the mean.

    :Args:
        df (pandas.DataFrame): A DataFrame with your data

    :Kwargs:
        - values (str): Name of the column that needs to have outliers
            removed
        - group (str): Name of the column to group responses by.
            Typically, this is the column with subject IDs, so that you
            remove outliers for each participant separately (based on
            their mean and std)

    :Returns:
        df (pandas.DataFrame): A DataFrame without outliers
    """
    zscore = lambda x: (x - x.mean()) / x.std()
    tdf = df.groupby(group)[values].transform(zscore)
    df[values] = np.select([tdf < -3, tdf > 3], [np.nan, np.nan],
                           default=df[values])
    return df


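# A sketch of per-participant outlier removal (made-up data; one obvious
# outlier is planted and should come back as NaN):
def _example_nan_outliers():
    df = df_fromdict([('subjid', ['s1', 's2']),
                      ('cond', ['easy', 'hard'])], repeat=50)
    df['rt'] = np.random.randn(len(df))
    df['rt'][0] = 100.  # plant an outlier
    df = nan_outliers(df, values='rt', group='subjid')
    print df['rt'].isnull().sum()  # the planted outlier is now NaN

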
def stats_test(agg, test='ttest'):
    array1 = agg.ix[:, 0:len(agg.columns):2]
    array2 = agg.ix[:, 1:len(agg.columns):2]
    t, p = scipy.stats.ttest_rel(array1, array2, axis=0)
    print agg.columns.levels[0].values.tolist()
    print 't = %.2f, p = %.5f, df = %d' % (t[0], p[0], len(agg) - 1)
    return

    # NOTE: the code below is unreachable because of the `return` above;
    # it also expects a 3-dimensional `agg` and an undefined `plotOpt`.
    # It is kept for reference only.
    d = agg.shape[0]

    if test == 'ttest':  # 2-tail t-test
        npairs = agg.shape[1] * (agg.shape[1] - 1) / 2
        ttest = (np.zeros((npairs, agg.shape[2])),
                 np.zeros((npairs, agg.shape[2])))
        ii = 0
        for c1 in range(agg.shape[1]):
            for c2 in range(c1 + 1, agg.shape[1]):
                thisTtest = scipy.stats.ttest_rel(agg[:, c1, :],
                                                  agg[:, c2, :], axis=0)
                ttest[0][ii, :] = thisTtest[0]
                ttest[1][ii, :] = thisTtest[1]
                ii += 1
        ttestPrint(title='**** 2-tail T-Test of related samples ****',
                   values=ttest, plotOpt=plotOpt, type=2)

    elif test == 'ttest_1samp':  # one-sample t-test
        m = .5
        oneSample = scipy.stats.ttest_1samp(agg, m, axis=0)
        ttestPrint(title='**** One-sample t-test: difference from '
                         '%.2f ****' % m,
                   values=oneSample, plotOpt=plotOpt, type=1)

    elif test == 'binomial':  # binomial test
        binom = np.apply_along_axis(scipy.stats.binom_test, 0, agg)
        print binom
        return binom


def ttestPrint(title='****', values=None, xticklabels=None, legend=None,
               bon=None):
    d = 8
    # check if there are any negative t values (for formatting purposes)
    neg = np.any([np.any(val < 0) for val in values])

    print '\n' + title
    for xi, xticklabel in enumerate(xticklabels):
        print xticklabel
        maxleg = max([len(leg) for leg in legend])
        # `q.combinations` in the original; assuming itertools was intended
        pairs = itertools.combinations(legend, 2)
        legendnames = ['%*s' % (maxleg, p[0]) + ' vs ' +
                       '%*s' % (maxleg, p[1]) for p in pairs]
        for yi, legendname in enumerate(legendnames):
            if values[0].ndim == 1:
                t = values[0][xi]
                p = values[1][xi]
            else:
                t = values[0][yi, xi]
                p = values[1][yi, xi]

            if p < .001 / bon:
                star = '***'
            elif p < .01 / bon:
                star = '**'
            elif p < .05 / bon:
                star = '*'
            else:
                star = ''

            if neg and t > 0:
                # extra space aligns positive t values with negative ones
                outputStr = '    %(s)s: t(%(d)d) =  %(t).3f, ' \
                            'p = %(p).3f %(star)s'
            else:
                outputStr = '    %(s)s: t(%(d)d) = %(t).3f, ' \
                            'p = %(p).3f %(star)s'
            print outputStr % {'s': legendname, 'd': d - 1, 't': t,
                               'p': p, 'star': star}


def oneway_anova(data):
    """
    Calculates one-way ANOVA on a pandas.DataFrame.

    :Args:
        data: `pandas.DataFrame`
            Rows contain groups (e.g., different conditions), columns have
            samples (e.g., participants)

    :Returns:
        F: float
            F-value
        p: float
            p-value
        k-1: int
            Between-group degrees of freedom
        N-k: int
            Within-group degrees of freedom
    """
    F, p = scipy.stats.f_oneway(*[d[1] for d in data.iterrows()])
    k = len(data)  # number of conditions
    N = k * len(data.columns)  # conditions times participants
    return F, p, k - 1, N - k


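# A sketch of a one-way ANOVA on simulated data: rows are conditions
# (groups), columns are participants (samples), as in the docstring above:
def _example_oneway_anova():
    data = pandas.DataFrame(np.random.randn(3, 20),
                            index=['cond1', 'cond2', 'cond3'])
    F, p, df_between, df_within = oneway_anova(data)
    print 'F(%d, %d) = %.2f, p = %.3f' % (df_between, df_within, F, p)

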
def p_corr(df1, df2=None):
    """
    Computes the Pearson correlation and its significance (using a t
    distribution) on a pandas.DataFrame.

    Ignores null values when computing significance. Based on `this
    Wikipedia entry
    <http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient#Testing_using_Student.27s_t-distribution>`_

    :Args:
        df1: `pandas.DataFrame`
            one dataset
        df2: `pandas.DataFrame` or `pandas.Series` (single column only)
            another dataset

    :Returns:
        corrs: `pandas.DataFrame`
            correlations between the datasets
        ts: `pandas.DataFrame`
            the associated t-values
        ps: `pandas.DataFrame`
            one-tailed p-values that the two datasets differ
    """
    if df2 is not None:
        if isinstance(df2, pandas.Series):
            df2 = pandas.DataFrame(df2)
        elif df2.shape[1] > 1:
            raise Exception('df2 must be a single column')
        corrs = df1.corrwith(df2)
    else:
        corrs = df1.corr()
    Ns = corrs.copy()
    ts = corrs.copy()
    ps = corrs.copy()
    for colname, col in corrs.iteritems():
        for rowname, corr in col.iteritems():
            N = min(df1[colname].count(), df1[rowname].count())
            Ns.loc[rowname, colname] = N
            t = corr * np.sqrt((N - 2) / (1 - corr ** 2))
            ts.loc[rowname, colname] = t
            # one-tailed p-value
            ps.loc[rowname, colname] = scipy.stats.t.sf(abs(t), N - 2)
    return corrs, ts, ps


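# A sketch of correlating all columns of a DataFrame with each other
# (made-up measures 'm1' and 'm2'); note that on the diagonal r = 1, so
# the associated t-value degenerates to infinity:
def _example_p_corr():
    df = pandas.DataFrame({'m1': np.random.randn(30),
                           'm2': np.random.randn(30)})
    corrs, ts, ps = p_corr(df)
    print corrs
    print ps  # one-tailed p-values

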
def reliability(panel, level=1, niter=100, func='mean'):
    """
    Computes data reliability by splitting a data set into two random
    subsets of half the set size and correlating them.
    """
    if func == 'corr':
        pass
    else:
        try:
            getattr(np, func)
        except AttributeError:
            raise
    subjIDs = panel.items.tolist()
    corrs = []
    for n in range(niter):
        np.random.shuffle(subjIDs)
        split1inds = subjIDs[:len(subjIDs) / 2]
        split2inds = subjIDs[len(subjIDs) / 2:]
        split1 = pandas.concat([panel[item] for item in panel.items
                                if item in split1inds])
        split2 = pandas.concat([panel[item] for item in panel.items
                                if item in split2inds])
        split1 = split1.mean(0)
        split2 = split2.mean(0)
        for lev in range(level):
            split1 = split1.stack()
            split2 = split2.stack()
        corrs.append(split1.corr(split2))

    N = np.sum(split1.notnull())
    corr = np.mean(corrs)
    t = corr * np.sqrt((N - 2) / (1 - corr ** 2))
    p = 1 - scipy.stats.t.cdf(abs(t), N - 2)  # one-tailed
    # Spearman-Brown prediction for twice the amount of data;
    # we need this because we use only half of the data here
    p = 2 * p / (1. + p)
    return corr, t, p


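# A sketch of split-half reliability on a (pre-0.20 pandas) Panel with
# items as participants; `level=0` is used because the per-split means
# here are already flat Series (all values made up):
def _example_reliability():
    panel = pandas.Panel(np.random.randn(10, 8, 6),
                         items=['subj%d' % i for i in range(10)])
    corr, t, p = reliability(panel, level=0, niter=20)
    print 'split-half r = %.2f, p = %.3f' % (corr, p)

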
def mds(mean, ndim=2):
    """
    Multidimensional scaling of a similarity matrix, returned as a
    pandas.DataFrame with named columns (for plotting).
    """
    if mean.ndim != 2:
        raise Exception('Similarity matrix for MDS must have exactly 2 '
                        'dimensions but %d were found.' % mean.ndim)
    res = classical_mds(np.array(mean), ndim=ndim)
    if ndim <= 3:
        columns = ['x', 'y', 'z']
    else:
        columns = ['x%d' % i for i in range(ndim)]
    res = pandas.DataFrame(res, index=mean.index, columns=columns[:ndim])
    res.columns.names = ['cols.mds']
    return res


def classical_mds(d, ndim=2):
    """
    Metric unweighted classical multidimensional scaling.

    Based on Forrest W. Young's notes on Torgerson's (1952) algorithm as
    presented in http://forrest.psych.unc.edu/teaching/p230/Torgerson.pdf:

        Step 0: Make the data matrix symmetric with zeros on the diagonal
        Step 1: Double center the data matrix (d) to obtain B by removing
                row and column means and adding the grand mean of the
                squared data
        Step 2: Solve B = U * L * U.T for U and L
        Step 3: Calculate X = U * L**.5

    :Args:
        - d: `numpy.ndarray`
            A symmetric dissimilarity matrix

    :Kwargs:
        - ndim (int, default: 2)
            The number of dimensions to project to

    :Returns:
        X[:, :ndim]: a `numpy.ndarray` with `ndim` columns representing
        the multidimensionally scaled data.
    """
    # Step 0: make distances symmetric
    d = (d + d.T) / 2
    # the diagonal must be 0
    np.fill_diagonal(d, 0)

    # Step 1: double centering
    # remove the row and column means and add the grand mean of d**2
    oo = d ** 2
    rowmean = np.tile(np.mean(oo, 1), (oo.shape[1], 1)).T
    colmean = np.mean(oo, 0)
    B = -.5 * (oo - rowmean - colmean + np.mean(oo))

    # Step 2: singular value decomposition to find
    # U (eigenvectors) and L (eigenvalues)
    [U, L, V] = np.linalg.svd(B)  # L is already sorted (descending)

    # Step 3: X = U * L**.5
    X = U * np.sqrt(L)
    return X[:, :ndim]


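# A quick self-check of `classical_mds` (not from the original module):
# project a pairwise Euclidean distance matrix back to 2D; the result
# matches the original points up to rotation, reflection and translation.
def _example_classical_mds():
    pts = np.random.rand(5, 2)  # five points in the plane
    diff = pts[:, np.newaxis, :] - pts[np.newaxis, :, :]
    dist = np.sqrt((diff ** 2).sum(axis=-1))  # pairwise distances
    proj = classical_mds(dist, ndim=2)
    print proj

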
if __name__ == '__main__':
    pass