#!/usr/bin/env python
# Part of the psychopy_ext library
# Copyright 2010-2014 Jonas Kubilius
# The program is distributed under the terms of the GNU General Public License,
# either version 3 of the License, or (at your option) any later version.
"""
A collection of useful descriptive and basic statistical functions for
psychology research. Some functions are meant to improve `scipy.stats`
functionality and integrate seamlessly with `pandas`.
"""
import itertools  # used by ttestPrint

import numpy as np
import scipy.stats
import pandas
import plot

try:
    from collections import OrderedDict
except ImportError:  # Python < 2.7 fallback
    from exp import OrderedDict
def aggregate(df, rows=None, cols=None, values=None,
              subplots=None, yerr=None, aggfunc='mean', unstacked=False,
              order='natural'):
"""
Aggregates data over specified columns.
:Args:
df (pandas.DataFrame)
A DataFrame with your data
    :Kwargs:
        - rows (str or list of str, default: None)
            Name(s) of column(s) that will be aggregated and plotted on the
            x-axis
        - cols (str or list of str, default: None)
            Name(s) of column(s) that will be shown in the legend
        - values (str or list of str, default: None)
            Name(s) of the column(s) that will be aggregated
        - subplots (str or list of str, default: None)
            Name(s) of column(s) over which separate subplots are made
        - yerr (str, default: None)
            Name of the column for the y-errorbar calculation. Typically,
            this is the column with participant IDs.
        - aggfunc (str or a 1d function, default: 'mean')
            A function to use for aggregation. If a string, it is interpreted
            as a `numpy` function.
        - unstacked (bool, default: False)
            If True, returns an unstacked version of aggregated data (i.e.,
            rows in rows and columns in columns). Useful for printing and
            other non-plotting tasks.
        - order (str, {'natural', 'sorted'}, default: 'natural')
            If 'natural', attempts to keep the original order of the data
            (as in the input file); this works only in some cases. If
            'sorted', the output is sorted as pandas does by default.
    :Returns:
        A `pandas.DataFrame` where data has been aggregated in the following
        MultiIndex format:

            - columns:
                - level 0: subplots
                - levels 1 to n-2: rows
                - level n-1: cols
            - rows:
                yerr

        This format makes it easy to do further computations such as taking
        the mean over `yerr`: a simple `df.mean()` will do the trick.
:See also:
:func:`accuracy`
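
    :Example:
        A minimal usage sketch; the column names (``cond``, ``subjid`` and
        ``rt``) are hypothetical::

            agg = aggregate(df, rows='cond', values='rt', yerr='subjid')
            agg.mean()  # mean across participants, per condition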
"""
df = pandas.DataFrame(df) # make sure it's a DataFrame
_splits = {}
if isinstance(subplots, str):
subplots = [subplots]
_splits['subplots'] = subplots
elif subplots is None:
subplots = []
if isinstance(rows, str):
rows = [rows]
_splits['rows'] = rows
elif rows is None:
rows = []
if isinstance(cols, str):
cols = [cols]
_splits['cols'] = cols
elif cols is None:
cols = []
if isinstance(yerr, str):
yerr = [yerr]
_splits['yerr'] = yerr
elif yerr is None:
yerr = []
if values is None:
raise Exception('You must provide the name(s) of the column(s) that '
'is/are aggregated.')
allconds = subplots + rows + cols + yerr
#allconds = [c for c in allconds if c is not None]
    if isinstance(aggfunc, str):
        # interpret a string as the name of a numpy function, e.g. 'mean'
        aggfunc = getattr(np, aggfunc)
agg = df.groupby(allconds)[values].aggregate(aggfunc)
if len(yerr) > 0: # if yerr present, yerr is in rows, the rest in cols
if isinstance(values, list):
agg = agg[values[0]]
for yr in yerr:
agg = unstack(agg, level=yr)
agg = agg.T
else:
agg = pandas.DataFrame(agg).T
# sort data nicely
if isinstance(order, dict):
order_full = order
else:
order_full = dict([(col, order) for col in agg.columns.names])
for level, level_ord in order_full.items():
if isinstance(level_ord, str):
#col = '.'.join(level.split('.')[1:])
if level_ord == 'natural':
#import pdb; pdb.set_trace()
thisord = df[level].unique() # preserves order
elif level_ord == 'sorted':
thisord = np.unique(df[level]) # doesn't preserve order
else:
raise Exception('Ordering %s not recognized.' % level_ord)
else:
thisord = level_ord
order_full[level] = thisord
agg = reorder(agg, order=order_full)
## rows should become rows, and cols should be cols if so desired
#if yerr[0] is None and unstacked:
#for row in splits['rows']:
#agg = unstack(agg, split='rows')
if isinstance(values, list):
agg.names = values[0]
else:
agg.names = values
agg._splits = _splits
return agg
def accuracy(df, values=None, correct='correct', incorrect='incorrect', **kwargs):
"""
Computes accuracy given correct and incorrect data labels.
:Args:
df (pandas.DataFrame)
Your data
:Kwargs:
        - values (str or list of str, default: None)
            Name(s) of the column(s) that will be aggregated
        - correct (str or a number or list of str or numbers, default: 'correct')
            Labels that are treated as correct responses.
        - incorrect (str or a number or list of str or numbers, default: 'incorrect')
            Labels that are treated as incorrect responses.
- kwargs
Anything else you want to pass to :func:`aggregate`. Note that
``aggfunc`` is set to ``np.size`` and you cannot change that.
    :Returns:
        A `pandas.DataFrame` in the format of :func:`aggregate` where the
        reported values are the fraction correct / (correct + incorrect).
    :See also:
        :func:`aggregate`
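
    :Example:
        A minimal usage sketch; the column names are hypothetical and the
        ``resp`` column is assumed to hold 'correct'/'incorrect' labels::

            acc = accuracy(df, values='resp', rows='cond', yerr='subjid')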
"""
if isinstance(correct, str):
correct = [correct]
if isinstance(incorrect, str):
incorrect = [incorrect]
if isinstance(values, list):
values = values[0]
corr = df[df[values].isin(correct)]
if len(corr) == 0:
raise Exception('There are no %s responses' % correct[0])
agg_corr = aggregate(corr, aggfunc=np.size, values=values, **kwargs)
df_all = df[df[values].isin(correct + incorrect)]
agg_all = aggregate(df_all, aggfunc=np.size, values=values, **kwargs)
agg = agg_corr.astype(float) / agg_all
agg.names = values
agg._splits = agg_corr._splits
return agg
def confidence(agg, kind='sem', within=None, alpha=.05, nsamples=None,
               skipna=True):
"""
Compute confidence of measurements, such as standard error of means (SEM)
or confidence intervals (CI).
    :Args:
        agg (pandas.DataFrame or pandas.Series)
            Aggregated data, typically the output of :func:`aggregate`, with
            participants (the `yerr` factor) in rows.
:Kwargs:
        - kind ('sem', 'ci', or 'binomial', default: 'sem')
            .. warning:: Binomial is not tested thoroughly
        - within (str or list, default: None)
            For repeated-measures designs, conventional error bars are too
            large. Specify which dimensions come from repeated measures
            (rows, cols, and/or subplots); within-subject confidence
            intervals are then computed using the method of
            Loftus & Masson (1994), as simplified by Cousineau (2005)
            with Morey's (2008) correction.
            Based on `Denis A. Engemann's gist <https://gist.github.com/dengemann/6030453>`_
- alpha (float, default: .05)
For CI and binomial. Computed for single-tail, so effectively
alpha/2.
        - nsamples (int, default: None)
            For binomial distribution confidence intervals when there is
            a single sample only (which presumably reflects the number
            of correct responses, i.e., successes). See `Wikipedia
            <http://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Normal_approximation_interval>`_
- skipna (default: True)
Whether to skip NA / null values or not.
.. warning:: Not tested thoroughly
:Returns:
mean, p_yerr
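
    :Example:
        A minimal usage sketch, assuming `agg` is the output of
        :func:`aggregate` with `yerr` set to the participant column::

            mean, sem = confidence(agg, kind='sem')
            mean, ci = confidence(agg, kind='ci', alpha=.05)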
"""
if isinstance(agg, pandas.DataFrame):
mean = agg.mean(skipna=skipna) # mean across participants
if within is not None:
if not isinstance(within, list):
within = [within]
levels = []
for dim in within:
tmp = [r for r in agg.columns.names if not r.startswith(dim + '.')]
levels += tmp
if len(levels) > 0:
#raise Exception('Could not find levels that start with any of the following: %s' % within)
# center data
#try:
try:
subj_mean = agg.mean(level=levels, skipna=skipna, axis=1) # mean per subject
except:
subj_mean = agg.mean(skipna=skipna)
else:
subj_mean = agg.mean(skipna=skipna)
grand_mean = mean.mean(skipna=skipna)
center = subj_mean - grand_mean
aggc = pandas.DataFrame(index=agg.index, columns=agg.columns)
            for colname, value in center.iteritems():
                # accessing the column first is needed as a workaround for a
                # pandas quirk (the assignment below crashes without it)
                aggc[colname]
                aggc[colname] = (agg[colname].T - value.T).T
ddof = 0 # Morey's correction
else:
aggc = agg.copy()
ddof = 1
if skipna:
count = agg.count() # skips NA by default
else:
count = len(agg)
confint = 1 - alpha/2.
if kind == 'sem':
p_yerr = aggc.std(skipna=skipna, ddof=ddof) / np.sqrt(count)
elif kind == 'ci':
p_yerr = aggc.std(skipna=skipna, ddof=ddof) / np.sqrt(count)
p_yerr *= scipy.stats.t.ppf(confint, count-1)
elif kind == 'binomial':
z = scipy.stats.norm.ppf(confint)
p_yerr = z * np.sqrt(mean * (1-mean) / nsamples)
else: # if there's only one condition? Not sure #FIXME
mean = agg
p_yerr = np.zeros((len(agg), 1))
return mean, p_yerr
def get_star(p):
    """
    Returns a significance marker ('***', '**', '*' or '') for a p-value.
    """
    if p < .001:
star = '***'
elif p < .01:
star = '**'
elif p < .05:
star = '*'
else:
star = ''
return star
def reorder(agg, order, level=None, dim='columns'):
"""
Reorders rows or columns in a pandas.DataFrame.
It relies on for loops, so it will be slow for large data frames.
:Args:
- agg (pandas.DataFrame)
Your (usually aggregated) data
        - order (list or dict)
            Order of entries. A list is only accepted if `level` is
            given. Otherwise, `order` should be a dict with level names
            (str) as keys and lists specifying the order as values.
:Kwargs:
- level (str, default: None)
Which level needs to be reordered. Levels must be str.
- dim (str, {'rows', 'index', 'columns'}, default: 'columns')
Whether to reorder rows (or index) or columns.
:Returns:
Reordered pandas.DataFrame
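
    :Example:
        A minimal usage sketch; the level name ``cond`` and its labels are
        hypothetical::

            agg = reorder(agg, order={'cond': ['easy', 'hard']})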
"""
if dim in ['index', 'rows'] or isinstance(agg, pandas.Series):
orig_idx = agg.index
else:
orig_idx = agg.columns
if not isinstance(order, dict):
order = {level: order}
#import pdb; pdb.set_trace()
for levelno, levelname in enumerate(orig_idx.names):
if not levelname in order:
# retain the existing order
this_order = pandas.DataFrame(orig_idx.tolist()) #[lev[levelno] for lev in orig_idx]
this_order = this_order.iloc[:, levelno].unique() # preserves order
#this_order = np.unique(this_order)
else:
this_order = order[levelname]
# convert to a list
order[levelname] = [i for i in this_order]
data = np.zeros((len(orig_idx), len(orig_idx.names)))
ind_sort = pandas.DataFrame(index=orig_idx, data=data, columns=orig_idx.names)
for rowno, (rowidx, row) in enumerate(ind_sort.iterrows()):
if not isinstance(rowidx, tuple):
levelname = row.index[0]
row[levelname] = order[levelname].index(rowidx)
else:
for i, item in enumerate(rowidx):
levelname = row.index[i]
row[levelname] = order[levelname].index(item)
ind_sort.sort(columns=ind_sort.columns.tolist(), inplace=True)
multidx = ind_sort.index
if dim in ['index', 'rows'] or isinstance(agg, pandas.Series):
agg_new = agg.reindex(index=multidx)
else:
agg_new = agg.reindex(columns=multidx)
return agg_new
def unstack(data, level=-1):
"""
Unstacking that preserves label order, unlike in pandas.
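
    A minimal usage sketch, assuming `data` carries an index level named
    ``subjid``::

        wide = unstack(data, level='subjid')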
"""
if isinstance(level, (str, int)):
levels = [level]
else:
levels = level
unstacked = data.copy()
# reorder the newly unstacked levels
for lev in levels:
order = data.index.get_level_values(lev).unique()
unstacked = unstacked.unstack(level=lev)
unstacked = reorder(unstacked, order=order, level=lev, dim='columns')
return unstacked
def stack(data, level=-1):
"""
Stacking that preserves label order, unlike in pandas.
"""
if not isinstance(level, (list, tuple)):
levels = [level]
else:
levels = level
stacked = data.copy()
for lev in levels:
order = data.columns.get_level_values(lev).unique()
stacked = stacked.stack(level=lev)
stacked = reorder(stacked, order=order, level=lev, dim='index')
return stacked
def df_fromdict(data, repeat=1):
"""
Produces a factorial DataFrame from a dict or list of tuples.
    For example, suppose you want to generate a DataFrame like this::

             a  b
        0  one  0
        1  one  1
        2  two  0
        3  two  1

    This function generates such output simply by providing the following::

        df_fromdict([('a', ['one', 'two']), ('b', [0, 1])])
:Args:
data: dict or a list of tuples
Data used to produce a DataFrame. Keys specify column names, and
values specify possible (unique) values.
:Kwargs:
repeat: int (default: 1)
How many times everything should be repeated. Useful if you want to
simulate multiple samples of each condition, for example.
    :Returns:
        A pandas.DataFrame with columns named after the keys of `data`.
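
    :Example:
        A sketch of the `repeat` keyword; each combination of conditions
        appears twice::

            df = df_fromdict([('a', ['one', 'two']), ('b', [0, 1])], repeat=2)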
"""
data = OrderedDict(data)
    count = [len(vals) for vals in data.values()]  # number of levels per factor
df = {}
for i, (key, vals) in enumerate(data.items()):
rep = np.repeat(vals, np.product(count[i+1:]))
tile = np.tile(rep, np.product(count[:i]))
df[key] = np.repeat(tile, repeat)
df = pandas.DataFrame(df, columns=data.keys())
return df
def _aggregate_panel(df, rows=None, cols=None, values=None,
value_filter=None, yerr=None, func='mean'):
"""
Aggregates `pandas.DataFrame` over specified dimensions into `pandas.Panel`.
Forces output to always be a Panel.
Shape:
- items: `yerr`
- major_xs: `rows`
- minor_xs: `cols`
TODO: convert between DataFrame/Panel upon request
    TODO: use `reindexing <http://pandas.pydata.org/pandas-docs/dev/indexing.html#advanced-reindexing-and-alignment-with-hierarchical-index>`_
        to keep a stable order
:Args:
df (pandas.DataFrame): A DataFrame with your data
:Kwargs:
        - rows (str or list of str): Name(s) of column(s) that will be
            aggregated and plotted on the x-axis
        - cols (str or list of str): Name(s) of column(s) that will be
            aggregated and plotted in the legend
        - values (str): Name of the column that is aggregated
        - yerr (str): Name of the column to group responses by. Typically,
            this is the column with subject IDs to remove outliers for each
            participant separately (based on their mean and std)
:Returns:
A `pandas.Panel`
"""
if type(rows) != list: rows = [rows]
#if yerr is None:
#yerr = []
if yerr is not None:
if type(yerr) in [list, tuple]:
if len(yerr) > 1:
                raise ValueError('Only one field can be used for calculating '
                                 'error bars.')
else:
yerr = yerr[0]
if cols is None:
if yerr is None:
allconds = rows
else:
allconds = rows + [yerr]
else:
if type(cols) != list: cols = [cols]
if yerr is None:
allconds = rows + cols
else:
allconds = rows + cols + [yerr]
if yerr is None:
panel = _agg_df(df, rows=rows, cols=cols, values=values,
value_filter=value_filter, func=func)
else:
if df[values].dtype in [str, object]: # calculate accuracy
size = df.groupby(allconds)[values].size()
if value_filter is not None:
dff = df[df[values] == value_filter]
else:
raise Exception('value_filter must be defined when aggregating '
'over str or object types')
size_filter = dff.groupby(allconds)[values].size()
agg = size_filter / size.astype(float)
else:
if func == 'mean':
agg = df.groupby(allconds)[values].mean()
elif func == 'median':
agg = df.groupby(allconds)[values].median()
agg = unstack(agg, level=yerr) #agg.unstack(yerr)
columns = agg.columns
panel = {}
for col in columns:
if cols is None:
#if yerr is None:
#df_col = pandas.DataFrame({'data': agg})
#else:
df_col = pandas.DataFrame({'data': agg[col]})
panel[col] = df_col
else:
df_col = agg[col].reset_index()
#import pdb; pdb.set_trace()
panel[col] = pandas.pivot_table(df_col, rows=rows, cols=cols,
values=col)
return pandas.Panel(panel)
def _agg_df(df, rows=None, cols=None, values=None,
value_filter=None, func='mean'):
if df[values].dtype in [str, object]: # calculate accuracy
size = pandas.pivot_table(df, rows=rows, cols=cols, values=values,
aggfunc=np.size)
#df.groupby(allconds)[values].size()
if value_filter is not None:
dff = df[df[values] == value_filter]
else:
raise Exception('value_filter must be defined when aggregating '
'over str or object types')
size_filter = pandas.pivot_table(dff, rows=rows, cols=cols, values=values,
aggfunc=np.size)
agg = size_filter / size.astype(float)
else:
if func == 'mean':
aggfunc = np.mean
elif func == 'median':
aggfunc = np.median
agg = pandas.pivot_table(df, rows=rows, cols=cols, values=values,
aggfunc=aggfunc)
return {'column': agg}
def nan_outliers(df, values=None, group=None):
"""
    Removes outliers that are more than 3 standard deviations away from the
    mean.
    :Args:
        df (pandas.DataFrame)
            A DataFrame with your data
    :Kwargs:
        - values (str): Name of the column that needs to have outliers removed
        - group (str): Name of the column to group responses by. Typically,
            this is the column with subject IDs, so that outliers are removed
            for each participant separately (based on their mean and std)
    :Returns:
        df (pandas.DataFrame): A DataFrame without outliers
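
    :Example:
        A minimal usage sketch; the column names (``rt``, ``subjid``) are
        hypothetical::

            df = nan_outliers(df, values='rt', group='subjid')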
"""
zscore = lambda x: (x - x.mean()) / x.std()
tdf = df.groupby(group)[values].transform(zscore)
df[values] = np.select([tdf<-3, tdf>3],[np.nan, np.nan],
default=df[values])
return df
def stats_test(agg, test='ttest'):
    """
    Runs a related-samples t-test between the even- and odd-numbered columns
    of the aggregated data and prints the result.
    """
    #import pdb; pdb.set_trace()
array1 = agg.ix[:, 0:len(agg.columns):2]
array2 = agg.ix[:, 1:len(agg.columns):2]
t, p = scipy.stats.ttest_rel(array1, array2, axis=0)
#import pdb; pdb.set_trace()
print agg.columns.levels[0].values.tolist()
print 't = %.2f, p = %.5f, df = %d' % (t[0], p[0], len(agg)-1)
    return

    # NOTE: the code below is never reached; it is a legacy implementation
    # kept for reference (it relies on an undefined `plotOpt` and an older
    # `ttestPrint` signature).
    d = agg.shape[0]
if test == 'ttest':
# 2-tail T-Test
ttest = (np.zeros((agg.shape[1]*(agg.shape[1]-1)/2, agg.shape[2])),
np.zeros((agg.shape[1]*(agg.shape[1]-1)/2, agg.shape[2])))
ii = 0
for c1 in range(agg.shape[1]):
for c2 in range(c1+1,agg.shape[1]):
                thisTtest = scipy.stats.ttest_rel(agg[:,c1,:], agg[:,c2,:], axis=0)
ttest[0][ii,:] = thisTtest[0]
ttest[1][ii,:] = thisTtest[1]
ii += 1
ttestPrint(title = '**** 2-tail T-Test of related samples ****',
values = ttest, plotOpt = plotOpt,
type = 2)
elif test == 'ttest_1samp':
# One-sample t-test
m = .5
        oneSample = scipy.stats.ttest_1samp(agg, m, axis=0)
ttestPrint(title = '**** One-sample t-test: difference from %.2f ****' %m,
values = oneSample, plotOpt = plotOpt, type = 1)
elif test == 'binomial':
# Binomial test
        binom = np.apply_along_axis(scipy.stats.binom_test, 0, agg)
print binom
return binom
def ttestPrint(title = '****', values = None, xticklabels = None, legend = None, bon = None):
d = 8
# check if there are any negative t values (for formatting purposes)
if np.any([np.any(val < 0) for val in values]): neg = True
else: neg = False
print '\n' + title
for xi, xticklabel in enumerate(xticklabels):
print xticklabel
maxleg = max([len(leg) for leg in legend])
# if type == 1: legendnames = ['%*s' %(maxleg,p) for p in plotOpt['subplot']['legend.names']]
# elif type == 2:
    pairs = itertools.combinations(legend, 2)
legendnames = ['%*s' %(maxleg,p[0]) + ' vs ' + '%*s' %(maxleg,p[1]) for p in pairs]
#print legendnames
for yi, legendname in enumerate(legendnames):
if values[0].ndim == 1:
t = values[0][xi]
p = values[1][xi]
else:
t = values[0][yi,xi]
p = values[1][yi,xi]
if p < .001/bon: star = '***'
elif p < .01/bon: star = '**'
elif p < .05/bon: star = '*'
else: star = ''
            if neg and t > 0:
                # extra leading space so positive t values align with negative ones
                outputStr = '  %(s)s: t(%(d)d) = %(t).3f, p = %(p).3f %(star)s'
            else:
                outputStr = ' %(s)s: t(%(d)d) = %(t).3f, p = %(p).3f %(star)s'
print outputStr \
%{'s': legendname, 'd':(d-1), 't': t,
'p': p, 'star': star}
def oneway_anova(data):
"""
Calculates one-way ANOVA on a pandas.DataFrame.
:Args:
        data: `pandas.DataFrame`
rows contain groups (e.g., different conditions),
columns have samples (e.g., participants)
:Returns:
F: float
F-value
p: float
p-value
k-1: int
Between Group degrees of freedom
N-k: int
Within Group degrees of freedom
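
    :Example:
        A minimal sketch with three conditions (rows) and five samples each::

            data = pandas.DataFrame(np.random.randn(3, 5))
            F, p, df_between, df_within = oneway_anova(data)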
"""
F, p = scipy.stats.f_oneway(*[d[1] for d in data.iterrows()])
k = len(data) # number of conditions
N = k*len(data.columns) # conditions times participants
return F, p, k-1, N-k
def p_corr(df1, df2=None):
"""
Computes Pearson correlation and its significance (using a t
distribution) on a pandas.DataFrame.
Ignores null values when computing significance. Based on
`this Wikipedia entry <http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient#Testing_using_Student.27s_t-distribution>`_
:Args:
df1: `pandas.DataFrame`
one dataset
        df2: `pandas.DataFrame` or `pandas.Series` (default: None)
            another dataset; if None, all columns of `df1` are correlated
            with each other
:Returns:
corr: float
correlation between the two datasets
t: float
an associated t-value
p: float
            a one-tailed p-value for the significance of the correlation
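
    :Example:
        A minimal usage sketch correlating all columns of one dataset with
        each other::

            corrs, ts, ps = p_corr(df)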
"""
if df2 is not None:
if isinstance(df2, pandas.Series):
df2 = pandas.DataFrame(df2)
elif df2.shape[1] > 1:
raise Exception('Cannot correlate two DataFrames')
corrs = df1.corrwith(df2)
else:
corrs = df1.corr()
Ns = corrs.copy()
ts = corrs.copy()
ps = corrs.copy()
for colname, col in corrs.iteritems():
for rowname, corr in col.iteritems():
N = min(df1[colname].count(), df1[rowname].count())
Ns.loc[rowname, colname] = N
t = corr * np.sqrt((N-2) / (1-corr**2))
ts.loc[rowname, colname] = t
ps.loc[rowname, colname] = scipy.stats.t.sf(abs(t), N-2) # one-tailed
#import pdb; pdb.set_trace()
#N = np.sum(df1.notnull())
#t = corr * (N-2).applymap(np.sqrt) / (1-corr**2))
#p = 1-scipy.stats.t.cdf(t.abs(), N-2) # one-tailed
return corrs, ts, ps
def reliability(panel, level=1, niter=100, func='mean'):
"""
    Computes data reliability by splitting a data set into two random
    subsets of half the set size.
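
    A minimal usage sketch, assuming `panel` is a `pandas.Panel` with one
    item per participant::

        corr, t, p = reliability(panel, niter=100)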
"""
if func == 'corr':
pass
else:
try:
getattr(np, func)
except:
raise
subjIDs = panel.items.tolist()
corrs = []
for n in range(niter):
np.random.shuffle(subjIDs)
split1inds = subjIDs[:len(subjIDs)/2]
split2inds = subjIDs[len(subjIDs)/2:]
split1 = pandas.concat([panel[item] for item in panel.items
if item in split1inds])
split2 = pandas.concat([panel[item] for item in panel.items
if item in split2inds])
split1 = split1.mean(0)
split2 = split2.mean(0)
for lev in range(level):
split1 = split1.stack()
split2 = split2.stack()
corrs.append(split1.corr(split2))
N = np.sum(split1.notnull())
corr = np.mean(corrs)
t = corr*np.sqrt((N-2)/(1-corr**2))
p = 1-scipy.stats.t.cdf(abs(t),N-2) # one-tailed
p = 2*p/(1.+p) # Spearman-Brown prediction for twice
# the amount of data
# we need this because we use only half
# of the data here
return corr, t, p
def mds(mean, ndim=2):
    """
    Runs classical multidimensional scaling on a (dis)similarity matrix and
    returns the projection as a labeled `pandas.DataFrame`.
    :See also:
        :func:`classical_mds`
    """
    if mean.ndim != 2:
raise Exception('Similarity matrix for MDS must have exactly 2 '
'dimensions but only %d were found.' % mean.ndim)
res = classical_mds(np.array(mean), ndim=ndim)
if ndim <= 3:
columns = ['x', 'y', 'z']
else:
columns = ['x%d' % i for i in range(ndim)]
res = pandas.DataFrame(res, index=mean.index, columns=columns[:ndim])
res.columns.names = ['cols.mds']
return res
def classical_mds(d, ndim=2):
"""
Metric Unweighted Classical Multidimensional Scaling
Based on Forrest W. Young's notes on Torgerson's (1952) algorithm as
presented in http://forrest.psych.unc.edu/teaching/p230/Torgerson.pdf:
Step 0: Make data matrix symmetric with zeros on the diagonal
Step 1: Double center the data matrix (d) to obtain B by removing row
and column means and adding the grand mean of the squared data
Step 2: Solve B = U * L * U.T for U and L
Step 3: Calculate X = U * L**(-.5)
    :Args:
        d (numpy.ndarray)
            A symmetric dissimilarity matrix
    :Kwargs:
        ndim (int, default: 2)
            The number of dimensions to project to
    :Returns:
        X[:, :ndim] (numpy.ndarray)
            A `numpy.ndarray` with `ndim` columns representing the
            multidimensionally scaled data.
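
    :Example:
        A minimal sketch projecting a small dissimilarity matrix to two
        dimensions::

            d = np.array([[0., 1., 4.],
                          [1., 0., 3.],
                          [4., 3., 0.]])
            xy = classical_mds(d, ndim=2)
    """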
"""
# Step 0
# make distances symmetric
d = (d + d.T) / 2
# diagonal must be 0
np.fill_diagonal(d, 0)
# Step 1: Double centering
# remove row and column mean and add grand mean of d**2
oo = d**2
rowmean = np.tile(np.mean(oo, 1), (oo.shape[1],1)).T
colmean = np.mean(oo, 0)
B = -.5 * (oo - rowmean - colmean + np.mean(oo))
# Step2: do singular value decomposition
# find U (eigenvectors) and L (eigenvalues)
    [U, L, V] = np.linalg.svd(B)  # L is already sorted (descending)
# Step 3: X = U*L**(-.5)
X = U * np.sqrt(L)
return X[:,:ndim]
if __name__ == '__main__':
pass