Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/patsy/state.py : 36%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of Patsy
2# Copyright (C) 2011 Nathaniel Smith <njs@pobox.com>
3# See file LICENSE.txt for license information.
5# Stateful transform protocol:
6# def __init__(self):
7# pass
8# def memorize_chunk(self, input_data):
9# return None
10# def memorize_finish(self):
11# return None
12# def transform(self, input_data):
13# return output_data
15# BETTER WAY: always run the first row of data through the builder alone, and
16# check that it gives the same output row as when running the whole block of
17# data through at once. This gives us the same information, but it's robust
18# against people writing their own centering functions.
20# QUESTION: right now we refuse to even fit a model that contains a
21# my_transform(x)-style function. Maybe we should allow it to be fit (with a
22# warning), and only disallow making predictions with it? Need to revisit this
23# question once it's clearer what exactly our public API will look like,
24# because right now I'm not sure how to tell whether we are being called for
25# fitting versus being called for prediction.
27from functools import wraps
28import numpy as np
29from patsy.util import (atleast_2d_column_default,
30 asarray_or_pandas, pandas_friendly_reshape,
31 wide_dtype_for, safe_issubdtype,
32 no_pickling, assert_no_pickling)
# These are made available in the patsy.* namespace
__all__ = [
    "stateful_transform",
    "center",
    "standardize",
    "scale",
]
def stateful_transform(class_):
    """Create a stateful transform callable object from a class that fulfills
    the :ref:`stateful transform protocol <stateful-transform-protocol>`.

    The returned callable runs the whole protocol in one shot on its
    arguments: it instantiates ``class_()`` with no arguments, passes the
    call's arguments to ``memorize_chunk``, calls ``memorize_finish``, and
    returns the result of ``transform`` called with the same arguments.

    The callable carries the metadata of ``class_`` (via
    :func:`functools.wraps`) and exposes the class itself as the attribute
    ``__patsy_stateful_transform__``.
    """
    def stateful_transform_wrapper(*args, **kwargs):
        # A fresh instance per call, so no state leaks between calls.
        instance = class_()
        instance.memorize_chunk(*args, **kwargs)
        instance.memorize_finish()
        return instance.transform(*args, **kwargs)

    # wraps() updates the function in place and returns that same function.
    wrapper = wraps(class_)(stateful_transform_wrapper)
    wrapper.__patsy_stateful_transform__ = class_
    return wrapper
52# class NonIncrementalStatefulTransform(object):
53# def __init__(self):
54# self._data = []
55#
56# def memorize_chunk(self, input_data, *args, **kwargs):
57# self._data.append(input_data)
58# self._args = _args
59# self._kwargs = kwargs
60#
61# def memorize_finish(self):
62# all_data = np.row_stack(self._data)
63# args = self._args
64# kwargs = self._kwargs
65# del self._data
66# del self._args
67# del self._kwargs
68# self.memorize_all(all_data, *args, **kwargs)
69#
70# def memorize_all(self, input_data, *args, **kwargs):
71# raise NotImplementedError
72#
73# def transform(self, input_data, *args, **kwargs):
74# raise NotImplementedError
75#
76# class QuantileEstimatingTransform(NonIncrementalStatefulTransform):
77# def memorize_all(self, input_data, *args, **kwargs):
class Center(object):
    """center(x)

    A stateful transform that centers input data, i.e., subtracts the mean.

    If input has multiple columns, centers each column separately.

    Equivalent to ``standardize(x, rescale=False)``
    """
    def __init__(self):
        # Running column-wise total of everything memorized so far. Starts
        # as None so that it takes its shape from the first chunk.
        self._sum = None
        # Number of rows memorized so far.
        self._count = 0

    def memorize_chunk(self, x):
        """Add one chunk of data to the running row count and column sums."""
        chunk = atleast_2d_column_default(x)
        self._count += chunk.shape[0]
        # Sum down the rows (axis 0), accumulating in a widened dtype.
        chunk_total = np.sum(chunk, 0, dtype=wide_dtype_for(chunk))
        # This is to handle potentially multi-column x's:
        if self._sum is None:
            self._sum = chunk_total
        else:
            self._sum += chunk_total

    def memorize_finish(self):
        """Nothing to finalize: the mean is computed on demand in transform."""
        pass

    def transform(self, x):
        """Return ``x`` with the memorized per-column mean subtracted.

        The result is reshaped back to the shape of the input.
        """
        x = asarray_or_pandas(x)
        # This doesn't copy data unless our input is a DataFrame that has
        # heterogeneous types. And in that case we're going to be munging the
        # types anyway, so copying isn't a big deal.
        values = np.asarray(x)
        # For integer input the mean is held as a float; otherwise it is
        # cast to the dtype of the input.
        if safe_issubdtype(values.dtype, np.integer):
            mean_dtype = float
        else:
            mean_dtype = values.dtype
        column_means = np.asarray(self._sum / self._count, dtype=mean_dtype)
        x_2d = atleast_2d_column_default(x, preserve_pandas=True)
        return pandas_friendly_reshape(x_2d - column_means, x.shape)

    __getstate__ = no_pickling
# One-shot callable form of the Center transform (exported as patsy.center).
center = stateful_transform(Center)
123# See:
124# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
125# or page 232 of Knuth vol. 3 (3rd ed.).
class Standardize(object):
    """standardize(x, center=True, rescale=True, ddof=0)

    A stateful transform that standardizes input data, i.e. it subtracts the
    mean and divides by the sample standard deviation.

    Either centering or rescaling or both can be disabled by use of keyword
    arguments. The `ddof` argument controls the delta degrees of freedom when
    computing the standard deviation (cf. :func:`numpy.std`). The default of
    ``ddof=0`` produces the maximum likelihood estimate; use ``ddof=1`` if you
    prefer the square root of the unbiased estimate of the variance.

    If input has multiple columns, standardizes each column separately.

    .. note:: This function computes the mean and standard deviation using a
       memory-efficient online algorithm, making it suitable for use with
       large incrementally processed data-sets.
    """
    def __init__(self):
        # Number of rows memorized so far.
        self.current_n = 0
        # Per-column running mean; allocated when the first chunk arrives.
        self.current_mean = None
        # Per-column running sum of squared deviations from the mean;
        # allocated when the first chunk arrives.
        self.current_M2 = None

    def memorize_chunk(self, x, center=True, rescale=True, ddof=0):
        """Update the running mean and squared-deviation sums with one chunk.

        ``center``, ``rescale`` and ``ddof`` are accepted so the signature
        matches ``transform``; they are not used while memorizing.
        """
        rows = atleast_2d_column_default(x)
        if self.current_mean is None:
            # The first chunk determines the column count and the dtype of
            # the accumulators.
            n_columns = rows.shape[1]
            accumulator_dtype = wide_dtype_for(rows)
            self.current_mean = np.zeros(n_columns, dtype=accumulator_dtype)
            self.current_M2 = np.zeros(n_columns, dtype=accumulator_dtype)
        # XX this can surely be vectorized but I am feeling lazy:
        for i in range(rows.shape[0]):
            row = rows[i, :]
            self.current_n += 1
            # Deviation from the mean *before* this row is folded in ...
            delta = row - self.current_mean
            self.current_mean += delta / self.current_n
            # ... multiplied by the deviation from the *updated* mean.
            self.current_M2 += delta * (row - self.current_mean)

    def memorize_finish(self):
        """Nothing to finalize: all state is kept up to date per chunk."""
        pass

    def transform(self, x, center=True, rescale=True, ddof=0):
        """Return a standardized copy of ``x``.

        Subtracts the memorized mean when ``center`` is true and divides by
        ``sqrt(M2 / (n - ddof))`` when ``rescale`` is true. The result is
        reshaped back to the shape of the converted input.
        """
        # XX: this forces all inputs to double-precision real, even if the
        # input is single- or extended-precision or complex. But I got all
        # tangled up in knots trying to do that without breaking something
        # else (e.g. by requiring an extra copy).
        x = asarray_or_pandas(x, copy=True, dtype=float)
        x_2d = atleast_2d_column_default(x, preserve_pandas=True)
        # The in-place operators below work on the copy requested above.
        if center:
            x_2d -= self.current_mean
        if rescale:
            x_2d /= np.sqrt(self.current_M2 / (self.current_n - ddof))
        return pandas_friendly_reshape(x_2d, x.shape)

    __getstate__ = no_pickling
# One-shot callable form of the Standardize transform.
standardize = stateful_transform(Standardize)
# R compatibility:
scale = standardize