"""Abstract class `Bouts` for Poisson mixture models
This module also provides useful functions for other modules subclassing
:class:`Bouts`.
"""
from abc import ABCMeta, abstractmethod
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
from skdiveMove.helpers import rle_key
def nls_fun(x, coefs):
    """Log-frequency function for a mixture of random Poisson processes

    This is a generalized form taking any number of Poisson processes.
    The value computed is::

        log(sum_i(a_i * lambda_i * exp(-lambda_i * x)))

    where the sum runs over the processes (columns of `coefs`).

    Parameters
    ----------
    x : array_like
        Independent data described by the function.  Scalars and arrays
        of any number of dimensions are accepted.
    coefs : array_like
        2-D array with coefficients ('a', 'lambda') in rows for each
        process of the model in columns.

    Returns
    -------
    out : ndarray
        Same shape as `x` with the evaluated function.

    """
    x = np.asarray(x)
    coefs = np.asarray(coefs)
    a, lda = coefs[0], coefs[1]
    # A trailing axis is appended to `x` so that it broadcasts against
    # the per-process coefficients.  The processes are then summed over
    # that last axis, which leaves exactly the shape of `x` regardless
    # of its dimension (summing over axis 1 only worked for 1-D input).
    terms = a * lda * np.exp(-lda * x[..., np.newaxis])
    return np.log(terms.sum(axis=-1))
def calc_p(coefs):
    """Derive the mixing proportions `p` from the `a` coefficients

    Each pair of adjacent processes (columns of `coefs`) yields one
    proportion, computed as ``a_first / (a_first + a_second)``.

    Parameters
    ----------
    coefs : pandas.DataFrame
        DataFrame with model coefficients in columns, and indexed by
        parameter names "a" and "lambda".

    Returns
    -------
    p : list
        Proportion parameters implied in `coefs`; one per pair of
        adjacent processes.
    lambdas : pandas.Series
        A series with the :math:`lambda` parameters from `coefs`.

    """
    proc_names = list(coefs.columns)
    mixing = []
    # Walk over adjacent (process, next process) column names
    for this_proc, next_proc in zip(proc_names[:-1], proc_names[1:]):
        a_this = coefs.loc["a", this_proc]
        a_next = coefs.loc["a", next_proc]
        mixing.append(a_this / (a_this + a_next))

    return mixing, coefs.loc["lambda"]
def ecdf(x, p, lambdas):
    """Cumulative frequency implied by a two- or three-process model

    Parameters
    ----------
    x : array_like
        Independent data array described by parameters `p` and `lambdas`.
    p : list
        List with mixing parameters of the model.
    lambdas : pandas.Series
        Series with the density parameters (:math:`lambda`) of the
        model.  Its length must be length(p) + 1.

    Returns
    -------
    out : array_like
        Same shape as `x` with the evaluated function.

    Raises
    ------
    KeyError
        If `lambdas` does not describe exactly two or three processes.

    """
    n_proc = lambdas.size
    # At least two processes are assumed, so the first one always
    # contributes
    p_first = p[0]
    cdf = 1 - p_first * np.exp(-lambdas.iloc[0] * x)

    if n_proc == 2:
        second = (1 - p_first) * np.exp(-lambdas.iloc[1] * x)
        cdf = cdf - second
    elif n_proc == 3:
        p_second = p[1]
        second = p_second * (1 - p_first) * np.exp(-lambdas.iloc[1] * x)
        third = ((1 - p_first) * (1 - p_second) *
                 np.exp(-lambdas.iloc[2] * x))
        cdf = cdf - second - third
    else:
        raise KeyError("Only mixtures of <= 3 processes are implemented")

    return cdf
def label_bouts(x, bec, as_diff=False):
    """Classify data into bouts based on bout ending criteria

    The data are binned with :func:`pandas.cut`, using the data minimum,
    the bout ending criteria, and the data maximum as bin edges.  The
    resulting categories are then passed to
    :func:`skdiveMove.helpers.rle_key` to label runs.

    Parameters
    ----------
    x : pandas.Series
        Series with data to classify according to `bec`.
    bec : array_like
        Array with bout-ending criteria.  It is assumed to be sorted.
    as_diff : bool, optional
        Whether to apply `diff` on `x` so it matches `bec`'s scale.

    Returns
    -------
    out : ndarray
        Integer array with the same shape as `x`.

    """
    # Missing first difference is treated as zero
    xx = x.diff().fillna(0) if as_diff else x.copy()
    # Bin edges: the criteria bracketed by the extremes of the data, so
    # that every observation falls into some bin
    brks = np.concatenate(([xx.min()], np.ravel(bec), [xx.max()]))
    xx_cat = pd.cut(xx, bins=brks, include_lowest=True)
    return rle_key(xx_cat)
class Bouts(metaclass=ABCMeta):
    """Abstract base class for models of log-transformed frequencies

    This is a base class for other classes to build on, and do the model
    fitting.  `Bouts` is an abstract base class to set up bout
    identification procedures.  Subclasses must implement `fit` and `bec`
    methods, or re-use the default NLS methods in `Bouts` (e.g. by
    delegating to them via ``super()``).

    Attributes
    ----------
    x : array_like
        1D array with input data.
    method : str
        Method used for calculating the histogram.
    lnfreq : pandas.DataFrame
        DataFrame with the centers of histogram bins, and corresponding
        log-frequencies of `x`.

    """

    def __init__(self, x, bw, method="standard"):
        """Histogram of log transformed frequencies of `x`

        Parameters
        ----------
        x : array_like
            1D array with data where bouts will be identified based on
            `method`.  It must provide ``min`` and ``max`` methods, and
            also ``diff`` when `method` is "seq_diff" (e.g. a
            :class:`pandas.Series`).
        bw : float
            Bin width for the histogram
        method : {"standard", "seq_diff"}, optional
            Method to use for calculating the frequencies: "standard"
            simply uses `x`, while "seq_diff" uses the sequential
            differences method.

        Raises
        ------
        ValueError
            If `method` is not one of the supported names.

        """
        if method == "standard":
            data = x
            lower = x.min()
        elif method == "seq_diff":
            # Absolute sequential differences; bins start at zero
            data = x.diff().abs()
            lower = 0
        else:
            msg = ("method must be 'standard' or 'seq_diff', got {!r}"
                   .format(method))
            raise ValueError(msg)

        self.x = x
        self.method = method

        upper = data.max()
        brks = np.arange(lower, upper, bw)
        # np.arange excludes the stop value, so one more edge is added
        # whenever the last one falls short of the maximum
        if brks[-1] < upper:
            brks = np.append(brks, brks[-1] + bw)
        h, edges = np.histogram(data, bins=brks)

        ctrs = edges[:-1] + np.diff(edges) / 2
        ok = h > 0
        ok_at = np.where(ok)[0] + 1  # 1-based indices
        # Count in each non-empty bin is divided by the number of bins
        # elapsed since the previous non-empty bin (the inserted 0 gives
        # the first non-empty bin its distance from the start)
        freq_adj = h[ok] / np.diff(np.insert(ok_at, 0, 0))
        self.lnfreq = pd.DataFrame({"x": ctrs[ok],
                                    "lnfreq": np.log(freq_adj)})

    def __str__(self):
        """Summarize class name, histogram method, and `lnfreq`"""
        method = self.method
        lnfreq = self.lnfreq
        objcls = ("Class {} object\n".format(self.__class__.__name__))
        meth_str = "{0:<20} {1}\n".format("histogram method: ", method)
        lnfreq_str = ("{0:<20}\n{1}"
                      .format("log-frequency histogram:",
                              lnfreq.describe()))
        return objcls + meth_str + lnfreq_str

    def init_pars(self, x_break, plot=True, ax=None, **kwargs):
        """Find starting values for mixtures of random Poisson processes

        Starting values are calculated using the "broken stick" method:
        a separate linear model of log-frequency on `x` is fit within
        each segment delimited by `x_break`.

        Parameters
        ----------
        x_break : array_like
            One- or two-element array with values determining the break(s)
            for broken stick model, such that x <= x_break[0] is first
            process, x > x_break[0] & x <= x_break[1] is second process,
            and x > x_break[1] is third one.
        plot : bool, optional
            Whether to plot the broken stick model.
        ax : matplotlib.Axes, optional
            An Axes instance to use as target.  Default is the current
            Axes (``plt.gca()``).
        **kwargs : optional keyword arguments
            Accepted for compatibility; currently not used by this
            method.

        Returns
        -------
        out : pandas.DataFrame
            DataFrame with coefficients ("a", "lambda") for each process.

        Raises
        ------
        IndexError
            If `x_break` has more than two elements.

        """
        nbreaks = len(x_break)
        if nbreaks > 2:
            msg = "x_break must be length <= 2"
            raise IndexError(msg)

        lnfreq = self.lnfreq
        ctrs = lnfreq["x"]
        xmin = ctrs.min()
        xmax = ctrs.max()

        # Segment edges: data range split at the requested breaks
        xbins = [xmin]
        xbins.extend(x_break)
        xbins.append(xmax)
        procf = pd.cut(ctrs, bins=xbins, right=True,
                       include_lowest=True)
        # observed=False is stated explicitly to keep one group per
        # segment irrespective of the pandas default for categoricals
        lnfreq_grp = lnfreq.groupby(procf, observed=False)

        # One straight line ("stick") per segment
        coefs_ll = []
        for name, grp in lnfreq_grp:
            stick = smf.ols("lnfreq ~ x", data=grp).fit()
            coefs_ll.append(stick.params.rename(name))
        coefs = pd.concat(coefs_ll, axis=1)

        def calculate_pars(p):
            """Poisson process parameters from linear model
            """
            lda = -p["x"]
            a = np.exp(p["Intercept"]) / lda
            return pd.Series({"a": a, "lambda": lda})

        pars = coefs.apply(calculate_pars)

        if plot:
            if ax is None:
                ax = plt.gca()
            freq_min = lnfreq["lnfreq"].min()
            freq_max = lnfreq["lnfreq"].max()
            for name, grp in lnfreq_grp:
                ax.scatter(x="x", y="lnfreq", data=grp, label=name)
                # Plot current "stick"
                coef_i = coefs[name]
                y_stick = coef_i["Intercept"] + ctrs * coef_i["x"]
                # Limit the "stick" line to min/max of data
                ok = (y_stick >= freq_min) & (y_stick <= freq_max)
                ax.plot(ctrs[ok], y_stick[ok], linestyle="--")
            x_pred = np.linspace(xmin, xmax, num=101)  # matches R's curve
            y_pred = nls_fun(x_pred, pars)
            ax.plot(x_pred, y_pred, alpha=0.5, label="model")
            ax.legend(loc="upper right")
            ax.set_xlabel("x")
            ax.set_ylabel("log frequency")

        return pars

    @abstractmethod
    def fit(self, start):
        """Fit Poisson mixture model to log frequencies

        Default is non-linear least squares method.

        Parameters
        ----------
        start : pandas.DataFrame
            DataFrame with coefficients for each process in columns.

        Returns
        -------
        coefs : pandas.DataFrame
            Coefficients of the model, with the same shape, index and
            columns as `start`.
        pcov : 2D array
            Covariance of coefs, as returned by
            :func:`scipy.optimize.curve_fit`.  Parameters are ordered
            process by process, i.e. all rows of the first column of
            `start`, then those of the second column, and so on.

        """
        lnfreq = self.lnfreq
        xdata = lnfreq["x"]
        ydata = lnfreq["lnfreq"]

        def _nls_fun(x, *args):
            """Wrapper to nls_fun to allow for array argument"""
            # curve_fit hands the parameters over as separate scalars;
            # restore the original 2-D shape.  order="F" undoes the
            # column-wise flattening done for the starting values.
            coefs = np.array(args).reshape(start.shape, order="F")
            return nls_fun(x, coefs)

        # Rearrange starting values into a 1D array (needs to be flat)
        init_flat = start.to_numpy().T.reshape((start.size,))
        popt, pcov = curve_fit(_nls_fun, xdata, ydata, p0=init_flat)
        # Reshape coefs back into init shape
        coefs = pd.DataFrame(popt.reshape(start.shape, order="F"),
                             columns=start.columns, index=start.index)

        return coefs, pcov

    @abstractmethod
    def bec(self, coefs):
        """Calculate bout ending criteria from model coefficients

        Implementing default as from NLS method.  One criterion is
        calculated for each pair of adjacent processes (columns).

        Parameters
        ----------
        coefs : pandas.DataFrame
            DataFrame with model coefficients in columns, and indexed by
            parameter names "a" and "lambda".

        Returns
        -------
        out : ndarray, shape (n,)
            1-D array with BECs implied by `coefs`.  Length is
            coefs.shape[1] - 1.

        """
        # Find bec's per process by pairing adjacent columns
        proc_names = list(coefs.columns)
        becs = []
        for procn1, procn2 in zip(proc_names[:-1], proc_names[1:]):
            a1 = coefs.loc["a", procn1]
            lambda1 = coefs.loc["lambda", procn1]
            a2 = coefs.loc["a", procn2]
            lambda2 = coefs.loc["lambda", procn2]
            bec = (np.log((a1 * lambda1) / (a2 * lambda2)) /
                   (lambda1 - lambda2))
            becs.append(bec)

        return np.array(becs)

    def plot_fit(self, coefs, ax=None):
        """Plot log frequency histogram and fitted model

        Parameters
        ----------
        coefs : pandas.DataFrame
            DataFrame with model coefficients in columns, and indexed by
            parameter names "a" and "lambda".
        ax : matplotlib.Axes instance
            An Axes instance to use as target.  Default is the current
            Axes (``plt.gca()``).

        Returns
        -------
        ax : `matplotlib.Axes`

        """
        lnfreq = self.lnfreq
        ctrs = lnfreq["x"]
        xmin = ctrs.min()
        xmax = ctrs.max()

        # BEC
        becx = self.bec(coefs)  # need an array for nls_fun
        becy = nls_fun(becx, coefs)

        x_pred = np.linspace(xmin, xmax, num=101)  # matches R's curve
        y_pred = nls_fun(x_pred, coefs)

        if ax is None:
            ax = plt.gca()
        # Plot data
        ax.scatter(x="x", y="lnfreq", data=lnfreq, label="histogram")
        # Plot predicted
        ax.plot(x_pred, y_pred, alpha=0.5, label="model")
        # Plot BEC (note this plots all BECs in becx)
        ylim = ax.get_ylim()
        ax.vlines(becx, ylim[0], becy, linestyle="--")
        # Annotations (only when there is a single BEC)
        if becx.size == 1:
            xcrd = becx[0]
            ax.annotate("bec = {0:.3f}".format(xcrd), (xcrd, ylim[0]),
                        xytext=(5, 0), textcoords="offset points")
        ax.legend(loc=8, bbox_to_anchor=(0.5, 1), frameon=False,
                  borderaxespad=0.1, ncol=2)
        ax.set_xlabel("x")
        ax.set_ylabel("log frequency")

        return ax