import os
import posixpath
import warnings
from itertools import groupby
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from scipy.special import gamma
from scipy.stats import cauchy, rankdata
from sklearn.utils import shuffle
from statsmodels.tsa.arima_process import ArmaProcess
def GenDataMean(N, n, cp, mu, sigma):
"""
The function generates the data for change in mean with Gaussian noise.
When "cp" is None, it generates the data without change point.
Parameters
----------
N : int
the sample size
n : int
the length of time series
cp : int
the change point; only one change point is accepted in this function.
mu : array_like
the means of the two segments (before and after the change point)
sigma : float
the standard deviation of the Gaussian distribution
Returns
-------
numpy array
2D array with size (N, n)
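Examples
--------
A minimal usage sketch; the parameter values are illustrative only:
>>> x = GenDataMean(5, 100, cp=50, mu=(0.0, 1.0), sigma=1.0)
>>> x.shape
(5, 100)
>>> x_null = GenDataMean(5, 100, cp=None, mu=(0.0, 0.0), sigma=1.0)
>>> x_null.shape
(5, 100)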
"""
if cp is None:
data = np.random.normal(mu[0], sigma, (N, n))
else:
data1 = np.random.normal(mu[0], sigma, (N, cp))
data2 = np.random.normal(mu[1], sigma, (N, n - cp))
data = np.concatenate((data1, data2), axis=1)
return data
def GenDataMeanAR(N, n, cp, mu, sigma, coef):
"""
The function generates the data for change in mean with AR(1) noise.
When "cp" is None, it generates the data without change point.
Parameters
----------
N : int
the sample size
n : int
the length of time series
cp : int
the change point; only one change point is accepted in this function.
mu : array_like
the means of the two segments (before and after the change point)
sigma : float
the standard deviation of the Gaussian innovations in the AR(1) noise
coef : float
the autoregressive coefficient of the AR(1) model
Returns
-------
numpy array
2D array with size (N, n)
"""
arparams = np.array([1, -coef])
maparams = np.array([1])
ar_process = ArmaProcess(arparams, maparams)
if cp is None:
data = mu[0] + np.transpose(
ar_process.generate_sample(nsample=(n, N), scale=sigma)
)
else:
noise = ar_process.generate_sample(nsample=(n, N), scale=sigma)
signal = np.repeat(mu, (cp, n - cp))
data = np.transpose(noise) + signal
return data
def GenDataMeanARH(N, n, cp, mu, coef, scale):
"""
The function generates the data for change in mean with AR(1) Cauchy noise,
whose innovations have location parameter 0 and scale parameter 'scale'.
When "cp" is None, it generates the data without a change point.
Parameters
----------
N : int
the sample size
n : int
the length of time series
cp : int
the change point; only one change point is accepted in this function.
mu : array_like
the means of the two segments (before and after the change point)
coef : float
the autoregressive coefficient of the AR(1) model
scale : float
the scale parameter of the Cauchy distribution
Returns
-------
numpy array
2D array with size (N, n)
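Examples
--------
A minimal sketch with illustrative values; the heavy-tailed noise makes
individual draws extreme, but the returned shape is always (N, n):
>>> x = GenDataMeanARH(5, 100, cp=50, mu=(0.0, 2.0), coef=0.3, scale=0.1)
>>> x.shape
(5, 100)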
"""
# initialize; use a burn-in of 30 observations so the AR(1) noise is close to stationarity
n1 = n + 30
x_0 = np.ones((N,), dtype=np.float64)
eps_mat = cauchy.rvs(loc=0, scale=scale, size=(N, n1))
noise_mat = np.empty((N, n1))
for i in range(n1):
x_0 = coef * x_0 + eps_mat[:, i]
noise_mat[:, i] = x_0
if cp is None:
data = mu[0] + noise_mat[:, -n:]
else:
signal = np.repeat(mu, (cp, n - cp))
data = signal + noise_mat[:, -n:]
return data
def GenDataMeanARrho(N, n, cp, mu, sigma):
"""
The function generates the data for change in mean with AR(1) noise. The
autoregressive coefficient is drawn from the standard uniform distribution.
When "cp" is None, it generates the data without a change point.
Parameters
----------
N : int
the sample size
n : int
the length of time series
cp : int
the change point; only one change point is accepted in this function.
mu : array_like
the means of the two segments (before and after the change point)
sigma : float
the standard deviation of the Gaussian innovations
Returns
-------
numpy array
2D array with size (N, n)
"""
# initialize
n1 = n + 30
x_0 = np.ones((N,), dtype=np.float64)
rho = np.random.uniform(0, 1, (N, n1))
eps_mat = np.random.normal(0, sigma, size=(N, n1))
noise_mat = np.empty((N, n1))
for i in range(n1):
x_0 = np.multiply(rho[:, i], x_0) + eps_mat[:, i]
noise_mat[:, i] = x_0
if cp is None:
data = mu[0] + noise_mat[:, -n:]
else:
signal = np.repeat(mu, (cp, n - cp))
data = signal + noise_mat[:, -n:]
return data
def GenDataVariance(N, n, cp, mu, sigma):
"""
The function generates the data for change in variance with piecewise
constant signal. When "cp" is None, it generates the data without a change
point in variance.
Parameters
----------
N : int
the sample size
n : int
the length of time series
cp : int
the change point; only one change point is accepted in this function.
mu : float
the constant mean of the time series
sigma : array_like
the standard deviations of the two segments (before and after the change point)
Returns
-------
numpy array
2D array with size (N, n)
"""
if cp is None:
data = np.random.normal(mu, sigma[0], (N, n))
else:
data1 = np.random.normal(mu, sigma[0], (N, cp))
data2 = np.random.normal(mu, sigma[1], (N, n - cp))
data = np.concatenate((data1, data2), axis=1)
return data
def GenDataSlope(N, n, cp, slopes, sigma, start):
"""
The function generates the data for change in slope with Gaussian noise.
When "cp" is None, it generates the data without change point in slope.
Parameters
----------
N : int
the sample size
n : int
the length of time series
cp : int
the change point; only one change point is accepted in this function.
slopes : array_like
the slopes of the two segments (before and after the change point)
sigma : float
the standard deviation of the Gaussian noise
start : float
the y-intercept of the linear model
Returns
-------
numpy array
2D array with size (N, n)
"""
if cp is None:
y = start + slopes[0] * np.arange(n)
data = np.tile(y, (N, 1)) # repeat the row N times
else:
y1 = start + slopes[0] * np.arange(cp)
y2 = y1[-1] + slopes[1] * np.arange(1, n - cp + 1)
y = np.concatenate((y1, y2))
data = np.tile(y, (N, 1))
return data + np.random.normal(0, sigma, (N, n))
def Standardize(data):
"""
Data standardization
Parameters
----------
data : numpy array
the data set with size (N, ..., n)
Returns
-------
numpy array
the standardized data; each time series has mean 0 and standard deviation 1
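Examples
--------
A small sketch; each row (series) is standardized separately:
>>> x = np.arange(12, dtype=float).reshape(3, 4)
>>> z = Standardize(x)
>>> np.allclose(z.mean(axis=1), 0.0) and np.allclose(z.std(axis=1), 1.0)
True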
"""
data = data.transpose()
data = (data - data.mean(axis=0)) / data.std(axis=0)
return data.transpose()
def labelTransition(data, label, ind, length, size, num_trim=100):
"""get the transition labels, change-points and time series from one subject
Parameters
----------
data : DataFrame
the time series.
label : DataFrame
the states of the subject
ind : int
the index of the transition; the change occurs between state 'ind' and state 'ind + 1'
length : int
the length of extracted time series
size : int
the sample size
num_trim : int, optional
the number of observations to be trimmed before and after the change-point, by default 100
Returns
-------
dictionary
cp: the change-points; ts: time series; label: the transition labels.
"""
s = label["start"][ind : ind + 2]
e = label["end"][ind : ind + 2]
state = label["state"][ind : ind + 2]
new_label = state[ind] + "->" + state[ind + 1]
logical0 = (data["time"] >= s[ind]) & (data["time"] <= e[ind])
logical1 = (data["time"] >= s[ind + 1]) & (data["time"] <= e[ind + 1])
data_trim = data[logical0 | logical1]
len0 = sum(logical0)
len1 = sum(logical1)
ts_final = np.zeros((size, length, 3))
label_final = [new_label] * size
cp_final = np.zeros((size,))
result = extract(len0, len1, length, size, ntrim=num_trim)
cp_final = result["cp"].astype("int32")
sample = result["sample"].astype("int32")
for i in range(size):
ts_temp = data_trim.iloc[sample[i, :], 1:4]
ts_final[i, :, :] = ts_temp.to_numpy()
return {"cp": cp_final, "ts": ts_final, "label": label_final}
def labelSubject(subject_path, length, size, num_trim=100):
"""
Obtain the transition labels, change-points and time series from one subject.
Parameters
----------
subject_path : string
the path of subject data
length : int
the length of extracted time series
size : int
the sample size
num_trim : int, optional
the number of observations to be trimmed before and after the change-point, by default 100
Returns
-------
dictionary
cp: the change-points; ts: time series; label: the transition labels.
"""
# get the csv files
all_files = os.listdir(subject_path)
csv_files = list(filter(lambda f: f.endswith(".csv"), all_files))
csv_files = list(filter(lambda f: f.startswith("HASC"), csv_files))
for ind, fname in enumerate(csv_files):
print(ind)
print(fname)
fname_label = fname.replace("-acc.csv", ".label")
fname_label = posixpath.join(subject_path, fname_label)
fname = posixpath.join(subject_path, fname)
# load the labels
label_dataset = pd.read_csv(
fname_label, comment="#", delimiter=",", names=["start", "end", "state"]
)
num_consecutive_states = label_dataset.shape[0]
# load the dataset
dataset = pd.read_csv(
fname, comment="#", delimiter=",", names=["time", "x", "y", "z"]
)
if num_consecutive_states < 2:
warnings.warn(
"Fewer than two consecutive states were found in the label file, so no transition can be extracted from this recording.",
UserWarning,
)
for i in range(num_consecutive_states - 1):
result = labelTransition(
dataset, label_dataset, i, length, size, num_trim=num_trim
)
if i == 0:
ts = result["ts"]
label = result["label"]
cp = result["cp"]
else:
ts = np.concatenate([ts, result["ts"]], axis=0)
cp = np.concatenate([cp, result["cp"]])
label += result["label"]
if ind == 0:
ts_ind = ts
label_ind = label
cp_ind = cp
else:
ts_ind = np.concatenate([ts_ind, ts], axis=0)
cp_ind = np.concatenate([cp_ind, cp])
label_ind += label
return {"cp": cp_ind, "ts": ts_ind, "label": label_ind}
# functions for extracting null time series
def DataGenAlternative(
N_sub,
B,
mu_L,
n,
B_bound,
ARcoef=0.0,
tau_bound=2,
ar_model="Gaussian",
scale=0.1,
sigma=1.0,
):
"""
This function generates the simulation data from the alternative model of change in mean.
Parameters
----------
N_sub : int
The sample size of simulation data.
B : float
The signal-to-noise ratio of the parameter space.
mu_L : float
The signal at the left of the change point.
n : int
The length of time series.
B_bound : list
The lower and upper bound scalars of the signal-to-noise ratio.
ARcoef : float, optional
The autoregressive parameter of AR(1) model, by default 0.0
tau_bound : int, optional
The lower bound of change point, by default 2
ar_model : str, optional
The noise model, by default 'Gaussian' (i.i.d. Gaussian noise).
ar_model="AR0" means AR(1) Gaussian noise with autoregressive
coefficient 'ARcoef'; ar_model="ARH" means AR(1) noise with Cauchy
innovations of scale parameter 'scale'; ar_model="ARrho" means AR(1)
noise with a random autoregressive coefficient drawn from U(0, 1) and
Gaussian innovations with standard deviation 'sigma'.
scale : float, optional
The scale parameter of Cauchy distribution, by default 0.1
sigma : float, optional
The standard deviation of the normal distribution, by default 1.0
Returns
-------
dict
data: size (N_sub, n), the simulated time series;
tau_alt: size (N_sub,), the change points;
mu_R_alt: size (N_sub,), the signal at the right of the change point.
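Examples
--------
A minimal sketch; B and B_bound below are illustrative, not the values
used in the paper:
>>> out = DataGenAlternative(N_sub=4, B=0.35, mu_L=0.0, n=100,
...                          B_bound=[0.25, 1.75], tau_bound=2,
...                          ar_model="Gaussian")
>>> out["data"].shape
(4, 100)
>>> out["tau_alt"].shape
(4,)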
"""
tau_all = np.random.randint(low=tau_bound, high=n - tau_bound, size=N_sub)
eta_all = tau_all / n
mu_R_abs_lower = B / np.sqrt(eta_all * (1 - eta_all))
# max_mu_R = np.max(mu_R_abs_lower)
sign_all = np.random.choice([-1, 1], size=N_sub)
mu_R_all = np.zeros((N_sub,))
data = np.zeros((N_sub, n))
for i in range(N_sub):
mu_R = mu_L - sign_all[i] * np.random.uniform(
low=B_bound[0] * mu_R_abs_lower[i],
high=B_bound[1] * mu_R_abs_lower[i],
size=1,
)
mu_R_all[i] = mu_R[0]
mu = np.array([mu_L, mu_R[0]], dtype=np.float32)
if ar_model == "Gaussian":
data[i, :] = GenDataMean(1, n, cp=tau_all[i], mu=mu, sigma=1)
elif ar_model == "AR0":
data[i, :] = GenDataMeanAR(1, n, cp=tau_all[i], mu=mu, sigma=1, coef=ARcoef)
elif ar_model == "ARH":
data[i, :] = GenDataMeanARH(
1, n, cp=tau_all[i], mu=mu, coef=ARcoef, scale=scale
)
elif ar_model == "ARrho":
data[i, :] = GenDataMeanARrho(1, n, cp=tau_all[i], mu=mu, sigma=sigma)
return {"data": data, "tau_alt": tau_all, "mu_R_alt": mu_R_all}
def ComputeCUSUM(x):
"""
Compute the CUSUM statistics with O(n) time complexity
Parameters
----------
x : vector
the time series
Returns
-------
vector
a: the CUSUM statistics vector.
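Examples
--------
A check against the direct O(n^2) definition of the CUSUM statistics:
>>> x = np.array([0.5, 1.2, -0.3, 4.1, 3.8, 5.0])
>>> n = len(x)
>>> naive = np.array([np.sqrt(k * (n - k) / n) * (x[:k].mean() - x[k:].mean())
...                   for k in range(1, n)])
>>> np.allclose(ComputeCUSUM(x), naive)
True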
"""
n = len(x)
mean_left = x[0]
mean_right = np.mean(x[1:])
a = np.repeat(0.0, n - 1)
a[0] = np.sqrt((n - 1) / n) * (mean_left - mean_right)
for i in range(1, n - 1):
# update the running means of the left and right segments in O(1)
mean_left = mean_left + (x[i] - mean_left) / (i + 1)
mean_right = mean_right + (mean_right - x[i]) / (n - i - 1)
a[i] = np.sqrt((n - i - 1) * (i + 1) / n) * (mean_left - mean_right)
return a
def MaxCUSUM(x):
"""
Return the maximum of the absolute CUSUM statistics.
Parameters
----------
x : vector
the time series
Returns
-------
scalar
the maximum of the absolute CUSUM statistics
"""
y = np.abs(ComputeCUSUM(x))
return np.max(y)
def ComputeMeanVarNorm(x, minseglen=2):
"""
Compute the likelihood-ratio statistic for a change in variance. Rewritten
from the R function single.var.norm.calc() in the package changepoint.
Parameters
----------
x : numpy array
the time series
minseglen : int, optional
the minimum segment length, by default 2
Returns
-------
scalar
the likelihood ratio
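Examples
--------
A minimal sketch on a series with a variance change at the midpoint:
>>> np.random.seed(1)
>>> x = np.concatenate([np.random.normal(0, 1, 50), np.random.normal(0, 3, 50)])
>>> float(ComputeMeanVarNorm(x)) > 0
True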
"""
n = len(x)
y = np.cumsum(x)
y2 = np.cumsum(x**2)
y = np.insert(y, 0, 0)
y2 = np.insert(y2, 0, 0)
null = n * np.log((y2[n] - y[n] ** 2 / n) / n)
taustar = np.arange(minseglen, n - minseglen + 2)
sigma1 = (y2[taustar] - y[taustar] ** 2 / taustar) / (taustar)
neg = sigma1 <= 0
sigma1[neg] = 1e-10
sigman = ((y2[n] - y2[taustar]) - (y[n] - y[taustar]) ** 2 / (n - taustar)) / (
n - taustar
)
neg = sigman <= 0
sigman[neg] = 1e-10
tmp = null - taustar * np.log(sigma1) - (n - taustar) * np.log(sigman)
return np.sqrt(np.max(tmp))
def get_wilcoxon_test(x):
"""Compute the Wilcoxon statistics
Parameters
----------
x : array
the time series
Returns
-------
scalar
the maximum Wilcoxon statistics
"""
y = wilcoxon(x) / np.sqrt(get_asyvar_window(x))
return np.max(y)
def wilcoxon(x):
"""This function implements the Wilcoxon cumulative sum statistic (Dehling
et al, 2013, Eq (20)) for nonparametric change point detection.
The following code is translated from the C function "wilcoxsukz" in R
package "robts". The accuracy of this function is already been tested.
Parameters
----------
x : array
time series
Returns
-------
1D array
the test statistic for each potential change point.
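Examples
--------
A small sketch; the statistic peaks at the true split of the series:
>>> tn = wilcoxon(np.array([1.0, 2.0, 0.5, 7.0, 8.0, 9.0]))
>>> len(tn)
5
>>> int(np.argmax(tn)) + 1
3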
"""
n = len(x)
tn = np.repeat(0.0, n - 1)
for k in range(1, n):
tn_temp = 0
for i in range(0, k):
tn_temp = tn_temp + np.sum(x[k:n] > x[i]) - (n - k) / 2.0
tn[k - 1] = tn_temp * np.sqrt(k * (n - k)) / n * 2
return np.abs(tn / n ** (3 / 2))
def get_asyvar_window(x, momentp=1):
"""This function computes the asymptotic variance of long run dependence
time series using "window" method. This function is translated from the R
function "asymvar.window". This function is already been tested by letting
"overlapping=F","obs="ranks".
Parameters
----------
x : 1D array
The time series
momentp : int, optional
which centred mean should be used, see Peligrad and Shao (1995) for
details, by default 1
Returns
-------
scalar
The asymptotic variance of time series.
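Examples
--------
A minimal sketch on white noise (illustrative only):
>>> np.random.seed(0)
>>> v = get_asyvar_window(np.random.normal(0, 1, 200))
>>> float(v) > 0
True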
"""
n = len(x)
x = rankdata(x) / n
l = np.int32(np.round((3 * n) ** (1 / 3) + 1))
phibar = np.mean(x)
k = np.int32(n // l)
xma = np.reshape(x[0 : (k * l)], (k, l), order="c")
s = np.sum(xma, axis=1)
s = (np.abs(s - l * phibar) / np.sqrt(l)) ** momentp
cp = 2 ** (-momentp / 2) * np.sqrt(np.pi) / gamma((momentp + 1) / 2)
er = np.sum(s) / k * cp
return er ** (2 / momentp)
def DataGenScenarios(scenario, N, B, mu_L, n, B_bound, rho, tau_bound):
"""This function generates the data based on Scenarios 1, a and 3 in "Automatic Change-point Detection in Time Series via Deep Learning" (Jie et al. ,2023)
Parameters
----------
scenario : string
the scenario label: 'A0' is Scenario 1 with rho=0, 'A07' is Scenario 1 with rho=0.7, 'C' is Scenario 2, and 'D' is Scenario 3 with heavy-tailed noise.
N : int
the sample size
B : float
The signal-to-noise ratio of parameter space.
mu_L : float
The signal at the left of the change point.
n : int
The length of time series.
B_bound : list
The lower and upper bound scalars of the signal-to-noise ratio.
rho : scalar
the autoregressive coefficient of the AR(1) model
tau_bound : int
The lower bound of the change point.
Returns
-------
tuple
data_all: the time series array, size (2*N, n); y_all: the label array, size (2*N, 1).
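Examples
--------
A minimal sketch for Scenario 1 with rho=0; the values are illustrative:
>>> data_all, y_all = DataGenScenarios("A0", N=10, B=0.5, mu_L=0.0, n=100,
...                                    B_bound=[0.25, 1.75], rho=0.0, tau_bound=2)
>>> data_all.shape, y_all.shape
((20, 100), (20, 1))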
"""
np.random.seed(2022) # numpy seed fixing
if scenario == "A0":
result = DataGenAlternative(
N_sub=N,
B=B,
mu_L=mu_L,
n=n,
B_bound=B_bound,
ARcoef=rho,
tau_bound=tau_bound,
ar_model="Gaussian",
)
data_alt = result["data"]
# generate dataset for null hypothesis
data_null = GenDataMean(N, n, cp=None, mu=(mu_L, mu_L), sigma=1)
data_all = np.concatenate((data_alt, data_null), axis=0)
y_all = np.repeat((1, 0), N).reshape((2 * N, 1))
# generate the training dataset and test dataset
data_all, y_all = shuffle(data_all, y_all, random_state=42)
elif scenario == "A07":
rho = 0.7
result = DataGenAlternative(
N_sub=N,
B=B,
mu_L=mu_L,
n=n,
B_bound=B_bound,
ARcoef=rho,
tau_bound=tau_bound,
ar_model="AR0",
)
data_alt = result["data"]
# generate dataset for null hypothesis
data_null = GenDataMeanAR(N, n, cp=None, mu=(mu_L, mu_L), sigma=1, coef=rho)
data_all = np.concatenate((data_alt, data_null), axis=0)
y_all = np.repeat((1, 0), N).reshape((2 * N, 1))
# generate the training dataset and test dataset
data_all, y_all = shuffle(data_all, y_all, random_state=42)
elif scenario == "C":
scale = 0.3
result = DataGenAlternative(
N_sub=N,
B=B,
mu_L=mu_L,
n=n,
B_bound=B_bound,
ARcoef=rho,
tau_bound=tau_bound,
ar_model="ARH",
scale=scale,
)
data_alt = result["data"]
# generate dataset for null hypothesis
data_null = GenDataMeanARH(
N, n, cp=None, mu=(mu_L, mu_L), coef=rho, scale=scale
)
data_all = np.concatenate((data_alt, data_null), axis=0)
y_all = np.repeat((1, 0), N).reshape((2 * N, 1))
# generate the training dataset and test dataset
data_all, y_all = shuffle(data_all, y_all, random_state=42)
elif scenario == "D":
sigma = np.sqrt(2)
result = DataGenAlternative(
N_sub=N,
B=B,
mu_L=mu_L,
n=n,
B_bound=B_bound,
tau_bound=tau_bound,
ar_model="ARrho",
sigma=sigma,
)
data_alt = result["data"]
# generate dataset for null hypothesis
data_null = GenDataMeanARrho(N, n, cp=None, mu=(mu_L, mu_L), sigma=sigma)
data_all = np.concatenate((data_alt, data_null), axis=0)
y_all = np.repeat((1, 0), N).reshape((2 * N, 1))
# generate the training dataset and test dataset
data_all, y_all = shuffle(data_all, y_all, random_state=42)
return data_all, y_all
def GenerateARAll(N, n, coef_left, coef_right, sigma, tau_bound):
"""
This function generates 2*N AR(1) time series: N without a change and N with a change in the autoregressive coefficient.
Parameters
----------
N : integer
The number of time series in each class (with / without change)
n : integer
The length of each time series
coef_left : float
The AR coefficient before the change-point
coef_right : float
The AR coefficient after the change-point
sigma : float
The standard deviation of noise
tau_bound : integer
The minimum distance between the change-point and the boundaries of the series
Returns
-------
tuple
dataset with size (2*N, n) and the N change-points
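Examples
--------
A minimal sketch with illustrative coefficients:
>>> data, tau = GenerateARAll(N=5, n=100, coef_left=0.3, coef_right=0.9,
...                           sigma=1.0, tau_bound=10)
>>> data.shape, tau.shape
((10, 100), (5,))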
"""
tau = np.random.randint(low=tau_bound, high=n - tau_bound, size=N)
data = np.zeros((2 * N, n))
for i in range(N):
data[i, :] = GenerateAR(n, coef_left, coef_right, None, sigma)
data[i + N, :] = GenerateAR(n, coef_left, coef_right, tau[i], sigma)
return data, tau
def GenerateAR(n, coef_left, coef_right, tau, sigma):
"""This function generates the signal of AR(1) model
Parameters
----------
n : integer
The length of time series
coef_left : float
The AR coefficient before the change-point
coef_right : float
The AR coefficient after the change-point
tau : integer or None
The location of the change-point; if None, no change occurs
sigma : float
The standard deviation of noise
Returns
-------
array
The time series with length n.
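Examples
--------
A minimal sketch; the coefficients are illustrative:
>>> x = GenerateAR(200, coef_left=0.3, coef_right=0.9, tau=100, sigma=1.0)
>>> x.shape
(200,)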
"""
x = np.zeros((n + 1,))
if tau is None:
for i in range(n):
x[i + 1] = coef_left * x[i] + np.random.normal(0, sigma)
else:
for i in range(n):
if i < tau:
x[i + 1] = coef_left * x[i] + np.random.normal(0, sigma)
else:
x[i + 1] = coef_right * x[i] + np.random.normal(0, sigma)
return x[1:]
def get_cusum_location(x):
"""
This function returns the estimate of the change-point location based on CUSUM.
Parameters
----------
x : numpy array
The time series
Returns
-------
int
change-point location
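Examples
--------
On a noiseless step signal the CUSUM peak sits exactly at the change point:
>>> x = np.concatenate([np.zeros(50), np.ones(50)])
>>> int(get_cusum_location(x))
50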
"""
y = np.abs(ComputeCUSUM(x))
return np.argmax(y) + 1
def ComputeMosum(x, G):
"""
Compute the MOSUM statistic, rewritten from the mosum.stat() function in
the R package mosum.
Parameters
----------
x : numpy array
The time series
G : scalar
the width of moving window
Returns
-------
int
the location of the maximum MOSUM statistic
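Examples
--------
A sketch on a noisy step signal; with this seed the estimated location
lands near the true change point at 60:
>>> np.random.seed(42)
>>> x = np.concatenate([np.zeros(60), np.ones(60) * 3]) + np.random.normal(0, 1, 120)
>>> loc = ComputeMosum(x, G=20)
>>> bool(40 <= loc <= 80)
True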
"""
n = len(x)
G = int(G)
sums_left = np.convolve(x, np.ones((G,)), "valid")
sums_right = sums_left
unscaled0 = np.repeat(np.nan, G - 1)
unscaled2 = np.repeat(np.nan, G)
unscaled1 = (sums_right[G:] - sums_left[0:-G]) / np.sqrt(2 * G)
unscaledStatistic = np.concatenate((unscaled0, unscaled1, unscaled2))
# MOSUM-based variance estimators
summedSquares_left = np.convolve(x**2, np.ones((G,)), "valid")
squaredSums_left = sums_left**2
var_tmp_left = summedSquares_left - 1 / G * squaredSums_left
var_left = np.concatenate((unscaled0, var_tmp_left)) / G
var_tmp_right = var_tmp_left
var_right = np.concatenate((var_tmp_right[1:], unscaled2)) / G
var = (var_right + var_left) / 2
weight_left = np.sqrt(2 * G / np.arange(1, G + 1) / np.arange(2 * G - 1, G - 1, -1))
unscaledStatistic[0:G] = np.cumsum(np.mean(x[0 : 2 * G]) - x[0:G]) * weight_left
var[0:G] = var[G - 1]
weight_right = np.sqrt(2 * G / np.arange(G - 1, 0, -1) / np.arange(G + 1, 2 * G))
x_rev = x[-2 * G :]
unscaledStatistic[n - G : n - 1] = (
np.cumsum(np.mean(x_rev) - x_rev)[-G:-1] * weight_right
)
unscaledStatistic[n - 1] = 0
var[-G:] = var[n - G - 1]
res = np.abs(unscaledStatistic) / np.sqrt(var)
return np.argmax(res) + 1
def get_loc_3(model, x_test, n, width):
"""
This function obtains change-point location estimates from three methods:
plain MOSUM of the NN-predicted labels, and double MOSUM based on the
predicted labels and on the predicted probabilities.
Parameters
----------
model : model
The trained model
x_test : vector
The vector of time series
n : int
The length of x_test
width : int
The width of the second moving window.
Returns
-------
array
3 locations.
"""
pred, prob = get_label(model, x_test, n)
loc_nn = get_mosum_loc_nn(pred, n)
loc_label = get_mosum_loc_double(pred, n, width, False)
loc_prob = get_mosum_loc_double(prob, n, width, True)
return np.array([loc_nn, loc_label, loc_prob])
def get_label(model, x_test, n):
"""
This function gets the predicted labels for the testing time series x_test.
Parameters
----------
model : tensorflow model
The trained tensorflow model
x_test : vector
The vector of time series
n : int
The width of moving window
Returns
-------
arrays
two arrays, one is predicted label, the other is probabilities.
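Examples
--------
A sketch with a hypothetical (untrained) classifier standing in for the
trained model; skipped under doctest:
>>> model = tf.keras.Sequential([tf.keras.layers.Dense(2)])  # doctest: +SKIP
>>> x_test = np.random.normal(0, 1, 400)  # doctest: +SKIP
>>> pred, prob = get_label(model, x_test, n=100)  # doctest: +SKIP
>>> pred.shape  # doctest: +SKIP
(301,)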
"""
num_sliding_windows = len(x_test) - n + 1
x_test_mat = np.zeros((num_sliding_windows, n))
for i in range(num_sliding_windows):
x_test_mat[i, :] = x_test[i : i + n]
y_prob = model.predict(x_test_mat)
y_pred = np.argmax(y_prob, axis=1)
return y_pred, y_prob
def get_mosum_loc_nn(pred, n):
"""
This function returns the estimate of the change-point location based on MOSUM using the NN-predicted labels.
Parameters
----------
pred : vector
The vector of predicted labels
n : int
The width of moving window
Returns
-------
int
change-point location
"""
num_each_group = []
state = []
# get the states of consecutive groups and their length.
for _, group in groupby(pred):
g = list(group)
state.append(g[0])
num_each_group.append(len(g))
state = np.asarray(state, dtype=np.int32)
num_each_group = np.asarray(num_each_group, dtype=np.int32)
# get the interval of longest consecutive 1 (alarm)
index = state == 1
num_group = len(num_each_group)
aind = np.arange(num_group)[index]
max_ind = np.argmax(num_each_group[index])
interval = np.cumsum(num_each_group)
a = interval[aind[max_ind] - 1] + 1
b = interval[aind[max_ind]]
return int(np.round((a + b + n) / 2))
def get_mosum_loc_double(x, n, width, use_prob):
"""
This function returns the estimate of the change-point location based on MOSUM with a second moving average.
Parameters
----------
x : array
either the predicted labels or the predicted probabilities
n : int
The width of the moving window
width : int
The width of the second moving window
use_prob : bool
if True, treat x as predicted probabilities; otherwise as predicted labels
Returns
-------
int
change-point location
"""
if use_prob:
ma = np.convolve(x[:, 1], np.ones(width) / width, mode="valid")
else:
ma = np.convolve(x, np.ones(width) / width, mode="valid")
return np.argmax(ma) + int(np.round((width + n) / 2))
def DataSetGen(N_sub, n, mean_arg, var_arg, slope_arg, n_trim, seed=2022):
"""
This function generates the simulation dataset for change in mean, change in variance and change in non-zero slope. For more details, see Table S1 in the supplement of "Automatic Change-point Detection in Time Series via Deep Learning" (Li et al., 2023).
Parameters
----------
N_sub : int
the sample size of each class
n : int
the length of time series
mean_arg : array
the hyperparameters for generating data of change in mean and null
var_arg : array
the hyperparameters for generating data of change in variance and null
slope_arg : array
the hyperparameters for generating data of change in slope and null
n_trim : int
the trim size
seed : int, optional
the random seed, by default 2022
Returns
-------
dictionary
the simulation data and corresponding changes
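Examples
--------
A minimal sketch; the five-element hyperparameter arrays below are
illustrative only (see the Parameters description for the meaning of
each entry):
>>> res = DataSetGen(N_sub=10, n=100,
...                  mean_arg=np.array([1.0, 2.0, -2.0, 1.5, 0.5]),
...                  var_arg=np.array([0.0, 2.0, 1.0, 0.8, 0.3]),
...                  slope_arg=np.array([1.0, 0.5, -0.5, 0.4, 0.1]),
...                  n_trim=10)
>>> res["data_x"].shape
(50, 100)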
"""
np.random.seed(seed) # numpy seed fixing
sigma_1 = mean_arg[0] # the standard deviation of noise
upper_bound_mean = mean_arg[1]
lower_bound_mean = mean_arg[2]
upper_bound_abs_mean_difference = mean_arg[3]
lower_bound_abs_mean_difference = mean_arg[4]
mu_1all = np.zeros((N_sub, 2))
i = 0
while i < N_sub:
temp = np.random.uniform(lower_bound_mean, upper_bound_mean, 2)
abs_diff = np.abs(temp[1] - temp[0])
if (
abs_diff >= lower_bound_abs_mean_difference
and abs_diff <= upper_bound_abs_mean_difference
):
mu_1all[i, :] = temp
i = i + 1
mu_0all = np.column_stack((mu_1all[:, 0], np.repeat(0, N_sub)))
mu_para = np.concatenate((mu_0all, mu_1all))
n_range = np.arange(n_trim + 1, n - n_trim + 1)
cp_mean = np.random.choice(n_range, (N_sub,))
x_mean_0 = np.zeros((N_sub, n))
x_mean_1 = np.zeros((N_sub, n))
for i in range(N_sub):
mu_1 = mu_1all[i, :]
x_mean_0[i, :] = GenDataMean(1, n, cp=None, mu=mu_1, sigma=sigma_1)
x_mean_1[i, :] = GenDataMean(1, n, cp=cp_mean[i], mu=mu_1, sigma=sigma_1)
# one can manually change the parameters below to control the signal to noise ratio
mu_2 = var_arg[0]
upper_bound_std = var_arg[1]
lower_bound_std = var_arg[2]
upper_bound_abs_std_difference = var_arg[3]
lower_bound_abs_std_difference = var_arg[4]
sigma_2all = np.zeros((N_sub, 2))
i = 0
while i < N_sub:
temp = np.random.uniform(lower_bound_std, upper_bound_std, 2)
abs_diff = np.abs(temp[1] - temp[0])
if (
abs_diff >= lower_bound_abs_std_difference
and abs_diff <= upper_bound_abs_std_difference
):
sigma_2all[i, :] = temp
i = i + 1
sigma_0all = np.column_stack((sigma_2all[:, 0], np.repeat(0, N_sub)))
sigma_para = np.concatenate((sigma_0all, sigma_2all))
cp_var = np.random.choice(n_range, (N_sub,))
x_var_0 = np.zeros((N_sub, n))
x_var_1 = np.zeros((N_sub, n))
for i in range(N_sub):
sigma_2 = sigma_2all[i, :]
x_var_0[i, :] = GenDataVariance(1, n, cp=None, mu=mu_2, sigma=sigma_2)
x_var_1[i, :] = GenDataVariance(1, n, cp=cp_var[i], mu=mu_2, sigma=sigma_2)
# one can manually change the parameters below to control the signal to noise ratio
sigma_2 = slope_arg[0]
upper_bound_slope = slope_arg[1]
lower_bound_slope = slope_arg[2]
upper_bound_abs_slope_difference = slope_arg[3]
lower_bound_abs_slope_difference = slope_arg[4]
slopes_all = np.zeros((N_sub, 2))
i = 0
while i < N_sub:
temp = np.random.uniform(lower_bound_slope, upper_bound_slope, 2)
abs_diff = np.abs(temp[1] - temp[0])
if (
abs_diff >= lower_bound_abs_slope_difference
and abs_diff <= upper_bound_abs_slope_difference
):
slopes_all[i, :] = temp
i = i + 1
slopes_0all = np.column_stack((slopes_all[:, 0], np.repeat(0, N_sub)))
slopes_para = np.concatenate((slopes_0all, slopes_all))
cp_slope = np.random.choice(n_range, (N_sub,))
x_slope_0 = np.zeros((N_sub, n))
x_slope_1 = np.zeros((N_sub, n))
for i in range(N_sub):
slopes = slopes_all[i, :]
x_slope_0[i, :] = GenDataSlope(
1, n, cp=None, slopes=slopes, sigma=sigma_2, start=0
)
x_slope_1[i, :] = GenDataSlope(
1, n, cp=cp_slope[i], slopes=slopes, sigma=sigma_2, start=0
)
# concatenating
data_x = np.concatenate((x_mean_0, x_mean_1, x_var_1, x_slope_0, x_slope_1), axis=0)
return {
"data_x": data_x,
"cp_mean": cp_mean,
"cp_var": cp_var,
"cp_slope": cp_slope,
"mu_para": mu_para,
"sigma_para": sigma_para,
"slopes_para": slopes_para,
}
def seqPlot(sequences_list, cp_list, label_list, y_pos=0.93):
"""
This function plots the sequence given change-points and label list.
Parameters
----------
sequences_list : DataFrame
the time series
cp_list : list
the list of change-points
label_list : list
the list of labels
y_pos : float, optional
the vertical position (in axes fraction) of the state text, by default 0.93
"""
for seq, cp, label in zip(sequences_list, cp_list, label_list):
seq.reset_index(drop=True, inplace=True)
plt.figure()
axes = seq.plot(y=["x", "y", "z"], figsize=(15, 6))
axes.vlines(cp[0:-1], 0, 1, transform=axes.get_xaxis_transform(), colors="r")
xlim = axes.get_xlim()
cp = np.insert(cp, 0, xlim[0])
x_range = np.diff(xlim)
for i in range(len(label)):
str_state = label["state"][i]
if i == 0:
x_pos = (np.mean(cp[i : i + 2]) - xlim[0]) / x_range
else:
x_pos = (np.mean(cp[i : i + 2]) - xlim[0] / 2) / x_range
axes.text(x_pos, y_pos, str_state, transform=axes.transAxes)
def get_key(y_pred, label_dict):
"""
Get the labels corresponding to the predicted values.
Parameters
----------
y_pred : array
the predicted values
label_dict : dict
the label dictionary
Returns
-------
list
the label list
"""
label_str = list()
for value in y_pred:
key = [key for key, val in label_dict.items() if val == value]
label_str.append(key[0])
return label_str
def get_label_hasc(model, x_test, label_dict):
"""
This function gets the predicted label for the HASC data
Parameters
----------
model : tensorflow model
The trained tensorflow model
x_test : 2D array
The array of test dataset
label_dict : dict
The label dictionary
Returns
-------
tuple
the list of predicted labels and the array of predicted probabilities
"""
pred_prob = tf.math.softmax(model.predict(x_test))
y_pred = np.argmax(pred_prob, axis=1)
label_str = get_key(y_pred, label_dict)
return label_str, pred_prob.numpy()