import numpy as np
from .experiment_data import ExperimentValues
class SimulateCrookData:
    """Simulate clustered data as described by Crook et al.

    Data will be generated in accordance with the parameters passed through
    :func:`vbvarsel.global_parameters.SimulationParameters`.
    This only generates synthetic data, and as such is not used if a user
    supplies their own data source. `Reference paper <https://www.degruyter.com/document/doi/10.1515/sagmb-2018-0065/html>`_

    Attributes
    ----------
    observation : int
        Number of observations to simulate.
    n_variables : int
        Number of variables to simulate.
    n_relevant : int
        Number of variables that are relevant.
    mixture_proportions : list[float]
        Proportion of observations in each cluster, length of the array
        defines number of simulated clusters.
    means : list[int]
        Mean of the Gaussian distribution for each cluster.
    variance_covariance_matrix : np.ndarray
        Matrix of variance and covariance for simulation.
    ExperimentValues : ExperimentValues
        Container on which the methods of this class store their results
        (``true_labels``, ``data``, ``permutations``, ``shuffled_data``).
    """

    def __init__(
        self,
        observation: int,
        n_variables: int,
        n_relevant: int,
        mixture_proportions: list,
        means: list,
        variance_covariance_matrix: np.ndarray,
    ):
        self.observation = observation
        self.n_variables = n_variables
        self.n_relevant = n_relevant
        self.mixture_proportions = mixture_proportions
        # `relevant_vars` indexes `self.means` by cluster, so the constructor
        # argument has to be kept on the instance.
        self.means = means
        self.variance_covariance_matrix = variance_covariance_matrix
        self.ExperimentValues = ExperimentValues()

    def relevant_vars(self) -> np.ndarray:
        """Draw the relevant variables for every simulated observation.

        For each observation a cluster is picked according to
        ``mixture_proportions`` and a sample is drawn from a multivariate
        normal whose mean vector is that cluster's mean repeated
        ``n_relevant`` times. The picked cluster indices are stored on
        ``self.ExperimentValues.true_labels``.

        Returns
        -------
        np.ndarray
            Array of shape ``(observation, n_relevant)``.
        """
        n_components = len(self.mixture_proportions)
        samples = []
        true_labels = []
        # Sampled one observation at a time so the order in which random
        # numbers are consumed (and hence seeded output) stays the same.
        for _ in range(self.observation):
            component = np.random.choice(n_components, p=self.mixture_proportions)
            true_labels.append(component)
            mean_vector = np.full(self.n_relevant, self.means[component])
            samples.append(
                np.random.multivariate_normal(
                    mean_vector, self.variance_covariance_matrix
                )
            )
        self.ExperimentValues.true_labels = true_labels
        # The explicit reshape keeps the result two-dimensional even when no
        # observations were requested, so it can still be stacked column-wise.
        return np.asarray(samples, dtype=float).reshape(
            len(samples), self.n_relevant
        )

    def irrelevant_vars(self) -> np.ndarray:
        """Draw the irrelevant variables as standard-normal noise.

        Returns
        -------
        np.ndarray
            Array of shape ``(observation, n_variables - n_relevant)``.
        """
        n_irrelevant = self.n_variables - self.n_relevant
        return np.random.randn(self.observation, n_irrelevant)

    def data_sim(self) -> np.ndarray:
        """Build the simulated data set.

        Relevant variables are placed in the leading columns and irrelevant
        variables after them. The result is also stored on
        ``self.ExperimentValues.data``.

        Returns
        -------
        np.ndarray
            Array of shape ``(observation, n_variables)``.
        """
        relevant_variables = self.relevant_vars()
        irrelevant_variables = self.irrelevant_vars()
        data = np.hstack((relevant_variables, irrelevant_variables))
        self.ExperimentValues.data = data
        return data

    def permutation(self) -> np.ndarray:
        """Draw a random permutation of the variable (column) indices.

        The result is also stored on ``self.ExperimentValues.permutations``.

        Returns
        -------
        np.ndarray
            Permutation of ``0 .. n_variables - 1``.
        """
        permutations = np.random.permutation(self.n_variables)
        self.ExperimentValues.permutations = permutations
        return permutations

    def shuffle_sim_data(self, data, permutation) -> np.ndarray:
        """Reorder the columns of simulated data by a permutation.

        The result is also stored on ``self.ExperimentValues.shuffled_data``.

        Parameters
        ----------
        data : np.ndarray
            Array of data generated from ``self.data_sim()``.
        permutation : np.ndarray
            Array of column indices generated from ``self.permutation()``.

        Returns
        -------
        np.ndarray
            ``data`` with its columns reordered according to ``permutation``.
        """
        shuffled_data = data[:, permutation]
        self.ExperimentValues.shuffled_data = shuffled_data
        return shuffled_data