Module redvox.common.gap_and_pad_utils
from typing import List, Tuple, Optional, Dict
import enum
from math import modf
from dataclasses import dataclass, field
from dataclasses_json import dataclass_json
import numpy as np
import pyarrow as pa
import pyarrow.compute as pc
from redvox.common import date_time_utils as dtu
from redvox.common.errors import RedVoxExceptions
from redvox.api1000.wrapped_redvox_packet.sensors.audio import AudioCodec
from redvox.api1000.wrapped_redvox_packet.sensors.location import LocationProvider
from redvox.api1000.wrapped_redvox_packet.sensors.image import ImageCodec
from redvox.api1000.wrapped_redvox_packet.station_information import \
NetworkType, PowerState, CellServiceState, WifiWakeLock, ScreenState
# default maximum number of points to process when brute-forcing gap timestamps
DEFAULT_MAX_BRUTE_FORCE_GAP_TIMESTAMPS: int = 5000
# fraction of packet duration required for a gap to be counted as a whole packet
DEFAULT_GAP_UPPER_LIMIT: float = 0.8
# fraction of packet duration below which a gap is ignored
DEFAULT_GAP_LOWER_LIMIT: float = 0.02
# columns for audio table
AUDIO_DF_COLUMNS = ["timestamps", "unaltered_timestamps", "microphone"]
# columns that cannot be interpolated
NON_INTERPOLATED_COLUMNS = ["compressed_audio", "image"]
# columns that are not numeric but can be interpolated
NON_NUMERIC_COLUMNS = ["location_provider", "image_codec", "audio_codec", "network_type",
"power_state", "cell_service", "wifi_wake_lock", "screen_state"]
# noinspection Mypy,DuplicatedCode
class DataPointCreationMode(enum.Enum):
"""
Type of data point to create
"""
NAN: int = 0
COPY: int = 1
INTERPOLATE: int = 2
    @staticmethod
    def list_names() -> List[str]:
        """
        :return: list of the names of all DataPointCreationMode values
        """
        return [n.name for n in DataPointCreationMode]
@dataclass_json
@dataclass
class GapPadResult:
"""
The result of filling gaps or padding a time series
"""
result: Optional[pa.Table] = None
    gaps: List[Tuple[float, float]] = field(default_factory=list)
errors: RedVoxExceptions = field(default_factory=lambda: RedVoxExceptions("GapPadResult"))
def add_error(self, error: str):
"""
add an error to the result
:param error: error message to add
"""
self.errors.append(error)
@dataclass_json
@dataclass
class AudioWithGaps:
"""
    Holds the metadata and gap information needed to reconstruct audio data that may contain gaps
Properties:
sample_interval_micros: microseconds between sample points
metadata: list of start times in microseconds since epoch UTC and the data to add
gaps: the list of start and end points of gaps (the start and end are actual data points)
errors: the errors encountered while getting the data
"""
sample_interval_micros: float
metadata: Optional[List[Tuple[float, pa.Table]]] = None
    gaps: List[Tuple[float, float]] = field(default_factory=list)
errors: RedVoxExceptions = field(default_factory=lambda: RedVoxExceptions("AudioWithGaps"))
    def create_timestamps(self) -> pa.Table:
        """
        :return: audio data table reconstructed from the metadata, with np.nan filling the gaps
        """
result_array = [[], [], []]
for m in self.metadata:
timestamps = calc_evenly_sampled_timestamps(m[0], m[1].num_rows, self.sample_interval_micros)
result_array[0].extend(timestamps)
result_array[1].extend(timestamps)
result_array[2].extend(m[1]["microphone"].to_numpy())
for gs, ge in self.gaps:
num_samples = int((ge - gs) / self.sample_interval_micros) - 1
timestamps = calc_evenly_sampled_timestamps(gs + self.sample_interval_micros, num_samples,
self.sample_interval_micros)
gap_array = [timestamps, np.full(len(timestamps), np.nan)]
result_array[0].extend(gap_array[0])
result_array[1].extend(gap_array[0])
result_array[2].extend(gap_array[1])
ptable = pa.Table.from_pydict(dict(zip(AUDIO_DF_COLUMNS, result_array)))
return pc.take(ptable, pc.sort_indices(ptable, sort_keys=[("timestamps", "ascending")]))
def calc_evenly_sampled_timestamps(
        start: float, samples: int, sample_interval_micros: float
) -> np.ndarray:
    """
    given a start time, calculates `samples` evenly spaced timestamps, each sample_interval_micros apart
    :param start: float, start timestamp in microseconds
    :param samples: int, number of samples
    :param sample_interval_micros: float, sample interval in microseconds
    :return: np.ndarray of `samples` timestamps, evenly spaced, starting at start
    """
return start + (np.arange(0, samples) * sample_interval_micros)
def check_gap_list(gaps: List[Tuple[float, float]], start_timestamp: Optional[float] = None,
                   end_timestamp: Optional[float] = None) -> List[Tuple[float, float]]:
    """
    removes any gaps where end time <= start time, consolidates overlapping gaps, and clamps gaps so that
    none start or end before start_timestamp or after end_timestamp.  All timestamps are in
    microseconds since epoch UTC
    :param gaps: list of gaps to check
    :param start_timestamp: lowest possible timestamp for a gap to start at
    :param end_timestamp: highest possible timestamp for a gap to end at
    :return: list of valid, consolidated gaps
    """
    # clamp gaps to the allowed bounds and drop gaps with no duration
    clamped: List[Tuple[float, float]] = []
    for gap in gaps:
        if start_timestamp is not None:
            gap = (max(start_timestamp, gap[0]), max(start_timestamp, gap[1]))
        if end_timestamp is not None:
            gap = (min(end_timestamp, gap[0]), min(end_timestamp, gap[1]))
        if gap[0] < gap[1]:
            clamped.append(gap)
    # merge overlapping gaps by sorting on start time and extending the last merged gap as needed
    return_gaps: List[Tuple[float, float]] = []
    for gap in sorted(clamped):
        if return_gaps and gap[0] <= return_gaps[-1][1]:
            return_gaps[-1] = (return_gaps[-1][0], max(return_gaps[-1][1], gap[1]))
        else:
            return_gaps.append(gap)
    return return_gaps
def fill_gaps(
arrow_df: pa.Table,
gaps: List[Tuple[float, float]],
sample_interval_micros: float,
copy: bool = False
) -> Tuple[pa.Table, List[Tuple[float, float]]]:
"""
fills gaps in the table with np.nan or interpolated values by interpolating timestamps based on the
calculated sample interval
:param arrow_df: pyarrow table with data. first column is "timestamps"
:param gaps: list of tuples of known non-inclusive start and end timestamps of the gaps
:param sample_interval_micros: known sample interval of the data points
:param copy: if True, copy the data points, otherwise interpolate from edges, default False
:return: table without gaps and the list of gaps
"""
# extract the necessary information to compute gap size and gap timestamps
data_time_stamps = arrow_df["timestamps"].to_numpy()
if len(data_time_stamps) > 1:
data_duration = data_time_stamps[-1] - data_time_stamps[0]
expected_samples = (np.floor(data_duration / sample_interval_micros)
+ (1 if data_duration % sample_interval_micros >=
sample_interval_micros * DEFAULT_GAP_UPPER_LIMIT else 0)) + 1
if expected_samples > len(data_time_stamps):
if copy:
pcm = DataPointCreationMode["COPY"]
else:
pcm = DataPointCreationMode["NAN"]
# make it safe to alter the gap values
my_gaps = check_gap_list(gaps, data_time_stamps[0], data_time_stamps[-1])
        for gap in my_gaps:
            # if there are data points on either side of the gap, snap the gap edges to them
            before_start = np.argwhere([t <= gap[0] for t in data_time_stamps])
            after_end = np.argwhere([t >= gap[1] for t in data_time_stamps])
            if len(before_start) > 0:
                before_start = before_start[-1][0]
                gap = (data_time_stamps[before_start], gap[1])
            else:
                before_start = None
            if len(after_end) > 0:
                after_end = after_end[0][0]
                gap = (gap[0], data_time_stamps[after_end])
            else:
                after_end = None
num_new_points = int((gap[1] - gap[0]) / sample_interval_micros) - 1
if before_start is not None:
arrow_df = add_data_points_to_df(arrow_df, before_start, sample_interval_micros,
num_new_points, pcm)
elif after_end is not None:
arrow_df = add_data_points_to_df(arrow_df, after_end, -sample_interval_micros,
num_new_points, pcm)
indic = pc.sort_indices(arrow_df, sort_keys=[("timestamps", "ascending")])
return arrow_df.take(indic), gaps
return arrow_df, gaps
def fill_audio_gaps2(
packet_data: List[Tuple[float, pa.Table]],
sample_interval_micros: float,
gap_upper_limit: float = DEFAULT_GAP_UPPER_LIMIT,
gap_lower_limit: float = DEFAULT_GAP_LOWER_LIMIT
) -> AudioWithGaps:
"""
fills gaps in the table with np.nan by interpolating timestamps based on the expected sample interval
* ignores gaps with duration less than or equal to packet length * gap_lower_limit
* converts gaps with duration greater than or equal to packet length * gap_upper_limit into a multiple of
packet length
:param packet_data: list of tuples, each tuple containing two pieces of packet information:
packet_start_timestamps; float of packet start timestamp in microseconds
and audio_data; pa.Table of data points
:param sample_interval_micros: sample interval in microseconds
:param gap_upper_limit: percentage of packet length required to confirm gap is at least 1 packet,
default DEFAULT_GAP_UPPER_LIMIT
:param gap_lower_limit: percentage of packet length required to disregard gap, default DEFAULT_GAP_LOWER_LIMIT
:return: list of timestamps of the non-inclusive start and end of the gaps
"""
last_data_timestamp = packet_data[0][0]
gaps = []
result = AudioWithGaps(sample_interval_micros, packet_data)
for packet in packet_data:
samples_in_packet = packet[1].num_rows
start_ts = packet[0]
packet_length = sample_interval_micros * samples_in_packet
# check if start_ts is close to the last timestamp in data_timestamps
last_timestamp_diff = start_ts - last_data_timestamp
if last_timestamp_diff > gap_lower_limit * packet_length:
fractional_packet, num_packets = modf(last_timestamp_diff /
(samples_in_packet * sample_interval_micros))
if fractional_packet >= gap_upper_limit:
num_samples = samples_in_packet * (num_packets + 1)
else:
num_samples = np.max([np.floor((fractional_packet + num_packets) * samples_in_packet), 1])
start_ts = last_data_timestamp + (num_samples * sample_interval_micros) + sample_interval_micros
gaps.append((last_data_timestamp, start_ts))
elif last_timestamp_diff < -gap_lower_limit * packet_length:
result.add_error(f"Packet start timestamp: {dtu.microseconds_to_seconds(start_ts)} "
f"is before last timestamp of previous "
f"packet: {dtu.microseconds_to_seconds(last_data_timestamp)}")
last_data_timestamp += (samples_in_packet + 1) * sample_interval_micros
result.gaps = gaps
return result
def fill_audio_gaps(
packet_data: List[Tuple[float, pa.Table]],
sample_interval_micros: float,
gap_upper_limit: float = DEFAULT_GAP_UPPER_LIMIT,
gap_lower_limit: float = DEFAULT_GAP_LOWER_LIMIT
) -> GapPadResult:
"""
fills gaps in the table with np.nan by interpolating timestamps based on the expected sample interval
* ignores gaps with duration less than or equal to packet length * gap_lower_limit
* converts gaps with duration greater than or equal to packet length * gap_upper_limit into a multiple of
packet length
:param packet_data: list of tuples, each tuple containing two pieces of packet information:
packet_start_timestamps; float of packet start timestamp in microseconds
and audio_data; pa.Table of data points
:param sample_interval_micros: sample interval in microseconds
:param gap_upper_limit: percentage of packet length required to confirm gap is at least 1 packet,
default DEFAULT_GAP_UPPER_LIMIT
:param gap_lower_limit: percentage of packet length required to disregard gap, default DEFAULT_GAP_LOWER_LIMIT
:return: table without gaps and the list of timestamps of the non-inclusive start and end of the gaps
"""
result_array = [[], [], []]
last_data_timestamp: Optional[float] = None
gaps = []
result = GapPadResult()
for packet in packet_data:
samples_in_packet = packet[1].num_rows
start_ts = packet[0]
packet_length = sample_interval_micros * samples_in_packet
        if last_data_timestamp is not None:
last_data_timestamp += sample_interval_micros
# check if start_ts is close to the last timestamp in data_timestamps
last_timestamp_diff = start_ts - last_data_timestamp
if last_timestamp_diff > gap_lower_limit * packet_length:
fractional_packet, num_packets = modf(last_timestamp_diff /
(samples_in_packet * sample_interval_micros))
if fractional_packet >= gap_upper_limit:
num_samples = samples_in_packet * (num_packets + 1)
else:
num_samples = np.max([np.floor((fractional_packet + num_packets) * samples_in_packet), 1])
gap_ts = calc_evenly_sampled_timestamps(last_data_timestamp, num_samples, sample_interval_micros)
gap_array = [gap_ts, np.full(len(gap_ts), np.nan)]
start_ts = gap_ts[-1] + sample_interval_micros
gaps.append((last_data_timestamp, start_ts))
result_array[0].extend(gap_array[0])
result_array[1].extend(gap_array[0])
result_array[2].extend(gap_array[1])
        elif last_timestamp_diff < -gap_lower_limit * packet_length:
            result.add_error(f"Packet start timestamp: {dtu.microseconds_to_seconds(start_ts)} "
                             f"is before last timestamp of previous "
                             f"packet: {dtu.microseconds_to_seconds(last_data_timestamp)}")
estimated_ts = calc_evenly_sampled_timestamps(start_ts, samples_in_packet, sample_interval_micros)
last_data_timestamp = estimated_ts[-1]
result_array[0].extend(estimated_ts)
result_array[1].extend(estimated_ts)
result_array[2].extend(packet[1]["microphone"].to_numpy())
result.result = pa.Table.from_pydict(dict(zip(AUDIO_DF_COLUMNS, result_array)))
result.gaps = gaps
return result
def add_data_points_to_df(data_table: pa.Table,
start_index: int,
sample_interval_micros: float,
num_samples_to_add: int = 1,
point_creation_mode: DataPointCreationMode = DataPointCreationMode.COPY,
) -> pa.Table:
"""
adds data points to the end of the table, starting from the index specified.
Note:
* table must not be empty
* start_index must be non-negative and less than the length of table
* num_samples_to_add must be greater than 0
* sample_interval_micros cannot be 0
* points are added onto the end and the result is not sorted
Options for point_creation_mode are:
* NAN: default values and nans
* COPY: copies of the start data point
* INTERPOLATE: interpolated values between start data point and adjacent point
:param data_table: pyarrow table to add dataless timestamps to
:param start_index: index of the table to use as starting point for creating new values
:param sample_interval_micros: sample interval in microseconds of the timestamps; use negative values to
add points before the start_index
:param num_samples_to_add: the number of timestamps to create, default 1
:param point_creation_mode: the mode of point creation to use
:return: updated table with synthetic data points
"""
    if 0 <= start_index < len(data_table) and num_samples_to_add > 0 \
            and sample_interval_micros != 0.:
        start_timestamp = data_table["timestamps"][start_index].as_py()
        # create timestamps for every point that needs to be added
        new_timestamps = start_timestamp + np.arange(1, num_samples_to_add + 1) * sample_interval_micros
        if point_creation_mode == DataPointCreationMode.COPY:
            # copy the start point once per new timestamp
            copy_row = data_table.slice(start_index, 1).to_pydict()
            empty_df = pa.Table.from_pydict(
                {k: (list(new_timestamps) if k == "timestamps" else copy_row[k] * num_samples_to_add)
                 for k in data_table.schema.names})
        elif point_creation_mode == DataPointCreationMode.INTERPOLATE:
            # use the start point and the adjacent point in the direction of travel as interpolation edges
            start_point = data_table.slice(start_index, 1).to_pydict()
            end_point = data_table.slice(start_index + (1 if sample_interval_micros > 0 else -1), 1).to_pydict()
            start_ts = start_point["timestamps"][0]
            end_ts = end_point["timestamps"][0]
            # fraction of the distance between the two edge points for each new timestamp
            frac = (new_timestamps - start_ts) / (end_ts - start_ts)
            # non-numeric values are copied from whichever edge is closest to the new points
            nearest = start_point if np.abs(start_ts - new_timestamps[0]) \
                <= np.abs(end_ts - new_timestamps[0]) else end_point
            new_rows = {}
            for col in data_table.schema.names:
                # columns that cannot be interpolated are skipped; tables containing
                # them should not use INTERPOLATE mode
                if col == "timestamps":
                    new_rows[col] = list(new_timestamps)
                elif col in NON_NUMERIC_COLUMNS:
                    new_rows[col] = nearest[col] * num_samples_to_add
                elif col not in NON_INTERPOLATED_COLUMNS:
                    # linear interpolation between the two edge values
                    new_rows[col] = (start_point[col][0]
                                     + (end_point[col][0] - start_point[col][0]) * frac).tolist()
            empty_df = pa.Table.from_pydict(new_rows)
        else:
            # add nans and defaults
            empty_dict: Dict[str, List] = {}
            for column_index in data_table.schema.names:
if column_index == "timestamps":
empty_dict[column_index] = new_timestamps
elif column_index == "location_provider":
empty_dict[column_index] = [LocationProvider["UNKNOWN"].value for i in range(num_samples_to_add)]
elif column_index == "image_codec":
empty_dict[column_index] = [ImageCodec["UNKNOWN"].value for i in range(num_samples_to_add)]
elif column_index == "audio_codec":
empty_dict[column_index] = [AudioCodec["UNKNOWN"].value for i in range(num_samples_to_add)]
elif column_index == "network_type":
empty_dict[column_index] = [NetworkType["UNKNOWN_NETWORK"].value for i in range(num_samples_to_add)]
elif column_index == "power_state":
empty_dict[column_index] = [PowerState["UNKNOWN_POWER_STATE"].value
for i in range(num_samples_to_add)]
elif column_index == "cell_service":
empty_dict[column_index] = [CellServiceState["UNKNOWN"].value for i in range(num_samples_to_add)]
elif column_index == "wifi_wake_lock":
empty_dict[column_index] = [WifiWakeLock["NONE"].value for i in range(num_samples_to_add)]
elif column_index == "screen_state":
empty_dict[column_index] = [ScreenState["UNKNOWN_SCREEN_STATE"].value
for i in range(num_samples_to_add)]
else:
empty_dict[column_index] = np.full(num_samples_to_add, np.nan).tolist()
empty_df = pa.Table.from_pydict(empty_dict)
data_table = pa.concat_tables([data_table, empty_df])
return data_table
Functions
def add_data_points_to_df(data_table: pyarrow.lib.Table, start_index: int, sample_interval_micros: float, num_samples_to_add: int = 1, point_creation_mode: DataPointCreationMode = DataPointCreationMode.COPY) ‑> pyarrow.lib.Table
-
adds data points to the end of the table, starting from the index specified.
Note:
* table must not be empty
* start_index must be non-negative and less than the length of table
* num_samples_to_add must be greater than 0
* sample_interval_micros cannot be 0
* points are added onto the end and the result is not sorted
Options for point_creation_mode are:
* NAN: default values and nans
* COPY: copies of the start data point
* INTERPOLATE: interpolated values between start data point and adjacent point
:param data_table: pyarrow table to add dataless timestamps to
:param start_index: index of the table to use as starting point for creating new values
:param sample_interval_micros: sample interval in microseconds of the timestamps; use negative values to add points before the start_index
:param num_samples_to_add: the number of timestamps to create, default 1
:param point_creation_mode: the mode of point creation to use
:return: updated table with synthetic data points
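A minimal usage sketch; the table, its "pressure" column, and all values are hypothetical:

import pyarrow as pa

from redvox.common.gap_and_pad_utils import DataPointCreationMode, add_data_points_to_df

# hypothetical table: three points sampled 1000 microseconds apart
table = pa.Table.from_pydict({"timestamps": [0.0, 1000.0, 2000.0],
                              "pressure": [101.0, 102.0, 103.0]})
# append two NaN rows after the last point, 1000 microseconds apart
padded = add_data_points_to_df(table, start_index=2, sample_interval_micros=1000.0,
                               num_samples_to_add=2,
                               point_creation_mode=DataPointCreationMode.NAN)
print(padded["timestamps"].to_pylist())  # [0.0, 1000.0, 2000.0, 3000.0, 4000.0]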
def calc_evenly_sampled_timestamps(start: float, samples: int, sample_interval_micros: float) ‑> numpy.ndarray
-
given a start time, calculates `samples` evenly spaced timestamps, each sample_interval_micros apart
:param start: float, start timestamp in microseconds
:param samples: int, number of samples
:param sample_interval_micros: float, sample interval in microseconds
:return: np.ndarray of `samples` timestamps, evenly spaced, starting at start
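For example, a minimal sketch producing timestamps for one second of 800 Hz audio (the start time is arbitrary):

from redvox.common.gap_and_pad_utils import calc_evenly_sampled_timestamps

ts = calc_evenly_sampled_timestamps(start=0.0, samples=800, sample_interval_micros=1e6 / 800)
print(len(ts), ts[1] - ts[0])  # 800 samples, 1250.0 microseconds apart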
def check_gap_list(gaps: List[Tuple[float, float]], start_timestamp: Optional[float] = None, end_timestamp: Optional[float] = None) ‑> List[Tuple[float, float]]
-
removes any gaps where end time <= start time, consolidates overlapping gaps, and clamps gaps so that none start or end before start_timestamp or after end_timestamp. All timestamps are in microseconds since epoch UTC
:param gaps: list of gaps to check
:param start_timestamp: lowest possible timestamp for a gap to start at
:param end_timestamp: highest possible timestamp for a gap to end at
:return: list of valid, consolidated gaps
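A short sketch with invented timestamps: the two overlapping gaps merge and the inverted gap is dropped:

from redvox.common.gap_and_pad_utils import check_gap_list

gaps = [(1e6, 3e6), (2e6, 4e6), (6e6, 5e6)]
print(check_gap_list(gaps))  # [(1000000.0, 4000000.0)]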
def fill_audio_gaps(packet_data: List[Tuple[float, pyarrow.lib.Table]], sample_interval_micros: float, gap_upper_limit: float = 0.8, gap_lower_limit: float = 0.02) ‑> GapPadResult
-
fills gaps in the audio data with np.nan, computing timestamps from the expected sample interval
* ignores gaps with duration less than or equal to packet length * gap_lower_limit
* converts gaps with duration greater than or equal to packet length * gap_upper_limit into a multiple of packet length
:param packet_data: list of tuples, each tuple containing two pieces of packet information: packet_start_timestamp: float of packet start timestamp in microseconds, and audio_data: pa.Table of data points
:param sample_interval_micros: sample interval in microseconds
:param gap_upper_limit: percentage of packet length required to confirm gap is at least 1 packet, default DEFAULT_GAP_UPPER_LIMIT
:param gap_lower_limit: percentage of packet length required to disregard gap, default DEFAULT_GAP_LOWER_LIMIT
:return: GapPadResult containing the gapless table, the non-inclusive gap start and end timestamps, and any errors encountered
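A minimal sketch, assuming two hypothetical 80-sample packets at 800 Hz with one packet of data missing between them; the microphone values are placeholders:

import numpy as np
import pyarrow as pa

from redvox.common.gap_and_pad_utils import fill_audio_gaps

def packet(start_ts: float):
    # 80 synthetic microphone samples starting at start_ts
    return start_ts, pa.Table.from_pydict({"microphone": np.zeros(80)})

result = fill_audio_gaps([packet(0.0), packet(200_000.0)], sample_interval_micros=1250.0)
print(result.result.num_rows)  # 240: 160 real samples plus 80 NaN gap samples
print(result.gaps)             # [(100000.0, 200000.0)]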
def fill_audio_gaps2(packet_data: List[Tuple[float, pyarrow.lib.Table]], sample_interval_micros: float, gap_upper_limit: float = 0.8, gap_lower_limit: float = 0.02) ‑> AudioWithGaps
-
computes the gaps in a list of audio packets; the returned AudioWithGaps can rebuild the complete record with np.nan in the gaps
* ignores gaps with duration less than or equal to packet length * gap_lower_limit
* converts gaps with duration greater than or equal to packet length * gap_upper_limit into a multiple of packet length
:param packet_data: list of tuples, each tuple containing two pieces of packet information: packet_start_timestamp: float of packet start timestamp in microseconds, and audio_data: pa.Table of data points
:param sample_interval_micros: sample interval in microseconds
:param gap_upper_limit: percentage of packet length required to confirm gap is at least 1 packet, default DEFAULT_GAP_UPPER_LIMIT
:param gap_lower_limit: percentage of packet length required to disregard gap, default DEFAULT_GAP_LOWER_LIMIT
:return: AudioWithGaps holding the packet metadata, the non-inclusive gap start and end timestamps, and any errors encountered
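The same hypothetical packets, but kept in the lazy AudioWithGaps form; create_timestamps() materializes the full table on demand:

import numpy as np
import pyarrow as pa

from redvox.common.gap_and_pad_utils import fill_audio_gaps2

packets = [(0.0, pa.Table.from_pydict({"microphone": np.zeros(80)})),
           (200_000.0, pa.Table.from_pydict({"microphone": np.zeros(80)}))]
meta = fill_audio_gaps2(packets, sample_interval_micros=1250.0)
print(meta.gaps)                  # the detected gap as (start, end) timestamps
audio = meta.create_timestamps()  # full table with np.nan in the gap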
def fill_gaps(arrow_df: pyarrow.lib.Table, gaps: List[Tuple[float, float]], sample_interval_micros: float, copy: bool = False) ‑> Tuple[pyarrow.lib.Table, List[Tuple[float, float]]]
-
fills gaps in the table by computing the missing timestamps from the known sample interval; the remaining columns are filled with np.nan and default values, or with copies of the edge points
:param arrow_df: pyarrow table with data; first column is "timestamps"
:param gaps: list of tuples of known non-inclusive start and end timestamps of the gaps
:param sample_interval_micros: known sample interval of the data points
:param copy: if True, copy the edge data points into the gap, otherwise fill with np.nan and default values, default False
:return: table without gaps and the list of gaps
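A minimal sketch, assuming a hypothetical 1 Hz sensor table that is missing the samples between 2e6 and 7e6 microseconds:

import pyarrow as pa

from redvox.common.gap_and_pad_utils import fill_gaps

table = pa.Table.from_pydict({"timestamps": [0.0, 1e6, 2e6, 7e6, 8e6],
                              "temperature": [20.0, 20.1, 20.2, 20.7, 20.8]})
filled, gaps = fill_gaps(table, [(2e6, 7e6)], sample_interval_micros=1e6)
print(filled.num_rows)  # 9: the four missing rows are added with NaN temperature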
Classes
class AudioWithGaps (sample_interval_micros: float, metadata: Optional[List[Tuple[float, pyarrow.lib.Table]]] = None, gaps: List[Tuple[float, float]] = <factory>, errors: RedVoxExceptions = <factory>)
-
Holds the metadata and gap information needed to reconstruct audio data that may contain gaps
Properties
sample_interval_micros: microseconds between sample points
metadata: list of start times in microseconds since epoch UTC and the data to add
gaps: the list of start and end points of gaps (the start and end are actual data points)
errors: the errors encountered while getting the data
Class variables
var errors : RedVoxExceptions
var gaps : List[Tuple[float, float]]
var metadata : Optional[List[Tuple[float, pyarrow.lib.Table]]]
var sample_interval_micros : float
Methods
def create_timestamps(self) ‑> pyarrow.lib.Table
-
:return: audio data table reconstructed from the metadata, with np.nan filling the gaps
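A hand-built sketch with invented values: eight samples at 1000-microsecond spacing followed by a gap that holds four more:

import numpy as np
import pyarrow as pa

from redvox.common.gap_and_pad_utils import AudioWithGaps

tbl = pa.Table.from_pydict({"microphone": np.ones(8)})
awg = AudioWithGaps(sample_interval_micros=1000.0, metadata=[(0.0, tbl)],
                    gaps=[(7000.0, 12000.0)])
print(awg.create_timestamps().num_rows)  # 12: 8 real samples plus 4 NaN gap samples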
class DataPointCreationMode (value, names=None, *, module=None, qualname=None, type=None, start=1)
-
Type of data point to create
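For instance, list_names() enumerates the available modes:

from redvox.common.gap_and_pad_utils import DataPointCreationMode

print(DataPointCreationMode.list_names())  # ['NAN', 'COPY', 'INTERPOLATE']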
Ancestors
- enum.Enum
Class variables
var COPY : int
var INTERPOLATE : int
var NAN : int
Static methods
def list_names() ‑> List[str]
-
:return: list of the names of all DataPointCreationMode values
class GapPadResult (result: Optional[pyarrow.lib.Table] = None, gaps: List[Tuple[float, float]] = <factory>, errors: RedVoxExceptions = <factory>)
-
The result of filling gaps or padding a time series
Class variables
var errors : RedVoxExceptions
var gaps : List[Tuple[float, float]]
var result : Optional[pyarrow.lib.Table]
Methods
def add_error(self, error: str)
-
add an error to the result
:param error: error message to add