Source code for sensortoolkit.qc._interval_downsampling

# -*- coding: utf-8 -*-
"""
This module contains methods for applying downsampling methods to convert
timeseries datasets at unevenly spaced sampling intervals to a uniform, evenly
spaced interval (the downsampling interval).

The downsampling interval is determined based on the variation in the orginally
recorded dataset. If, for instance, a sensor was set to record at 60 second
intervals but the interval between consecutively recorded timestamps
varied from 60 +/- 20 seconds, data may be downsampled to 120 second averages.

================================================================================

@Author:
  | Samuel Frederick, NSSC Contractor (ORAU)
  | U.S. EPA / ORD / CEMM / AMCD / SFSB

Created:
  Tue Aug 17 10:55:08 2021
Last Updated:
  Tue Aug 17 10:55:08 2021
"""
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


[docs]def sensor_timedelta(df_list, serials): """ Create dataframe with time deltas (in seconds) between subsequent data entries. Args: df_list (list): List of sensor dataframes at original, recorded sampling frequency. serials (dict): A dictionary of sensor serial identifiers for each unit in a testing group. Returns: delta_df (pandas DataFrame): A dataset containing the intervals in seconds between consecutive timestamps in recorded datasets. Each column corresponds to the time delta intervals for datasets within the passed df_list. """ delta_df = pd.DataFrame() for serial_id, df in zip(serials.values(), df_list): delta = pd.Series((df.index[1:] - df.index[0:-1]).seconds) delta_df[serial_id] = delta median = delta.median() total_len = delta.shape[0] delta_drift = delta[delta != median] delta_drift_len = delta_drift.shape[0] print('..Sensor median time delta:', median, 'seconds') print('....Percent of sensor dataset not recorded at median time ' 'delta:', '{:3.1f}'.format(100*(delta_drift_len/total_len)) + '%') return delta_df
[docs]def plot_recording_interval(delta_df): """Create a plot of the time delta interval vs. row index for recorded datasets. This plot indicates whether the recording interval varied in datasets during the duration of measurements. Args: delta_df (pandas DataFrame): A dataset containing the intervals in seconds between consecutive timestamps in recorded datasets. Each column corresponds to the time delta intervals for datasets within the passed df_list. Returns: None. """ fig, ax = plt.subplots(1,1, figsize=(10, 6)) for col in delta_df: delta_df[col].plot(ax=ax) ax.set_xlabel('Row Index') ax.set_ylabel('Recording interval delta (s)')
[docs]def timedelta_quantiles(df_list, serials): """Create quantile dataframe of time deltas for each dataset the in passed list of datasets (``df_list``). Args: df_list (list): List of sensor dataframes at original, recorded sampling frequency. serials (dict): A dictionary of sensor serial identifiers for each unit in a testing group. Returns: quant_df (pandas DataFrame): Dataset containing the time delta interval for measurements (for each dataset in the passed df_list) listed by quantile, ranging from 0 to 1 in 0.001 (0.1%) increments. The 0.5 quantile (50th percentile) corresponds to the median of time delta intervals for each dataset. """ delta_df = sensor_timedelta(df_list, serials) quant = lambda x, header: delta_df[header].quantile(x) quant_df = pd.DataFrame() for col in delta_df: quant_data = quant(np.arange(0, 1, 0.001), col) quant_df[col] = quant_data quant_df.index = quant_df.index.values.round(3) return quant_df
[docs]def downsampling_interval(quant_df, thres_quant=0.99, plot_quantiles=True): """Check if N times the median time delta is greater than the time delta at a threshold quantile (default is 99%) for each dataframe. Say we have the following scenario where a sensor was configured to record data at 60 second intervals but the recording interval occasionally drifted to shorter or longer intervals: - threshold quantile (``'thres_quant'``) = 0.99 (99th percentile) - threshold recording interval (recording interval at the 99th percentile) = 115 seconds - median recording interval (recording interval at the 50th percentile) = 60 seconds On the first iteration of the downsampling_interval() method, the function will check whether 1*60 seconds is greater than the threshold recording interval. Since 60 < 132 seconds, this is not true, so the method will step the multipliying factor up by 1. The second iteration will check whether 2*60 seconds is greater than the theshold recording interval. Since this is true (120 > 115 seconds), the loop will exit and indicate that the dataset should be downsampled to 120 second intervals. Args: quant_df (pandas DataFrame): Dataset containing the time delta interval for measurements (for each dataset in the passed df_list) listed by quantile, ranging from 0 to 1 in 0.001 (0.1%) increments. The 0.5 quantile (50th percentile) corresponds to the median of time delta intervals for each dataset. thres_quant (float, optional): A threshold quantile (normalized between 0 and 1) for the distribution of time deltas in recorded datasets. Downsampling is applied for time delta intervals that are the first multiple of the median time delta that exceeds the time delta corresponding to the threshold quantile. Defaults to 0.99. plot_quantiles (bool): If True, create a figure displaying the distribution of time delta intervals in recorded datasets (relative frequency of recorded time deltas within each quantile interval vs. the time delta of consecutive recorded timestamps). Defaults to True. Returns: interval (int or float): The downsampling interval, in seconds. """ mult = 1 exceed = False median = quant_df.loc[.50, :] while exceed is False: exceed = all(mult*median > quant_df.loc[thres_quant, :]) if all(mult*median == quant_df.loc[thres_quant, :]): exceed = True mult += 1 mult -= 1 interval = mult*median.mode()[0] if mult != 1: print('A downsampling interval of ' + str(interval) + ' seconds exceeds the ' + '{:3.2f}'.format(thres_quant) + ' percentile for sensor recording intervals.') else: print('The ' + '{:3.2f}'.format(thres_quant) + ' percentile for ' 'sensor recording intervals equals the median recording interval' '. Recommend proceeding with median recording interval.') if plot_quantiles: plot_timedelta_quantiles(quant_df, interval, thres_quant) return interval
[docs]def apply_downsampling(df_list, downsampling_interval): """Helper function for applying downsampled averaging to datasets for the passed downsampling interval. Args: df_list (list): List of sensor dataframes at original, recorded sampling frequency. downsampling_interval (int or float): The downsampling interval, in seconds. Returns: df_list (list): Modified list of sensor dataframes with downsampled, uniformly spaced timestamp intervals. """ # Downsample to mean param values for every downsampled interval interval = pd.to_timedelta(downsampling_interval, unit='s') for i, df in enumerate(df_list): df = df[:].groupby([pd.Grouper(freq=interval)]).mean() df_list[i] = df return df_list
[docs]def plot_timedelta_quantiles(quant_df, interval, thres_quant=0.99): """ Plot timedelta vs. quantile, indicate the threshold quantile and downsampling interval by gray dashed lines. Args: quant_df (pandas DataFrame): Dataset containing the time delta interval for measurements (for each dataset in the passed df_list) listed by quantile, ranging from 0 to 1 in 0.001 (0.1%) increments. The 0.5 quantile (50th percentile) corresponds to the median of time delta intervals for each dataset. interval (int or float): The downsampling interval, in seconds. thres_quant (float, optional): A threshold quantile (normalized between 0 and 1) for the distribution of time deltas in recorded datasets. Downsampling is applied for time delta intervals that are the first multiple of the median time delta that exceeds the time delta corresponding to the threshold quantile. Defaults to 0.99. Returns: None. """ fig, ax = plt.subplots(1,1, figsize=(7, 7)) quant_df.plot(ax=ax) quant_label = 'Quantile Threshold (' +str(thres_quant)+')' ax.axvline(x=thres_quant, label=quant_label, color='gray', linestyle='--', linewidth=2) downsample_label = 'Downsampling Interval (' + str(interval) + ' s)' ax.axhline(y=interval, label=downsample_label, color='k', linestyle=':', linewidth=3) ax.legend(fontsize=11) ax.set_xlabel('Quantile (%)', fontsize=12) ax.set_ylabel('Recording Interval Time Delta (s)', fontsize=12) ax.tick_params(axis='both', labelsize=11)