sqlmesh.schedulers.airflow.operators.hwm_sensor
```python
import logging
import typing as t
from datetime import datetime

from airflow.models import DagRun
from airflow.sensors.base import BaseSensorOperator
from airflow.utils.context import Context

from sqlmesh.core.snapshot import Snapshot, SnapshotTableInfo
from sqlmesh.schedulers.airflow import util
from sqlmesh.utils.date import to_datetime

logger = logging.getLogger(__name__)


class HighWaterMarkSensor(BaseSensorOperator):
    def __init__(
        self,
        target_snapshot_info: SnapshotTableInfo,
        this_snapshot: Snapshot,
        poke_interval: float = 60.0,
        timeout: float = 7.0 * 24.0 * 60.0 * 60.0,  # 7 days
        mode: str = "reschedule",
        **kwargs: t.Any,
    ) -> None:
        super().__init__(
            poke_interval=poke_interval,
            timeout=timeout,
            mode=mode,
            **kwargs,
        )
        self.target_snapshot_info = target_snapshot_info
        self.this_snapshot = this_snapshot

    def poke(self, context: Context) -> bool:
        dag_run = context["dag_run"]

        # The current high water mark is the end of the latest interval the
        # target (upstream) snapshot has processed; None if nothing yet.
        target_snapshot = self._get_target_snapshot()
        if target_snapshot.intervals:
            current_high_water_mark = to_datetime(target_snapshot.intervals[-1][1])
        else:
            current_high_water_mark = None

        target_high_water_mark = self._compute_target_high_water_mark(dag_run, target_snapshot)

        logger.info(
            "The current high water mark for snapshot %s is '%s' (target is '%s')",
            self.target_snapshot_info.snapshot_id,
            current_high_water_mark,
            target_high_water_mark,
        )
        if current_high_water_mark is not None:
            return current_high_water_mark >= target_high_water_mark
        return False

    def _compute_target_high_water_mark(
        self, dag_run: DagRun, target_snapshot: Snapshot
    ) -> datetime:
        # Take the earlier of the two models' most recent cron ticks, so the
        # sensor never demands more upstream data than either schedule implies.
        target_date = to_datetime(dag_run.data_interval_end)
        target_prev = to_datetime(target_snapshot.model.cron_floor(target_date))
        this_prev = to_datetime(self.this_snapshot.model.cron_floor(target_date))
        return min(target_prev, this_prev)

    def _get_target_snapshot(self) -> Snapshot:
        # Fetch all snapshots that share the target's version and merge them
        # into a single snapshot view with the combined intervals.
        with util.scoped_state_sync() as state_sync:
            target_snapshots = state_sync.get_snapshots_with_same_version(
                [self.target_snapshot_info]
            )
            return Snapshot.merge_snapshots(
                [self.target_snapshot_info],
                {s.snapshot_id: s for s in target_snapshots},
            )[0]
```
class HighWaterMarkSensor(airflow.sensors.base.BaseSensorOperator):
Sensor operators are derived from this class and inherit these attributes.
Sensor operators keep executing at a time interval and succeed when a criterion is met, failing if and when they time out.
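As a point of reference, the contract those sensors follow is an overridden `poke` method that returns `True` once the condition holds. A minimal, hypothetical subclass (not part of this module, shown only to illustrate the base-class contract):

```python
import os

from airflow.sensors.base import BaseSensorOperator
from airflow.utils.context import Context


class FileExistsSensor(BaseSensorOperator):
    """Hypothetical example: succeeds once a file appears on disk."""

    def __init__(self, path: str, **kwargs) -> None:
        super().__init__(**kwargs)
        self.path = path

    def poke(self, context: Context) -> bool:
        # True ends the sensor successfully; False schedules another poke
        # after poke_interval, until timeout is reached.
        return os.path.exists(self.path)
```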
Parameters
- soft_fail: Set to true to mark the task as SKIPPED on failure
- poke_interval: Time in seconds that the job should wait between tries
- timeout: Time, in seconds, before the task times out and fails.
- mode: How the sensor operates. Options are { poke | reschedule }; the default is poke. When set to poke, the sensor takes up a worker slot for its whole execution time and sleeps between pokes. Use this mode if the expected runtime of the sensor is short or if a short poke interval is required. Note that the sensor will hold onto a worker slot and a pool slot for the duration of its runtime in this mode. When set to reschedule, the sensor task frees the worker slot when the criterion is not yet met and is rescheduled at a later time. Use this mode if the time before the criterion is met is expected to be quite long. The poke interval should be more than one minute to prevent too much load on the scheduler. (Note that HighWaterMarkSensor itself defaults to reschedule; see the usage sketch after this list.)
- exponential_backoff: Allow progressively longer waits between pokes by using the exponential backoff algorithm.
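The parameters above are forwarded to BaseSensorOperator by HighWaterMarkSensor's constructor. A minimal sketch of attaching the sensor to a DAG, assuming the snapshot objects are produced elsewhere by SQLMesh's DAG generation (the helper function, task_id scheme, and override values here are hypothetical; only the sensor's own parameters come from the signature below):

```python
from airflow import DAG

from sqlmesh.core.snapshot import Snapshot, SnapshotTableInfo
from sqlmesh.schedulers.airflow.operators.hwm_sensor import HighWaterMarkSensor


def build_wait_task(
    dag: DAG,
    upstream_info: SnapshotTableInfo,
    this_snapshot: Snapshot,
) -> HighWaterMarkSensor:
    """Hypothetical helper: wait for an upstream snapshot to catch up."""
    return HighWaterMarkSensor(
        task_id=f"wait_for_{upstream_info.snapshot_id}",  # task_id/dag come from BaseOperator kwargs
        dag=dag,
        target_snapshot_info=upstream_info,
        this_snapshot=this_snapshot,
        poke_interval=300.0,  # poke every 5 minutes instead of every minute
        timeout=24.0 * 60.0 * 60.0,  # give up after 1 day instead of the 7-day default
        mode="reschedule",  # free the worker slot between pokes
    )
```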
HighWaterMarkSensor(target_snapshot_info: sqlmesh.core.snapshot.definition.SnapshotTableInfo, this_snapshot: sqlmesh.core.snapshot.definition.Snapshot, poke_interval: float = 60.0, timeout: float = 604800.0, mode: str = 'reschedule', **kwargs: Any)
def poke(self, context: airflow.utils.context.Context) -> bool:
Function that sensors deriving from this class should override; it should return True when the sensor's criterion is met.
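A worked sketch of the comparison this implementation of poke performs, using plain datetimes; the cron floors below are hard-coded stand-ins for `model.cron_floor`, which is not reimplemented here:

```python
from datetime import datetime

# Suppose the DAG run's data_interval_end is 2023-01-02 03:15 UTC.
target_date = datetime(2023, 1, 2, 3, 15)

# Stand-ins for model.cron_floor(target_date): the most recent scheduled
# tick at or before target_date for each model's cron.
target_prev = datetime(2023, 1, 2, 3, 0)  # upstream model runs hourly
this_prev = datetime(2023, 1, 2, 0, 0)    # this model runs daily

# _compute_target_high_water_mark takes the earlier of the two floors, so
# the sensor never waits for more upstream data than either schedule implies.
target_high_water_mark = min(target_prev, this_prev)  # 2023-01-02 00:00

# The current high water mark is the end of the upstream snapshot's latest
# processed interval (None if nothing has been processed, so poke fails).
current_high_water_mark = datetime(2023, 1, 2, 0, 0)

# poke succeeds once the upstream snapshot has caught up to the target.
assert current_high_water_mark >= target_high_water_mark
```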
Inherited Members
- airflow.sensors.base.BaseSensorOperator
- deps
- is_smart_sensor_compatible
- register_in_sensor_service
- get_poke_context
- get_execution_context
- execute
- prepare_for_execution
- reschedule
- get_serialized_fields
- airflow.models.baseoperator.BaseOperator
- partial
- task_group
- add_inlets
- add_outlets
- get_inlet_defs
- get_outlet_defs
- get_dag
- dag
- has_dag
- set_xcomargs_dependencies
- pre_execute
- post_execute
- on_kill
- render_template_fields
- clear
- get_task_instances
- run
- dry_run
- get_direct_relatives
- task_type
- roots
- leaves
- output
- xcom_push
- xcom_pull
- serialize_for_task_group
- inherits_from_empty_operator
- defer
- validate_mapped_arguments
- unmap
- airflow.models.abstractoperator.AbstractOperator
- dag_id
- get_template_env
- prepare_template
- resolve_template_files
- get_direct_relative_ids
- get_flat_relative_ids
- get_flat_relatives
- priority_weight_total
- operator_extra_link_dict
- global_operator_extra_link_dict
- get_extra_links
- render_template
- airflow.models.skipmixin.SkipMixin
- skip
- skip_all_except
- airflow.utils.log.logging_mixin.LoggingMixin
- log
- airflow.models.taskmixin.DAGNode
- set_downstream
- set_upstream
- downstream_list
- upstream_list
- iter_mapped_dependants
- airflow.models.taskmixin.DependencyMixin
- update_relative