Edit on GitHub

sqlmesh.schedulers.airflow.operators.hwm_sensor

 1import logging
 2import typing as t
 3from datetime import datetime
 4
 5from airflow.models import DagRun
 6from airflow.sensors.base import BaseSensorOperator
 7from airflow.utils.context import Context
 8
 9from sqlmesh.core.snapshot import Snapshot, SnapshotTableInfo
10from sqlmesh.schedulers.airflow import util
11from sqlmesh.utils.date import to_datetime
12
13logger = logging.getLogger(__name__)
14
15
class HighWaterMarkSensor(BaseSensorOperator):
    """Airflow sensor that waits until an upstream snapshot's high water mark
    catches up to the boundary required by this snapshot's schedule.
    """

    def __init__(
        self,
        target_snapshot_info: SnapshotTableInfo,
        this_snapshot: Snapshot,
        poke_interval: float = 60.0,
        timeout: float = 7.0 * 24.0 * 60.0 * 60.0,  # 7 days
        mode: str = "reschedule",
        **kwargs: t.Any,
    ) -> None:
        super().__init__(
            poke_interval=poke_interval,
            timeout=timeout,
            mode=mode,
            **kwargs,
        )
        self.target_snapshot_info = target_snapshot_info
        self.this_snapshot = this_snapshot

    def poke(self, context: Context) -> bool:
        """Return True once the target snapshot's latest interval reaches the
        required high water mark for this DAG run."""
        dag_run = context["dag_run"]
        target_snapshot = self._get_target_snapshot()

        # The current mark is the end of the most recent filled interval, if any.
        intervals = target_snapshot.intervals
        current_hwm = to_datetime(intervals[-1][1]) if intervals else None
        target_hwm = self._compute_target_high_water_mark(dag_run, target_snapshot)

        logger.info(
            "The current high water mark for snapshot %s is '%s' (target is '%s')",
            self.target_snapshot_info.snapshot_id,
            current_hwm,
            target_hwm,
        )
        if current_hwm is None:
            return False
        return current_hwm >= target_hwm

    def _compute_target_high_water_mark(
        self, dag_run: DagRun, target_snapshot: Snapshot
    ) -> datetime:
        """Return the earlier of the two snapshots' latest cron boundaries at
        the end of this run's data interval."""
        end = to_datetime(dag_run.data_interval_end)
        boundaries = (
            to_datetime(target_snapshot.model.cron_floor(end)),
            to_datetime(self.this_snapshot.model.cron_floor(end)),
        )
        return min(boundaries)

    def _get_target_snapshot(self) -> Snapshot:
        """Fetch all snapshots sharing the target's version and merge their
        interval state into a single snapshot."""
        with util.scoped_state_sync() as state_sync:
            same_version = state_sync.get_snapshots_with_same_version(
                [self.target_snapshot_info]
            )
            merged = Snapshot.merge_snapshots(
                [self.target_snapshot_info],
                {snapshot.snapshot_id: snapshot for snapshot in same_version},
            )
            return merged[0]
class HighWaterMarkSensor(airflow.sensors.base.BaseSensorOperator):
class HighWaterMarkSensor(BaseSensorOperator):
    """Sensor that blocks a task until an upstream snapshot has been filled
    through the high water mark this snapshot's cadence requires.
    """

    def __init__(
        self,
        target_snapshot_info: SnapshotTableInfo,
        this_snapshot: Snapshot,
        poke_interval: float = 60.0,
        timeout: float = 7.0 * 24.0 * 60.0 * 60.0,  # 7 days
        mode: str = "reschedule",
        **kwargs: t.Any,
    ) -> None:
        super().__init__(
            poke_interval=poke_interval,
            timeout=timeout,
            mode=mode,
            **kwargs,
        )
        self.target_snapshot_info = target_snapshot_info
        self.this_snapshot = this_snapshot

    def poke(self, context: Context) -> bool:
        """Succeed once the upstream snapshot's newest interval end is at or
        past the required high water mark."""
        dag_run = context["dag_run"]
        snapshot = self._get_target_snapshot()

        observed = (
            to_datetime(snapshot.intervals[-1][1]) if snapshot.intervals else None
        )
        required = self._compute_target_high_water_mark(dag_run, snapshot)

        logger.info(
            "The current high water mark for snapshot %s is '%s' (target is '%s')",
            self.target_snapshot_info.snapshot_id,
            observed,
            required,
        )
        return observed is not None and observed >= required

    def _compute_target_high_water_mark(
        self, dag_run: DagRun, target_snapshot: Snapshot
    ) -> datetime:
        """The minimum of the two models' cron floors at the data interval end."""
        cutoff = to_datetime(dag_run.data_interval_end)
        upstream_floor = to_datetime(target_snapshot.model.cron_floor(cutoff))
        local_floor = to_datetime(self.this_snapshot.model.cron_floor(cutoff))
        return min(upstream_floor, local_floor)

    def _get_target_snapshot(self) -> Snapshot:
        """Load every snapshot with the target's version and merge them."""
        with util.scoped_state_sync() as state_sync:
            peers = state_sync.get_snapshots_with_same_version(
                [self.target_snapshot_info]
            )
            by_id = {s.snapshot_id: s for s in peers}
            return Snapshot.merge_snapshots([self.target_snapshot_info], by_id)[0]

Sensor operators are derived from this class and inherit these attributes.

Sensor operators keep executing at a time interval and succeed when a criterion is met, and fail if and when they time out.

Parameters
  • soft_fail: Set to true to mark the task as SKIPPED on failure
  • poke_interval: Time in seconds that the job should wait between each try
  • timeout: Time, in seconds, before the task times out and fails.
  • mode: How the sensor operates. Options are: { poke | reschedule }, default is poke. When set to poke the sensor takes up a worker slot for its whole execution time and sleeps between pokes. Use this mode if the expected runtime of the sensor is short or if a short poke interval is required. Note that the sensor will hold onto a worker slot and a pool slot for the duration of the sensor's runtime in this mode. When set to reschedule the sensor task frees the worker slot when the criterion is not yet met and it is rescheduled at a later time. Use this mode if the time before the criterion is met is expected to be quite long. The poke interval should be more than one minute to prevent too much load on the scheduler.
  • exponential_backoff: allow progressive longer waits between pokes by using exponential backoff algorithm
HighWaterMarkSensor( target_snapshot_info: sqlmesh.core.snapshot.definition.SnapshotTableInfo, this_snapshot: sqlmesh.core.snapshot.definition.Snapshot, poke_interval: float = 60.0, timeout: float = 604800.0, mode: str = 'reschedule', **kwargs: Any)
18    def __init__(
19        self,
20        target_snapshot_info: SnapshotTableInfo,
21        this_snapshot: Snapshot,
22        poke_interval: float = 60.0,
23        timeout: float = 7.0 * 24.0 * 60.0 * 60.0,  # 7 days
24        mode: str = "reschedule",
25        **kwargs: t.Any,
26    ) -> None:
27        super().__init__(
28            poke_interval=poke_interval,
29            timeout=timeout,
30            mode=mode,
31            **kwargs,
32        )
33        self.target_snapshot_info = target_snapshot_info
34        self.this_snapshot = this_snapshot
def poke(self, context: airflow.utils.context.Context) -> bool:
36    def poke(self, context: Context) -> bool:
37        dag_run = context["dag_run"]
38
39        target_snapshot = self._get_target_snapshot()
40        if target_snapshot.intervals:
41            current_high_water_mark = to_datetime(target_snapshot.intervals[-1][1])
42        else:
43            current_high_water_mark = None
44
45        target_high_water_mark = self._compute_target_high_water_mark(dag_run, target_snapshot)
46
47        logger.info(
48            "The current high water mark for snapshot %s is '%s' (target is '%s')",
49            self.target_snapshot_info.snapshot_id,
50            current_high_water_mark,
51            target_high_water_mark,
52        )
53        if current_high_water_mark is not None:
54            return current_high_water_mark >= target_high_water_mark
55        return False

Function that the sensors defined while deriving this class should override.

Inherited Members
airflow.sensors.base.BaseSensorOperator
deps
is_smart_sensor_compatible
register_in_sensor_service
get_poke_context
get_execution_context
execute
prepare_for_execution
reschedule
get_serialized_fields
airflow.models.baseoperator.BaseOperator
partial
task_group
add_inlets
add_outlets
get_inlet_defs
get_outlet_defs
get_dag
dag
has_dag
set_xcomargs_dependencies
pre_execute
post_execute
on_kill
render_template_fields
clear
get_task_instances
run
dry_run
get_direct_relatives
task_type
roots
leaves
output
xcom_push
xcom_pull
serialize_for_task_group
inherits_from_empty_operator
defer
validate_mapped_arguments
unmap
airflow.models.abstractoperator.AbstractOperator
dag_id
get_template_env
prepare_template
resolve_template_files
get_direct_relative_ids
get_flat_relative_ids
get_flat_relatives
priority_weight_total
render_template
airflow.models.skipmixin.SkipMixin
skip
skip_all_except
airflow.utils.log.logging_mixin.LoggingMixin
log
airflow.models.taskmixin.DAGNode
set_downstream
set_upstream
downstream_list
upstream_list
iter_mapped_dependants
airflow.models.taskmixin.DependencyMixin
update_relative