Edit on GitHub

sqlmesh.core.scheduler

  1from __future__ import annotations
  2
  3import logging
  4import typing as t
  5from datetime import datetime
  6
  7from sqlmesh.core.console import Console, get_console
  8from sqlmesh.core.snapshot import (
  9    Snapshot,
 10    SnapshotEvaluator,
 11    SnapshotId,
 12    SnapshotIdLike,
 13)
 14from sqlmesh.core.state_sync import StateSync
 15from sqlmesh.utils import format_exception
 16from sqlmesh.utils.concurrency import concurrent_apply_to_dag
 17from sqlmesh.utils.dag import DAG
 18from sqlmesh.utils.date import (
 19    TimeLike,
 20    now,
 21    to_datetime,
 22    validate_date_range,
 23    yesterday,
 24)
 25
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# A single (start, end) datetime pair.
Interval = t.Tuple[datetime, datetime]
# A list of intervals belonging to one snapshot.
Batch = t.List[Interval]
# Maps each snapshot to the intervals that should be evaluated for it.
SnapshotToBatches = t.Dict[Snapshot, Batch]
# Node type of the evaluation DAG: one snapshot paired with one interval.
SchedulingUnit = t.Tuple[Snapshot, Interval]
 31
 32
 33class Scheduler:
 34    """Schedules and manages the evaluation of snapshots.
 35
 36    The scheduler evaluates multiple snapshots with date intervals in the correct
 37    topological order. It consults the state sync to understand what intervals for each
 38    snapshot needs to be backfilled.
 39
 40    The scheduler comes equipped with a simple ThreadPoolExecutor based evaluation engine.
 41
 42    Args:
 43        snapshots: A collection of snapshots.
 44        snapshot_evaluator: The snapshot evaluator to execute queries.
 45        state_sync: The state sync to pull saved snapshots.
 46        max_workers: The maximum number of parallel queries to run.
 47        console: The rich instance used for printing scheduling information.
 48    """
 49
 50    def __init__(
 51        self,
 52        snapshots: t.Iterable[Snapshot],
 53        snapshot_evaluator: SnapshotEvaluator,
 54        state_sync: StateSync,
 55        max_workers: int = 1,
 56        console: t.Optional[Console] = None,
 57    ):
 58        self.snapshots = {s.snapshot_id: s for s in snapshots}
 59        self.snapshot_per_version = _resolve_one_snapshot_per_version(snapshots)
 60        self.snapshot_evaluator = snapshot_evaluator
 61        self.state_sync = state_sync
 62        self.max_workers = max_workers
 63        self.console: Console = console or get_console()
 64
 65    def batches(
 66        self,
 67        start: t.Optional[TimeLike] = None,
 68        end: t.Optional[TimeLike] = None,
 69        latest: t.Optional[TimeLike] = None,
 70        is_dev: bool = False,
 71    ) -> SnapshotToBatches:
 72        """Returns a list of snapshot batches to evaluate.
 73
 74        Args:
 75            start: The start of the run. Defaults to the min model start date.
 76            end: The end of the run. Defaults to now.
 77            latest: The latest datetime to use for non-incremental queries.
 78            is_dev: Indicates whether the evaluation happens in the development mode and temporary
 79                tables / table clones should be used where applicable.
 80        """
 81        validate_date_range(start, end)
 82
 83        return self._interval_params(
 84            self.snapshot_per_version.values(),
 85            start,
 86            end,
 87            latest,
 88            is_dev=is_dev,
 89        )
 90
 91    def evaluate(
 92        self,
 93        snapshot: Snapshot,
 94        start: TimeLike,
 95        end: TimeLike,
 96        latest: TimeLike,
 97        is_dev: bool = False,
 98        **kwargs: t.Any,
 99    ) -> None:
100        """Evaluate a snapshot and add the processed interval to the state sync.
101
102        Args:
103            snapshot: Snapshot to evaluate.
104            start: The start datetime to render.
105            end: The end datetime to render.
106            latest: The latest datetime to use for non-incremental queries.
107            is_dev: Indicates whether the evaluation happens in the development mode and temporary
108                tables / table clones should be used where applicable.
109            kwargs: Additional kwargs to pass to the renderer.
110        """
111        validate_date_range(start, end)
112
113        snapshots = {
114            **{p_sid.name: self.snapshots[p_sid] for p_sid in snapshot.parents},
115            snapshot.name: snapshot,
116        }
117
118        self.snapshot_evaluator.evaluate(
119            snapshot,
120            start,
121            end,
122            latest,
123            snapshots=snapshots,
124            is_dev=is_dev,
125            **kwargs,
126        )
127        self.snapshot_evaluator.audit(
128            snapshot=snapshot,
129            start=start,
130            end=end,
131            latest=latest,
132            snapshots=snapshots,
133            is_dev=is_dev,
134            **kwargs,
135        )
136        self.state_sync.add_interval(snapshot.snapshot_id, start, end, is_dev=is_dev)
137        self.console.update_snapshot_progress(snapshot.name, 1)
138
139    def run(
140        self,
141        start: t.Optional[TimeLike] = None,
142        end: t.Optional[TimeLike] = None,
143        latest: t.Optional[TimeLike] = None,
144        is_dev: bool = False,
145    ) -> bool:
146        """Concurrently runs all snapshots in topological order.
147
148        Args:
149            start: The start of the run. Defaults to the min model start date.
150            end: The end of the run. Defaults to now.
151            latest: The latest datetime to use for non-incremental queries.
152            is_dev: Indicates whether the evaluation happens in the development mode and temporary
153                tables / table clones should be used where applicable.
154
155        Returns:
156            True if the execution was successful and False otherwise.
157        """
158        validate_date_range(start, end)
159
160        latest = latest or now()
161        batches = self.batches(start, end, latest, is_dev=is_dev)
162        dag = self._dag(batches)
163
164        visited = set()
165        for snapshot, _ in dag.sorted():
166            if snapshot in visited:
167                continue
168            visited.add(snapshot)
169            intervals = batches[snapshot]
170            self.console.start_snapshot_progress(snapshot.name, len(intervals))
171
172        def evaluate_node(node: SchedulingUnit) -> None:
173            assert latest
174            snapshot, (start, end) = node
175            self.evaluate(snapshot, start, end, latest, is_dev=is_dev)
176
177        with self.snapshot_evaluator.concurrent_context():
178            errors, skipped_intervals = concurrent_apply_to_dag(
179                dag,
180                evaluate_node,
181                self.max_workers,
182                raise_on_error=False,
183            )
184
185        self.console.stop_snapshot_progress(success=not errors)
186
187        for error in errors:
188            sid = error.node[0]
189            formatted_exception = "".join(format_exception(error.__cause__ or error))
190            self.console.log_error(f"FAILED processing snapshot {sid}\n{formatted_exception}")
191
192        skipped_snapshots = {i[0] for i in skipped_intervals}
193        for skipped in skipped_snapshots:
194            self.console.log_status_update(f"SKIPPED snapshot {skipped}\n")
195
196        return not errors
197
198    def _interval_params(
199        self,
200        snapshots: t.Iterable[Snapshot],
201        start: t.Optional[TimeLike] = None,
202        end: t.Optional[TimeLike] = None,
203        latest: t.Optional[TimeLike] = None,
204        is_dev: bool = False,
205    ) -> SnapshotToBatches:
206        """Find the optimal date interval paramaters based on what needs processing and maximal batch size.
207
208        For each model name, find all dependencies and look for a stored snapshot from the metastore. If a snapshot is found,
209        calculate the missing intervals that need to be processed given the passed in start and end intervals.
210
211        If a snapshot's model specifies a batch size, consecutive intervals are merged into batches of a size that is less than
212        or equal to the configured one. If no batch size is specified, then it uses the intervals that correspond to the model's cron expression.
213        For example, if a model is supposed to run daily and has 70 days to backfill with a batch size set to 30, there would be 2 jobs
214        with 30 days and 1 job with 10.
215
216        Args:
217            snapshots: The list of snapshots.
218            start: Start of the interval.
219            end: End of the interval.
220            latest: The latest datetime to use for non-incremental queries.
221            is_dev: Indicates whether the evaluation happens in the development mode.
222
223        Returns:
224            A list of tuples containing all snapshots needing to be run with their associated interval params.
225        """
226        all_snapshots = {s.snapshot_id: s for s in self.snapshots.values()}
227
228        # When in development mode only consider intervals of the current forward-only snapshot and ignore
229        # intervals of all snapshots with the same version that came before it.
230        same_version_snapshots = (
231            [s for s in snapshots if not s.is_forward_only or not s.is_paused]
232            if is_dev
233            else snapshots
234        )
235        stored_snapshots = self.state_sync.get_snapshots_with_same_version(same_version_snapshots)
236        all_snapshots.update({s.snapshot_id: s for s in stored_snapshots})
237
238        return compute_interval_params(
239            snapshots,
240            snapshots=all_snapshots,
241            start=start or earliest_start_date(snapshots),
242            end=end or now(),
243            latest=latest or now(),
244        )
245
246    def _dag(self, batches: SnapshotToBatches) -> DAG[SchedulingUnit]:
247        """Builds a DAG of snapshot intervals to be evaluated.
248
249        Args:
250            batches: The batches of snapshots and intervals to evaluate.
251
252        Returns:
253            A DAG of snapshot intervals to be evaluated.
254        """
255
256        intervals_per_snapshot_version = {
257            (snapshot.name, snapshot.version_get_or_generate()): intervals
258            for snapshot, intervals in batches.items()
259        }
260
261        dag = DAG[SchedulingUnit]()
262        for snapshot, intervals in batches.items():
263            if not intervals:
264                continue
265            upstream_dependencies = [
266                (self.snapshots[p_sid], interval)
267                for p_sid in snapshot.parents
268                if p_sid in self.snapshots
269                for interval in intervals_per_snapshot_version.get(
270                    (
271                        self.snapshots[p_sid].name,
272                        self.snapshots[p_sid].version_get_or_generate(),
273                    ),
274                    [],
275                )
276            ]
277            for i, interval in enumerate(intervals):
278                dag.add((snapshot, interval), upstream_dependencies)
279                if snapshot.is_incremental_by_unique_key_kind:
280                    dag.add(
281                        (snapshot, interval),
282                        [(snapshot, _interval) for _interval in intervals[:i]],
283                    )
284
285        return dag
286
287
def compute_interval_params(
    target: t.Iterable[SnapshotIdLike],
    *,
    snapshots: t.Dict[SnapshotId, Snapshot],
    start: TimeLike,
    end: TimeLike,
    latest: TimeLike,
) -> SnapshotToBatches:
    """Find the optimal date interval parameters based on what needs processing and maximal batch size.

    For each model name, find all dependencies and look for a stored snapshot from the metastore. If a snapshot is found,
    calculate the missing intervals that need to be processed given the passed in start and end intervals.

    If a snapshot's model specifies a batch size, consecutive intervals are merged into batches of a size that is less than
    or equal to the configured one. If no batch size is specified, then it uses the intervals that correspond to the model's cron expression.
    For example, if a model is supposed to run daily and has 70 days to backfill with a batch size set to 30, there would be 2 jobs
    with 30 days and 1 job with 10.

    Args:
        target: A set of target snapshots for which intervals should be computed.
        snapshots: A catalog of all available snapshots (including the target ones).
        start: Start of the interval.
        end: End of the interval.
        latest: The latest datetime to use for non-incremental queries.

    Returns:
        A dict containing all snapshots needing to be run with their associated interval params.
    """
    start_dt = to_datetime(start)

    # Build the id-keyed lookup once. Handing `snapshots.values()` to `start_date`
    # makes it rebuild this very dict on every iteration of the loop below.
    snapshots_by_id = {s.snapshot_id: s for s in snapshots.values()}

    snapshots_to_batches = {}

    for snapshot in Snapshot.merge_snapshots(target, snapshots):
        # Never go further back than the requested start of the run.
        model_start_dt = max(start_date(snapshot, snapshots_by_id) or start_dt, start_dt)
        snapshots_to_batches[snapshot] = [
            (to_datetime(s), to_datetime(e))
            for s, e in snapshot.missing_intervals(model_start_dt, end, latest)
        ]

    return _batched_intervals(snapshots_to_batches)
328
329
def start_date(
    snapshot: Snapshot, snapshots: t.Dict[SnapshotId, Snapshot] | t.Iterable[Snapshot]
) -> t.Optional[datetime]:
    """Resolve the effective start date of a snapshot.

    A snapshot whose model declares a start date uses it directly. Otherwise the
    start date is inferred recursively as the earliest start date among those of
    its parents that are present in the catalog.

    Args:
        snapshot: The snapshot whose start date should be resolved.
        snapshots: The available snapshots, either keyed by snapshot ID or as a
            plain iterable of snapshots.

    Returns:
        The resolved start datetime, or None when neither the snapshot nor any of
        its known ancestors defines one.
    """
    declared_start = snapshot.model.start
    if declared_start:
        return to_datetime(declared_start)

    catalog = (
        snapshots
        if isinstance(snapshots, dict)
        else {candidate.snapshot_id: candidate for candidate in snapshots}
    )

    # Parents missing from the catalog are ignored, as are parents without a start date.
    parent_starts = [
        start_date(catalog[parent_id], catalog)
        for parent_id in snapshot.parents
        if parent_id in catalog
    ]
    known_starts = [parent_start for parent_start in parent_starts if parent_start]

    return min(known_starts) if known_starts else None
365
366
def earliest_start_date(snapshots: t.Iterable[Snapshot]) -> datetime:
    """Get the earliest start date from a collection of snapshots.

    Args:
        snapshots: Snapshots to find earliest start date.

    Returns:
        The earliest start date, where snapshots without a resolvable start date
        count as yesterday. Yesterday is also returned for an empty collection.
    """
    snapshots = list(snapshots)
    if not snapshots:
        return yesterday()

    # Build the id-keyed lookup once; passing the plain list would make
    # `start_date` rebuild the same dict for every snapshot it is asked about.
    snapshots_by_id = {s.snapshot_id: s for s in snapshots}
    return min(start_date(snapshot, snapshots_by_id) or yesterday() for snapshot in snapshots)
378
379
def _batched_intervals(params: SnapshotToBatches) -> SnapshotToBatches:
    """Merge consecutive intervals of every snapshot into batches.

    Adjacent intervals (the start of one equals the end of the previous one) are
    collapsed into a single (start, end) pair. A batch is closed as soon as it
    holds `model.batch_size` intervals (when a batch size is set) or when the next
    interval does not directly follow the previous one.

    Args:
        params: The unbatched intervals of every snapshot.

    Returns:
        A dict that maps every snapshot to its merged intervals.
    """

    def _span(group):
        # Collapse a run of intervals into one pair: first start, last end.
        return group[0][0], group[-1][-1]

    merged_per_snapshot = {}

    for snapshot, intervals in params.items():
        limit = snapshot.model.batch_size
        merged = []
        pending = []
        for current in intervals:
            reached_limit = limit and len(pending) >= limit
            if reached_limit or (pending and current[0] != pending[-1][-1]):
                merged.append(_span(pending))
                pending = []
            pending.append(current)
        if pending:
            merged.append(_span(pending))
        merged_per_snapshot[snapshot] = merged

    return merged_per_snapshot
399
400
def _resolve_one_snapshot_per_version(
    snapshots: t.Iterable[Snapshot],
) -> t.Dict[t.Tuple[str, str], Snapshot]:
    """Pick a single snapshot for every (name, version) pair.

    The first snapshot seen for a pair is kept unless a later one has a set
    `unpaused_ts` and either the kept one has none or the later one was created
    more recently.

    Args:
        snapshots: The snapshots to choose from.

    Returns:
        A dict that maps each (name, version) pair to the chosen snapshot.
    """
    chosen: t.Dict[t.Tuple[str, str], Snapshot] = {}

    for candidate in snapshots:
        version_key = (candidate.name, candidate.version_get_or_generate())

        if version_key not in chosen:
            chosen[version_key] = candidate
            continue

        # A candidate without an unpaused timestamp never replaces the current pick.
        if not candidate.unpaused_ts:
            continue

        current = chosen[version_key]
        if not current.unpaused_ts or candidate.created_ts > current.created_ts:
            chosen[version_key] = candidate

    return chosen
class Scheduler:
 34class Scheduler:
 35    """Schedules and manages the evaluation of snapshots.
 36
 37    The scheduler evaluates multiple snapshots with date intervals in the correct
 38    topological order. It consults the state sync to understand what intervals for each
 39    snapshot needs to be backfilled.
 40
 41    The scheduler comes equipped with a simple ThreadPoolExecutor based evaluation engine.
 42
 43    Args:
 44        snapshots: A collection of snapshots.
 45        snapshot_evaluator: The snapshot evaluator to execute queries.
 46        state_sync: The state sync to pull saved snapshots.
 47        max_workers: The maximum number of parallel queries to run.
 48        console: The rich instance used for printing scheduling information.
 49    """
 50
 51    def __init__(
 52        self,
 53        snapshots: t.Iterable[Snapshot],
 54        snapshot_evaluator: SnapshotEvaluator,
 55        state_sync: StateSync,
 56        max_workers: int = 1,
 57        console: t.Optional[Console] = None,
 58    ):
 59        self.snapshots = {s.snapshot_id: s for s in snapshots}
 60        self.snapshot_per_version = _resolve_one_snapshot_per_version(snapshots)
 61        self.snapshot_evaluator = snapshot_evaluator
 62        self.state_sync = state_sync
 63        self.max_workers = max_workers
 64        self.console: Console = console or get_console()
 65
 66    def batches(
 67        self,
 68        start: t.Optional[TimeLike] = None,
 69        end: t.Optional[TimeLike] = None,
 70        latest: t.Optional[TimeLike] = None,
 71        is_dev: bool = False,
 72    ) -> SnapshotToBatches:
 73        """Returns a list of snapshot batches to evaluate.
 74
 75        Args:
 76            start: The start of the run. Defaults to the min model start date.
 77            end: The end of the run. Defaults to now.
 78            latest: The latest datetime to use for non-incremental queries.
 79            is_dev: Indicates whether the evaluation happens in the development mode and temporary
 80                tables / table clones should be used where applicable.
 81        """
 82        validate_date_range(start, end)
 83
 84        return self._interval_params(
 85            self.snapshot_per_version.values(),
 86            start,
 87            end,
 88            latest,
 89            is_dev=is_dev,
 90        )
 91
 92    def evaluate(
 93        self,
 94        snapshot: Snapshot,
 95        start: TimeLike,
 96        end: TimeLike,
 97        latest: TimeLike,
 98        is_dev: bool = False,
 99        **kwargs: t.Any,
100    ) -> None:
101        """Evaluate a snapshot and add the processed interval to the state sync.
102
103        Args:
104            snapshot: Snapshot to evaluate.
105            start: The start datetime to render.
106            end: The end datetime to render.
107            latest: The latest datetime to use for non-incremental queries.
108            is_dev: Indicates whether the evaluation happens in the development mode and temporary
109                tables / table clones should be used where applicable.
110            kwargs: Additional kwargs to pass to the renderer.
111        """
112        validate_date_range(start, end)
113
114        snapshots = {
115            **{p_sid.name: self.snapshots[p_sid] for p_sid in snapshot.parents},
116            snapshot.name: snapshot,
117        }
118
119        self.snapshot_evaluator.evaluate(
120            snapshot,
121            start,
122            end,
123            latest,
124            snapshots=snapshots,
125            is_dev=is_dev,
126            **kwargs,
127        )
128        self.snapshot_evaluator.audit(
129            snapshot=snapshot,
130            start=start,
131            end=end,
132            latest=latest,
133            snapshots=snapshots,
134            is_dev=is_dev,
135            **kwargs,
136        )
137        self.state_sync.add_interval(snapshot.snapshot_id, start, end, is_dev=is_dev)
138        self.console.update_snapshot_progress(snapshot.name, 1)
139
140    def run(
141        self,
142        start: t.Optional[TimeLike] = None,
143        end: t.Optional[TimeLike] = None,
144        latest: t.Optional[TimeLike] = None,
145        is_dev: bool = False,
146    ) -> bool:
147        """Concurrently runs all snapshots in topological order.
148
149        Args:
150            start: The start of the run. Defaults to the min model start date.
151            end: The end of the run. Defaults to now.
152            latest: The latest datetime to use for non-incremental queries.
153            is_dev: Indicates whether the evaluation happens in the development mode and temporary
154                tables / table clones should be used where applicable.
155
156        Returns:
157            True if the execution was successful and False otherwise.
158        """
159        validate_date_range(start, end)
160
161        latest = latest or now()
162        batches = self.batches(start, end, latest, is_dev=is_dev)
163        dag = self._dag(batches)
164
165        visited = set()
166        for snapshot, _ in dag.sorted():
167            if snapshot in visited:
168                continue
169            visited.add(snapshot)
170            intervals = batches[snapshot]
171            self.console.start_snapshot_progress(snapshot.name, len(intervals))
172
173        def evaluate_node(node: SchedulingUnit) -> None:
174            assert latest
175            snapshot, (start, end) = node
176            self.evaluate(snapshot, start, end, latest, is_dev=is_dev)
177
178        with self.snapshot_evaluator.concurrent_context():
179            errors, skipped_intervals = concurrent_apply_to_dag(
180                dag,
181                evaluate_node,
182                self.max_workers,
183                raise_on_error=False,
184            )
185
186        self.console.stop_snapshot_progress(success=not errors)
187
188        for error in errors:
189            sid = error.node[0]
190            formatted_exception = "".join(format_exception(error.__cause__ or error))
191            self.console.log_error(f"FAILED processing snapshot {sid}\n{formatted_exception}")
192
193        skipped_snapshots = {i[0] for i in skipped_intervals}
194        for skipped in skipped_snapshots:
195            self.console.log_status_update(f"SKIPPED snapshot {skipped}\n")
196
197        return not errors
198
199    def _interval_params(
200        self,
201        snapshots: t.Iterable[Snapshot],
202        start: t.Optional[TimeLike] = None,
203        end: t.Optional[TimeLike] = None,
204        latest: t.Optional[TimeLike] = None,
205        is_dev: bool = False,
206    ) -> SnapshotToBatches:
207        """Find the optimal date interval paramaters based on what needs processing and maximal batch size.
208
209        For each model name, find all dependencies and look for a stored snapshot from the metastore. If a snapshot is found,
210        calculate the missing intervals that need to be processed given the passed in start and end intervals.
211
212        If a snapshot's model specifies a batch size, consecutive intervals are merged into batches of a size that is less than
213        or equal to the configured one. If no batch size is specified, then it uses the intervals that correspond to the model's cron expression.
214        For example, if a model is supposed to run daily and has 70 days to backfill with a batch size set to 30, there would be 2 jobs
215        with 30 days and 1 job with 10.
216
217        Args:
218            snapshots: The list of snapshots.
219            start: Start of the interval.
220            end: End of the interval.
221            latest: The latest datetime to use for non-incremental queries.
222            is_dev: Indicates whether the evaluation happens in the development mode.
223
224        Returns:
225            A list of tuples containing all snapshots needing to be run with their associated interval params.
226        """
227        all_snapshots = {s.snapshot_id: s for s in self.snapshots.values()}
228
229        # When in development mode only consider intervals of the current forward-only snapshot and ignore
230        # intervals of all snapshots with the same version that came before it.
231        same_version_snapshots = (
232            [s for s in snapshots if not s.is_forward_only or not s.is_paused]
233            if is_dev
234            else snapshots
235        )
236        stored_snapshots = self.state_sync.get_snapshots_with_same_version(same_version_snapshots)
237        all_snapshots.update({s.snapshot_id: s for s in stored_snapshots})
238
239        return compute_interval_params(
240            snapshots,
241            snapshots=all_snapshots,
242            start=start or earliest_start_date(snapshots),
243            end=end or now(),
244            latest=latest or now(),
245        )
246
247    def _dag(self, batches: SnapshotToBatches) -> DAG[SchedulingUnit]:
248        """Builds a DAG of snapshot intervals to be evaluated.
249
250        Args:
251            batches: The batches of snapshots and intervals to evaluate.
252
253        Returns:
254            A DAG of snapshot intervals to be evaluated.
255        """
256
257        intervals_per_snapshot_version = {
258            (snapshot.name, snapshot.version_get_or_generate()): intervals
259            for snapshot, intervals in batches.items()
260        }
261
262        dag = DAG[SchedulingUnit]()
263        for snapshot, intervals in batches.items():
264            if not intervals:
265                continue
266            upstream_dependencies = [
267                (self.snapshots[p_sid], interval)
268                for p_sid in snapshot.parents
269                if p_sid in self.snapshots
270                for interval in intervals_per_snapshot_version.get(
271                    (
272                        self.snapshots[p_sid].name,
273                        self.snapshots[p_sid].version_get_or_generate(),
274                    ),
275                    [],
276                )
277            ]
278            for i, interval in enumerate(intervals):
279                dag.add((snapshot, interval), upstream_dependencies)
280                if snapshot.is_incremental_by_unique_key_kind:
281                    dag.add(
282                        (snapshot, interval),
283                        [(snapshot, _interval) for _interval in intervals[:i]],
284                    )
285
286        return dag

Schedules and manages the evaluation of snapshots.

The scheduler evaluates multiple snapshots with date intervals in the correct topological order. It consults the state sync to understand which intervals of each snapshot need to be backfilled.

The scheduler comes equipped with a simple ThreadPoolExecutor based evaluation engine.

Arguments:
  • snapshots: A collection of snapshots.
  • snapshot_evaluator: The snapshot evaluator to execute queries.
  • state_sync: The state sync to pull saved snapshots.
  • max_workers: The maximum number of parallel queries to run.
  • console: The rich instance used for printing scheduling information.
Scheduler( snapshots: Iterable[sqlmesh.core.snapshot.definition.Snapshot], snapshot_evaluator: sqlmesh.core.snapshot.evaluator.SnapshotEvaluator, state_sync: sqlmesh.core.state_sync.base.StateSync, max_workers: int = 1, console: Optional[sqlmesh.core.console.Console] = None)
51    def __init__(
52        self,
53        snapshots: t.Iterable[Snapshot],
54        snapshot_evaluator: SnapshotEvaluator,
55        state_sync: StateSync,
56        max_workers: int = 1,
57        console: t.Optional[Console] = None,
58    ):
59        self.snapshots = {s.snapshot_id: s for s in snapshots}
60        self.snapshot_per_version = _resolve_one_snapshot_per_version(snapshots)
61        self.snapshot_evaluator = snapshot_evaluator
62        self.state_sync = state_sync
63        self.max_workers = max_workers
64        self.console: Console = console or get_console()
def batches( self, start: Union[datetime.date, datetime.datetime, str, int, float, NoneType] = None, end: Union[datetime.date, datetime.datetime, str, int, float, NoneType] = None, latest: Union[datetime.date, datetime.datetime, str, int, float, NoneType] = None, is_dev: bool = False) -> Dict[sqlmesh.core.snapshot.definition.Snapshot, List[Tuple[datetime.datetime, datetime.datetime]]]:
66    def batches(
67        self,
68        start: t.Optional[TimeLike] = None,
69        end: t.Optional[TimeLike] = None,
70        latest: t.Optional[TimeLike] = None,
71        is_dev: bool = False,
72    ) -> SnapshotToBatches:
73        """Returns a list of snapshot batches to evaluate.
74
75        Args:
76            start: The start of the run. Defaults to the min model start date.
77            end: The end of the run. Defaults to now.
78            latest: The latest datetime to use for non-incremental queries.
79            is_dev: Indicates whether the evaluation happens in the development mode and temporary
80                tables / table clones should be used where applicable.
81        """
82        validate_date_range(start, end)
83
84        return self._interval_params(
85            self.snapshot_per_version.values(),
86            start,
87            end,
88            latest,
89            is_dev=is_dev,
90        )

Returns the batches of intervals to evaluate, keyed by snapshot.

Arguments:
  • start: The start of the run. Defaults to the min model start date.
  • end: The end of the run. Defaults to now.
  • latest: The latest datetime to use for non-incremental queries.
  • is_dev: Indicates whether the evaluation happens in the development mode and temporary tables / table clones should be used where applicable.
def evaluate( self, snapshot: sqlmesh.core.snapshot.definition.Snapshot, start: Union[datetime.date, datetime.datetime, str, int, float], end: Union[datetime.date, datetime.datetime, str, int, float], latest: Union[datetime.date, datetime.datetime, str, int, float], is_dev: bool = False, **kwargs: Any) -> None:
 92    def evaluate(
 93        self,
 94        snapshot: Snapshot,
 95        start: TimeLike,
 96        end: TimeLike,
 97        latest: TimeLike,
 98        is_dev: bool = False,
 99        **kwargs: t.Any,
100    ) -> None:
101        """Evaluate a snapshot and add the processed interval to the state sync.
102
103        Args:
104            snapshot: Snapshot to evaluate.
105            start: The start datetime to render.
106            end: The end datetime to render.
107            latest: The latest datetime to use for non-incremental queries.
108            is_dev: Indicates whether the evaluation happens in the development mode and temporary
109                tables / table clones should be used where applicable.
110            kwargs: Additional kwargs to pass to the renderer.
111        """
112        validate_date_range(start, end)
113
114        snapshots = {
115            **{p_sid.name: self.snapshots[p_sid] for p_sid in snapshot.parents},
116            snapshot.name: snapshot,
117        }
118
119        self.snapshot_evaluator.evaluate(
120            snapshot,
121            start,
122            end,
123            latest,
124            snapshots=snapshots,
125            is_dev=is_dev,
126            **kwargs,
127        )
128        self.snapshot_evaluator.audit(
129            snapshot=snapshot,
130            start=start,
131            end=end,
132            latest=latest,
133            snapshots=snapshots,
134            is_dev=is_dev,
135            **kwargs,
136        )
137        self.state_sync.add_interval(snapshot.snapshot_id, start, end, is_dev=is_dev)
138        self.console.update_snapshot_progress(snapshot.name, 1)

Evaluate a snapshot and add the processed interval to the state sync.

Arguments:
  • snapshot: Snapshot to evaluate.
  • start: The start datetime to render.
  • end: The end datetime to render.
  • latest: The latest datetime to use for non-incremental queries.
  • is_dev: Indicates whether the evaluation happens in the development mode and temporary tables / table clones should be used where applicable.
  • kwargs: Additional kwargs to pass to the renderer.
def run( self, start: Union[datetime.date, datetime.datetime, str, int, float, NoneType] = None, end: Union[datetime.date, datetime.datetime, str, int, float, NoneType] = None, latest: Union[datetime.date, datetime.datetime, str, int, float, NoneType] = None, is_dev: bool = False) -> bool:
140    def run(
141        self,
142        start: t.Optional[TimeLike] = None,
143        end: t.Optional[TimeLike] = None,
144        latest: t.Optional[TimeLike] = None,
145        is_dev: bool = False,
146    ) -> bool:
147        """Concurrently runs all snapshots in topological order.
148
149        Args:
150            start: The start of the run. Defaults to the min model start date.
151            end: The end of the run. Defaults to now.
152            latest: The latest datetime to use for non-incremental queries.
153            is_dev: Indicates whether the evaluation happens in the development mode and temporary
154                tables / table clones should be used where applicable.
155
156        Returns:
157            True if the execution was successful and False otherwise.
158        """
159        validate_date_range(start, end)
160
161        latest = latest or now()
162        batches = self.batches(start, end, latest, is_dev=is_dev)
163        dag = self._dag(batches)
164
165        visited = set()
166        for snapshot, _ in dag.sorted():
167            if snapshot in visited:
168                continue
169            visited.add(snapshot)
170            intervals = batches[snapshot]
171            self.console.start_snapshot_progress(snapshot.name, len(intervals))
172
173        def evaluate_node(node: SchedulingUnit) -> None:
174            assert latest
175            snapshot, (start, end) = node
176            self.evaluate(snapshot, start, end, latest, is_dev=is_dev)
177
178        with self.snapshot_evaluator.concurrent_context():
179            errors, skipped_intervals = concurrent_apply_to_dag(
180                dag,
181                evaluate_node,
182                self.max_workers,
183                raise_on_error=False,
184            )
185
186        self.console.stop_snapshot_progress(success=not errors)
187
188        for error in errors:
189            sid = error.node[0]
190            formatted_exception = "".join(format_exception(error.__cause__ or error))
191            self.console.log_error(f"FAILED processing snapshot {sid}\n{formatted_exception}")
192
193        skipped_snapshots = {i[0] for i in skipped_intervals}
194        for skipped in skipped_snapshots:
195            self.console.log_status_update(f"SKIPPED snapshot {skipped}\n")
196
197        return not errors

Concurrently runs all snapshots in topological order.

Arguments:
  • start: The start of the run. Defaults to the min model start date.
  • end: The end of the run. Defaults to now.
  • latest: The latest datetime to use for non-incremental queries.
  • is_dev: Indicates whether the evaluation happens in the development mode and temporary tables / table clones should be used where applicable.
Returns:

True if the execution was successful and False otherwise.

def compute_interval_params( target: Iterable[Union[sqlmesh.core.snapshot.definition.SnapshotId, sqlmesh.core.snapshot.definition.SnapshotTableInfo, sqlmesh.core.snapshot.definition.Snapshot]], *, snapshots: Dict[sqlmesh.core.snapshot.definition.SnapshotId, sqlmesh.core.snapshot.definition.Snapshot], start: Union[datetime.date, datetime.datetime, str, int, float], end: Union[datetime.date, datetime.datetime, str, int, float], latest: Union[datetime.date, datetime.datetime, str, int, float]) -> Dict[sqlmesh.core.snapshot.definition.Snapshot, List[Tuple[datetime.datetime, datetime.datetime]]]:
def compute_interval_params(
    target: t.Iterable[SnapshotIdLike],
    *,
    snapshots: t.Dict[SnapshotId, Snapshot],
    start: TimeLike,
    end: TimeLike,
    latest: TimeLike,
) -> SnapshotToBatches:
    """Find the optimal date interval parameters based on what needs processing and maximal batch size.

    Every target snapshot is merged with its stored counterpart and its missing intervals
    are computed for the requested range. The range never starts earlier than the passed in
    start, but it may start later when the snapshot's effective start date is later.

    If a snapshot's model specifies a batch size, consecutive intervals are merged into batches of a size that is less than
    or equal to the configured one. If no batch size is specified, then it uses the intervals that correspond to the model's
    cron expression. For example, if a model is supposed to run daily and has 70 days to backfill with a batch size set to 30,
    there would be 2 jobs with 30 days and 1 job with 10.

    Args:
        target: A set of target snapshots for which intervals should be computed.
        snapshots: A catalog of all available snapshots (including the target ones).
        start: Start of the interval.
        end: End of the interval.
        latest: The latest datetime to use for non-incremental queries.

    Returns:
        A dict containing all snapshots needing to be run with their associated interval params.
    """
    requested_start = to_datetime(start)
    pending: SnapshotToBatches = {}

    for snapshot in Snapshot.merge_snapshots(target, snapshots):
        inferred_start = start_date(snapshot, snapshots.values())
        # Clamp to the requested start so nothing before it is ever scheduled.
        effective_start = max(inferred_start or requested_start, requested_start)
        missing = snapshot.missing_intervals(effective_start, end, latest)
        pending[snapshot] = [(to_datetime(lo), to_datetime(hi)) for lo, hi in missing]

    return _batched_intervals(pending)

Find the optimal date interval parameters based on what needs processing and maximal batch size.

For each model name, find all dependencies and look for a stored snapshot from the metastore. If a snapshot is found, calculate the missing intervals that need to be processed given the passed in start and end intervals.

If a snapshot's model specifies a batch size, consecutive intervals are merged into batches of a size that is less than or equal to the configured one. If no batch size is specified, then it uses the intervals that correspond to the model's cron expression. For example, if a model is supposed to run daily and has 70 days to backfill with a batch size set to 30, there would be 2 jobs with 30 days and 1 job with 10.

Arguments:
  • target: A set of target snapshots for which intervals should be computed.
  • snapshots: A catalog of all available snapshots (including the target ones).
  • start: Start of the interval.
  • end: End of the interval.
  • latest: The latest datetime to use for non-incremental queries.
Returns:

A dict containing all snapshots needing to be run with their associated interval params.

def start_date( snapshot: sqlmesh.core.snapshot.definition.Snapshot, snapshots: Union[Dict[sqlmesh.core.snapshot.definition.SnapshotId, sqlmesh.core.snapshot.definition.Snapshot], Iterable[sqlmesh.core.snapshot.definition.Snapshot]]) -> Optional[datetime.datetime]:
def start_date(
    snapshot: Snapshot, snapshots: t.Dict[SnapshotId, Snapshot] | t.Iterable[Snapshot]
) -> t.Optional[datetime]:
    """Resolve the effective start date of a snapshot.

    A snapshot whose model declares a start uses that value. Otherwise the start is
    inferred recursively from its parents: the earliest start found among the parents
    present in the catalog wins.

    Args:
        snapshot: snapshot to infer start date.
        snapshots: a catalog of available snapshots, either keyed by snapshot id or as a
            plain iterable of snapshots.

    Returns:
        Start datetime object, or None if neither the snapshot nor any known parent has one.
    """
    own_start = snapshot.model.start
    if own_start:
        return to_datetime(own_start)

    # Normalise the catalog into an id -> snapshot mapping so parents can be looked up by id.
    if isinstance(snapshots, dict):
        catalog = snapshots
    else:
        catalog = {candidate.snapshot_id: candidate for candidate in snapshots}

    # Parents missing from the catalog are ignored, as are parents without a resolvable start.
    parent_starts = (
        start_date(catalog[parent_id], catalog)
        for parent_id in snapshot.parents
        if parent_id in catalog
    )
    return min((resolved for resolved in parent_starts if resolved), default=None)

Get the effective/inferred start date for a snapshot.

Not all snapshots define a start date. In those cases, the model's start date can be inferred from its parents' start dates.

Arguments:
  • snapshot: snapshot to infer start date.
  • snapshots: a catalog of available snapshots.
Returns:

Start datetime object.

def earliest_start_date( snapshots: Iterable[sqlmesh.core.snapshot.definition.Snapshot]) -> datetime.datetime:
def earliest_start_date(snapshots: t.Iterable[Snapshot]) -> datetime:
    """Get the earliest start date from a collection of snapshots.

    Args:
        snapshots: Snapshots to find earliest start date.

    Returns:
        The earliest start date or yesterday if none is found. Snapshots whose start
        date cannot be resolved count as starting yesterday.
    """
    snapshots = list(snapshots)
    if not snapshots:
        return yesterday()

    # Build the id -> snapshot catalog once. Given a plain iterable, `start_date` would
    # rebuild this same mapping for every snapshot that lacks an explicit start.
    catalog = {snapshot.snapshot_id: snapshot for snapshot in snapshots}
    return min(start_date(snapshot, catalog) or yesterday() for snapshot in snapshots)

Get the earliest start date from a collection of snapshots.

Arguments:
  • snapshots: Snapshots to find earliest start date.
Returns:

The earliest start date or yesterday if none is found.