Edit on GitHub

sqlmesh.core.plan.definition

  1from __future__ import annotations
  2
  3import typing as t
  4from collections import defaultdict
  5from enum import Enum
  6
  7from sqlmesh.core import scheduler
  8from sqlmesh.core.config import CategorizerConfig
  9from sqlmesh.core.context_diff import ContextDiff
 10from sqlmesh.core.environment import Environment
 11from sqlmesh.core.snapshot import (
 12    Intervals,
 13    Snapshot,
 14    SnapshotChangeCategory,
 15    SnapshotId,
 16    categorize_change,
 17    merge_intervals,
 18)
 19from sqlmesh.core.state_sync import StateReader
 20from sqlmesh.utils import random_id
 21from sqlmesh.utils.dag import DAG
 22from sqlmesh.utils.date import (
 23    TimeLike,
 24    make_inclusive,
 25    now,
 26    to_ds,
 27    to_timestamp,
 28    validate_date_range,
 29    yesterday_ds,
 30)
 31from sqlmesh.utils.errors import PlanError, SQLMeshError
 32from sqlmesh.utils.pydantic import PydanticModel
 33
# Maps a model name to a set of model names. In this module it is used to map
# each directly modified model to its indirectly modified downstream models.
SnapshotMapping = t.Dict[str, t.Set[str]]
 35
 36
 37class Plan:
 38    """Plan is the main class to represent user choices on how they want to backfill and version their models.
 39
 40    Args:
 41        context_diff: The context diff that the plan is based on.
 42        dag: The dag object to determine relationships.
 43        state_reader: The state_reader to get metadata with.
 44        start: The start time to backfill data.
 45        end: The end time to backfill data.
 46        apply: The callback to apply the plan.
 47        restate_models: A list of models for which the data should be restated for the time range
 48            specified in this plan. Note: models defined outside SQLMesh (external) won't be a part
 49            of the restatement.
 50        no_gaps:  Whether to ensure that new snapshots for models that are already a
 51            part of the target environment have no data gaps when compared against previous
 52            snapshots for same models.
 53        skip_backfill: Whether to skip the backfill step.
 54        is_dev: Whether this plan is for development purposes.
 55        forward_only: Whether the purpose of the plan is to make forward only changes.
 56        environment_ttl: The period of time that a development environment should exist before being deleted.
 57        categorizer_config: Auto categorization settings.
 58        auto_categorization_enabled: Whether to apply auto categorization.
 59    """
 60
 61    def __init__(
 62        self,
 63        context_diff: ContextDiff,
 64        dag: DAG,
 65        state_reader: StateReader,
 66        start: t.Optional[TimeLike] = None,
 67        end: t.Optional[TimeLike] = None,
 68        apply: t.Optional[t.Callable[[Plan], None]] = None,
 69        restate_models: t.Optional[t.Iterable[str]] = None,
 70        no_gaps: bool = False,
 71        skip_backfill: bool = False,
 72        is_dev: bool = False,
 73        forward_only: bool = False,
 74        environment_ttl: t.Optional[str] = None,
 75        categorizer_config: t.Optional[CategorizerConfig] = None,
 76        auto_categorization_enabled: bool = True,
 77    ):
 78        self.context_diff = context_diff
 79        self.override_start = start is not None
 80        self.override_end = end is not None
 81        self.plan_id: str = random_id()
 82        self.no_gaps = no_gaps
 83        self.skip_backfill = skip_backfill
 84        self.is_dev = is_dev
 85        self.forward_only = forward_only
 86        self.environment_ttl = environment_ttl
 87        self.categorizer_config = categorizer_config or CategorizerConfig()
 88        self.auto_categorization_enabled = auto_categorization_enabled
 89        self._start = start if start or not (is_dev and forward_only) else yesterday_ds()
 90        self._end = end if end or not is_dev else now()
 91        self._apply = apply
 92        self._dag = dag
 93        self._state_reader = state_reader
 94        self.__missing_intervals: t.Optional[t.Dict[str, Intervals]] = None
 95        self._restatements: t.Set[str] = set()
 96
 97        if restate_models and context_diff.new_snapshots:
 98            raise PlanError(
 99                "Model changes and restatements can't be a part of the same plan. "
100                "Revert or apply changes before proceeding with restatements."
101            )
102
103        if not restate_models and is_dev and forward_only:
104            # Add model names for new forward-only snapshots to the restatement list
105            # in order to compute previews.
106            restate_models = [
107                s.name for s in context_diff.new_snapshots.values() if s.is_materialized
108            ]
109
110        self._add_restatements(restate_models or [])
111
112        self._ensure_valid_date_range(self._start, self._end)
113        self._ensure_no_forward_only_revert()
114        self._ensure_no_forward_only_new_models()
115
116        directly_indirectly_modified = self._build_directly_and_indirectly_modified()
117        self.directly_modified = directly_indirectly_modified[0]
118        self.indirectly_modified = directly_indirectly_modified[1]
119
120        self._categorize_snapshots()
121
122        self._categorized: t.Optional[t.List[Snapshot]] = None
123        self._uncategorized: t.Optional[t.List[Snapshot]] = None
124
125    @property
126    def categorized(self) -> t.List[Snapshot]:
127        """Returns the already categorized snapshots."""
128        if self._categorized is None:
129            self._categorized = [s for s in self.directly_modified if s.version]
130        return self._categorized
131
132    @property
133    def uncategorized(self) -> t.List[Snapshot]:
134        """Returns the uncategorized snapshots."""
135        if self._uncategorized is None:
136            self._uncategorized = [s for s in self.directly_modified if not s.version]
137        return self._uncategorized
138
139    @property
140    def start(self) -> TimeLike:
141        """Returns the start of the plan or the earliest date of all snapshots."""
142        return self._start or (
143            min(
144                start
145                for intervals_per_model in self._missing_intervals.values()
146                for start, _ in intervals_per_model
147            )
148            if self._missing_intervals
149            else yesterday_ds()
150        )
151
152    @start.setter
153    def start(self, new_start: TimeLike) -> None:
154        self._ensure_valid_date_range(new_start, self._end)
155        self.set_start(new_start)
156
157    def set_start(self, new_start: TimeLike) -> None:
158        self._start = new_start
159        self.__missing_intervals = None
160
161    @property
162    def end(self) -> TimeLike:
163        """Returns the end of the plan or now."""
164        return self._end or now()
165
166    @end.setter
167    def end(self, new_end: TimeLike) -> None:
168        self._ensure_valid_date_range(self._start, new_end)
169        self._end = new_end
170        self.__missing_intervals = None
171
172    @property
173    def is_start_and_end_allowed(self) -> bool:
174        """Indicates whether this plan allows to set the start and end dates."""
175        return self.is_dev or bool(self.restatements)
176
177    @property
178    def requires_backfill(self) -> bool:
179        return not self.skip_backfill and (bool(self.restatements) or bool(self.missing_intervals))
180
181    @property
182    def missing_intervals(self) -> t.List[MissingIntervals]:
183        """Returns a list of missing intervals."""
184        return [
185            MissingIntervals(
186                snapshot_name=snapshot.name,
187                intervals=self._missing_intervals[snapshot.version_get_or_generate()],
188            )
189            for snapshot in self.snapshots
190            if snapshot.version_get_or_generate() in self._missing_intervals
191        ]
192
193    @property
194    def snapshots(self) -> t.List[Snapshot]:
195        """Gets all the snapshots in the plan/environment."""
196        return list(self.context_diff.snapshots.values())
197
198    @property
199    def new_snapshots(self) -> t.List[Snapshot]:
200        """Gets only new snapshots in the plan/environment."""
201        return list(self.context_diff.new_snapshots.values())
202
203    @property
204    def environment(self) -> Environment:
205        """The environment of the plan."""
206        expiration_ts = (
207            to_timestamp(self.environment_ttl, relative_base=now())
208            if self.is_dev and self.environment_ttl is not None
209            else None
210        )
211        return Environment(
212            name=self.context_diff.environment,
213            snapshots=[snapshot.table_info for snapshot in self.snapshots],
214            start_at=self.start,
215            end_at=self._end,
216            plan_id=self.plan_id,
217            previous_plan_id=self.context_diff.previous_plan_id,
218            expiration_ts=expiration_ts,
219        )
220
221    @property
222    def restatements(self) -> t.Set[str]:
223        return self._restatements
224
225    def is_new_snapshot(self, snapshot: Snapshot) -> bool:
226        """Returns True if the given snapshot is a new snapshot in this plan."""
227        return snapshot.snapshot_id in self.context_diff.new_snapshots
228
229    def apply(self) -> None:
230        """Runs apply if an apply function was passed in."""
231        if not self._apply:
232            raise SQLMeshError(f"Plan was not initialized with an applier.")
233        validate_date_range(self.start, self.end)
234        self._apply(self)
235
236    def set_choice(self, snapshot: Snapshot, choice: SnapshotChangeCategory) -> None:
237        """Sets a snapshot version based on the user choice.
238
239        Args:
240            snapshot: The target snapshot.
241            choice: The user decision on how to version the target snapshot and its children.
242        """
243        if self.forward_only:
244            raise PlanError("Choice setting is not supported by a forward-only plan.")
245        if not self.is_new_snapshot(snapshot):
246            raise SQLMeshError(
247                f"A choice can't be changed for the existing version of model '{snapshot.name}'."
248            )
249
250        snapshot.change_category = choice
251        if choice in (
252            SnapshotChangeCategory.BREAKING,
253            SnapshotChangeCategory.NON_BREAKING,
254        ):
255            snapshot.set_version()
256        else:
257            snapshot.set_version(snapshot.previous_version)
258
259        for child in self.indirectly_modified[snapshot.name]:
260            child_snapshot = self.context_diff.snapshots[child]
261
262            if choice == SnapshotChangeCategory.BREAKING:
263                child_snapshot.set_version()
264            else:
265                child_snapshot.set_version(child_snapshot.previous_version)
266            snapshot.indirect_versions[child] = child_snapshot.all_versions
267
268            # If any other snapshot specified breaking this child, then that child
269            # needs to be backfilled as a part of the plan.
270            for upstream in self.directly_modified:
271                if child in upstream.indirect_versions:
272                    data_version = upstream.indirect_versions[child][-1]
273                    if data_version.is_new_version:
274                        child_snapshot.set_version()
275                        break
276
277        # Invalidate caches.
278        self._categorized = None
279        self._uncategorized = None
280
281    def snapshot_change_category(self, snapshot: Snapshot) -> SnapshotChangeCategory:
282        """
283        Determines the SnapshotChangeCategory for a modified snapshot using its available history.
284
285        A snapshot may be modified (directly or indirectly) multiple times. Each time
286        it is directly changed, the categorization is stored in its history. Look
287        through the snapshot's history to find where it deviated from the previous
288        snapshot and then find the most conservative categorization recorded.
289
290        Args:
291            snapshot: The snapshot within this plan
292        """
293        if snapshot not in self.snapshots:
294            raise SQLMeshError(f"Snapshot {snapshot.snapshot_id} does not exist in this plan.")
295
296        if not snapshot.version:
297            raise SQLMeshError(f"Snapshot {snapshot.snapshot_id} has not be categorized yet.")
298
299        if snapshot.name not in self.context_diff.modified_snapshots:
300            raise SQLMeshError(f"Snapshot {snapshot.snapshot_id} has not been modified.")
301
302        current, previous = self.context_diff.modified_snapshots[snapshot.name]
303        if current.version == previous.version:
304            # Versions match, so no further history to check
305            return SnapshotChangeCategory.FORWARD_ONLY
306        elif previous.data_version in current.all_versions:
307            # Previous snapshot in the current snapshot's history. Get all versions
308            # since the two matched.
309            index = current.all_versions.index(previous.data_version)
310            versions = current.all_versions[index + 1 :]
311        elif current.data_version in previous.all_versions:
312            # Snapshot is a revert. Look through the previous snapshot's history
313            # and get all versions since it matched the current snapshot.
314            index = previous.all_versions.index(current.data_version)
315            versions = previous.all_versions[index:]
316        else:
317            # Insufficient history, so err on the side of safety
318            return SnapshotChangeCategory.BREAKING
319
320        change_categories = [
321            version.change_category for version in versions if version.change_category
322        ]
323        # Return the most conservative categorization found in the snapshot's history
324        return min(change_categories, key=lambda x: x.value)
325
326    @property
327    def _missing_intervals(self) -> t.Dict[str, Intervals]:
328        if self.__missing_intervals is None:
329            previous_ids = [
330                SnapshotId(
331                    name=snapshot.name,
332                    identifier=snapshot.previous_version.fingerprint.to_identifier(),
333                )
334                for snapshot in self.snapshots
335                if snapshot.previous_version
336            ]
337
338            previous_snapshots = (
339                list(self._state_reader.get_snapshots(previous_ids).values())
340                if previous_ids
341                else []
342            )
343
344            end = self.end
345            self.__missing_intervals = {
346                snapshot.version_get_or_generate(): missing
347                for snapshot, missing in self._state_reader.missing_intervals(
348                    previous_snapshots + list(self.snapshots),
349                    start=self._start or scheduler.earliest_start_date(self.snapshots),
350                    end=end,
351                    latest=end,
352                    restatements=self.restatements,
353                ).items()
354            }
355
356        return self.__missing_intervals
357
358    def _add_restatements(self, restate_models: t.Iterable[str]) -> None:
359        for table in restate_models:
360            downstream = self._dag.downstream(table)
361            if table in self.context_diff.snapshots:
362                downstream.append(table)
363
364            snapshots = self.context_diff.snapshots
365            downstream = [d for d in downstream if snapshots[d].is_materialized]
366
367            if not downstream:
368                raise PlanError(
369                    f"Cannot restate from '{table}'. Either such model doesn't exist or no other model references it."
370                )
371            self._restatements.update(downstream)
372
373    def _build_directly_and_indirectly_modified(self) -> t.Tuple[t.List[Snapshot], SnapshotMapping]:
374        """Builds collections of directly and inderectly modified snapshots.
375
376        Returns:
377            The tuple in which the first element contains a list of added and directly modified
378            snapshots while the second element contains a mapping of indirectly modified snapshots.
379        """
380        directly_modified = []
381        all_indirectly_modified = set()
382
383        for model_name, snapshot in self.context_diff.snapshots.items():
384            if model_name in self.context_diff.modified_snapshots:
385                if self.context_diff.directly_modified(model_name):
386                    directly_modified.append(snapshot)
387                else:
388                    all_indirectly_modified.add(model_name)
389            elif model_name in self.context_diff.added:
390                directly_modified.append(snapshot)
391
392        indirectly_modified: SnapshotMapping = defaultdict(set)
393        for snapshot in directly_modified:
394            for downstream in self._dag.downstream(snapshot.name):
395                if downstream in all_indirectly_modified:
396                    indirectly_modified[snapshot.name].add(downstream)
397
398        return (
399            directly_modified,
400            indirectly_modified,
401        )
402
403    def _categorize_snapshots(self) -> None:
404        """Automatically categorizes snapshots that can be automatically categorized and
405        returns a list of added and directly modified snapshots as well as the mapping of
406        indirectly modified snapshots.
407        """
408        for model_name, snapshot in self.context_diff.snapshots.items():
409            upstream_model_names = self._dag.upstream(model_name)
410
411            if not self.forward_only:
412                self._ensure_no_paused_forward_only_upstream(model_name, upstream_model_names)
413
414            if model_name in self.context_diff.modified_snapshots:
415                is_directly_modified = self.context_diff.directly_modified(model_name)
416
417                if self.is_new_snapshot(snapshot):
418                    if self.forward_only:
419                        # In case of the forward only plan any modifications result in reuse of the
420                        # previous version for non-seed models.
421                        # New snapshots of seed models are considered non-breaking ones.
422                        if not snapshot.is_seed_kind:
423                            snapshot.set_version(snapshot.previous_version)
424                            snapshot.change_category = SnapshotChangeCategory.FORWARD_ONLY
425                        else:
426                            snapshot.set_version()
427                            snapshot.change_category = SnapshotChangeCategory.NON_BREAKING
428                    elif self.auto_categorization_enabled and is_directly_modified:
429                        new, old = self.context_diff.modified_snapshots[model_name]
430                        change_category = categorize_change(
431                            new, old, config=self.categorizer_config
432                        )
433                        if change_category is not None:
434                            self.set_choice(new, change_category)
435
436                # set to breaking if an indirect child has no directly modified parents
437                # that need a decision. this can happen when a revert to a parent causes
438                # an indirectly modified snapshot to be created because of a new parent
439                if (
440                    not is_directly_modified
441                    and not snapshot.version
442                    and not any(
443                        self.context_diff.directly_modified(upstream)
444                        and not self.context_diff.snapshots[upstream].version
445                        for upstream in upstream_model_names
446                    )
447                ):
448                    snapshot.set_version()
449
450            elif model_name in self.context_diff.added and self.is_new_snapshot(snapshot):
451                snapshot.set_version()
452
453    def _ensure_no_paused_forward_only_upstream(
454        self, model_name: str, upstream_model_names: t.Iterable[str]
455    ) -> None:
456        for upstream in upstream_model_names:
457            upstream_snapshot = self.context_diff.snapshots.get(upstream)
458            if (
459                upstream_snapshot
460                and upstream_snapshot.version
461                and upstream_snapshot.is_forward_only
462                and upstream_snapshot.is_paused
463            ):
464                raise PlanError(
465                    f"Model '{model_name}' depends on a paused version of model '{upstream}'. "
466                    "Possible remedies: "
467                    "1) make sure your codebase is up-to-date; "
468                    f"2) promote the current version of model '{upstream}' in the production environment; "
469                    "3) recreate this plan in a forward-only mode."
470                )
471
472    def _ensure_valid_date_range(
473        self, start: t.Optional[TimeLike], end: t.Optional[TimeLike]
474    ) -> None:
475        if (start or end) and not self.is_start_and_end_allowed:
476            raise PlanError(
477                "The start and end dates can't be set for a production plan without restatements."
478            )
479
480    def _ensure_no_forward_only_revert(self) -> None:
481        """Ensures that a previously superseded breaking / non-breaking snapshot is not being
482        used again to replace an existing forward-only snapshot with the same version.
483
484        In other words there is no going back to the original non-forward-only snapshot with
485        the same version once a forward-only change for that version has been introduced.
486        """
487        for name, (candidate, promoted) in self.context_diff.modified_snapshots.items():
488            if (
489                candidate.snapshot_id not in self.context_diff.new_snapshots
490                and promoted.is_forward_only
491                and not candidate.is_forward_only
492                and (
493                    promoted.version == candidate.version
494                    or candidate.data_version in promoted.previous_versions
495                )
496            ):
497                raise PlanError(
498                    f"Detected an existing version of model '{name}' that has been previously superseded by a forward-only change. "
499                    "To proceed with the change, restamp this model's definition to produce a new version."
500                )
501
502    def _ensure_no_forward_only_new_models(self) -> None:
503        if self.forward_only and self.context_diff.added:
504            raise PlanError("New models can't be added as part of the forward-only plan.")
505
506
class PlanStatus(str, Enum):
    """The status of a plan, stored as a lowercase string value."""

    STARTED = "started"
    FINISHED = "finished"
    FAILED = "failed"

    @property
    def is_started(self) -> bool:
        """True only for the STARTED status."""
        return self is PlanStatus.STARTED

    @property
    def is_failed(self) -> bool:
        """True only for the FAILED status."""
        return self is PlanStatus.FAILED

    @property
    def is_finished(self) -> bool:
        """True only for the FINISHED status."""
        return self is PlanStatus.FINISHED
523
524
class MissingIntervals(PydanticModel, frozen=True):
    """The missing intervals of a single snapshot, identified by its name."""

    snapshot_name: str
    intervals: Intervals

    @property
    def merged_intervals(self) -> Intervals:
        """The intervals after passing them through `merge_intervals`."""
        merged = merge_intervals(self.intervals)
        return merged

    def format_missing_range(self) -> str:
        """Render the merged intervals as comma-separated "(start, end)" date pairs."""
        rendered = []
        for start, end in self.merged_intervals:
            # Each interval goes through `make_inclusive` before formatting.
            first, last = make_inclusive(start, end)
            rendered.append(f"({to_ds(first)}, {to_ds(last)})")
        return ", ".join(rendered)
class Plan:
 38class Plan:
 39    """Plan is the main class to represent user choices on how they want to backfill and version their models.
 40
 41    Args:
 42        context_diff: The context diff that the plan is based on.
 43        dag: The dag object to determine relationships.
 44        state_reader: The state_reader to get metadata with.
 45        start: The start time to backfill data.
 46        end: The end time to backfill data.
 47        apply: The callback to apply the plan.
 48        restate_models: A list of models for which the data should be restated for the time range
 49            specified in this plan. Note: models defined outside SQLMesh (external) won't be a part
 50            of the restatement.
 51        no_gaps:  Whether to ensure that new snapshots for models that are already a
 52            part of the target environment have no data gaps when compared against previous
 53            snapshots for same models.
 54        skip_backfill: Whether to skip the backfill step.
 55        is_dev: Whether this plan is for development purposes.
 56        forward_only: Whether the purpose of the plan is to make forward only changes.
 57        environment_ttl: The period of time that a development environment should exist before being deleted.
 58        categorizer_config: Auto categorization settings.
 59        auto_categorization_enabled: Whether to apply auto categorization.
 60    """
 61
 62    def __init__(
 63        self,
 64        context_diff: ContextDiff,
 65        dag: DAG,
 66        state_reader: StateReader,
 67        start: t.Optional[TimeLike] = None,
 68        end: t.Optional[TimeLike] = None,
 69        apply: t.Optional[t.Callable[[Plan], None]] = None,
 70        restate_models: t.Optional[t.Iterable[str]] = None,
 71        no_gaps: bool = False,
 72        skip_backfill: bool = False,
 73        is_dev: bool = False,
 74        forward_only: bool = False,
 75        environment_ttl: t.Optional[str] = None,
 76        categorizer_config: t.Optional[CategorizerConfig] = None,
 77        auto_categorization_enabled: bool = True,
 78    ):
 79        self.context_diff = context_diff
 80        self.override_start = start is not None
 81        self.override_end = end is not None
 82        self.plan_id: str = random_id()
 83        self.no_gaps = no_gaps
 84        self.skip_backfill = skip_backfill
 85        self.is_dev = is_dev
 86        self.forward_only = forward_only
 87        self.environment_ttl = environment_ttl
 88        self.categorizer_config = categorizer_config or CategorizerConfig()
 89        self.auto_categorization_enabled = auto_categorization_enabled
 90        self._start = start if start or not (is_dev and forward_only) else yesterday_ds()
 91        self._end = end if end or not is_dev else now()
 92        self._apply = apply
 93        self._dag = dag
 94        self._state_reader = state_reader
 95        self.__missing_intervals: t.Optional[t.Dict[str, Intervals]] = None
 96        self._restatements: t.Set[str] = set()
 97
 98        if restate_models and context_diff.new_snapshots:
 99            raise PlanError(
100                "Model changes and restatements can't be a part of the same plan. "
101                "Revert or apply changes before proceeding with restatements."
102            )
103
104        if not restate_models and is_dev and forward_only:
105            # Add model names for new forward-only snapshots to the restatement list
106            # in order to compute previews.
107            restate_models = [
108                s.name for s in context_diff.new_snapshots.values() if s.is_materialized
109            ]
110
111        self._add_restatements(restate_models or [])
112
113        self._ensure_valid_date_range(self._start, self._end)
114        self._ensure_no_forward_only_revert()
115        self._ensure_no_forward_only_new_models()
116
117        directly_indirectly_modified = self._build_directly_and_indirectly_modified()
118        self.directly_modified = directly_indirectly_modified[0]
119        self.indirectly_modified = directly_indirectly_modified[1]
120
121        self._categorize_snapshots()
122
123        self._categorized: t.Optional[t.List[Snapshot]] = None
124        self._uncategorized: t.Optional[t.List[Snapshot]] = None
125
126    @property
127    def categorized(self) -> t.List[Snapshot]:
128        """Returns the already categorized snapshots."""
129        if self._categorized is None:
130            self._categorized = [s for s in self.directly_modified if s.version]
131        return self._categorized
132
133    @property
134    def uncategorized(self) -> t.List[Snapshot]:
135        """Returns the uncategorized snapshots."""
136        if self._uncategorized is None:
137            self._uncategorized = [s for s in self.directly_modified if not s.version]
138        return self._uncategorized
139
140    @property
141    def start(self) -> TimeLike:
142        """Returns the start of the plan or the earliest date of all snapshots."""
143        return self._start or (
144            min(
145                start
146                for intervals_per_model in self._missing_intervals.values()
147                for start, _ in intervals_per_model
148            )
149            if self._missing_intervals
150            else yesterday_ds()
151        )
152
153    @start.setter
154    def start(self, new_start: TimeLike) -> None:
155        self._ensure_valid_date_range(new_start, self._end)
156        self.set_start(new_start)
157
158    def set_start(self, new_start: TimeLike) -> None:
159        self._start = new_start
160        self.__missing_intervals = None
161
162    @property
163    def end(self) -> TimeLike:
164        """Returns the end of the plan or now."""
165        return self._end or now()
166
167    @end.setter
168    def end(self, new_end: TimeLike) -> None:
169        self._ensure_valid_date_range(self._start, new_end)
170        self._end = new_end
171        self.__missing_intervals = None
172
173    @property
174    def is_start_and_end_allowed(self) -> bool:
175        """Indicates whether this plan allows to set the start and end dates."""
176        return self.is_dev or bool(self.restatements)
177
178    @property
179    def requires_backfill(self) -> bool:
180        return not self.skip_backfill and (bool(self.restatements) or bool(self.missing_intervals))
181
182    @property
183    def missing_intervals(self) -> t.List[MissingIntervals]:
184        """Returns a list of missing intervals."""
185        return [
186            MissingIntervals(
187                snapshot_name=snapshot.name,
188                intervals=self._missing_intervals[snapshot.version_get_or_generate()],
189            )
190            for snapshot in self.snapshots
191            if snapshot.version_get_or_generate() in self._missing_intervals
192        ]
193
194    @property
195    def snapshots(self) -> t.List[Snapshot]:
196        """Gets all the snapshots in the plan/environment."""
197        return list(self.context_diff.snapshots.values())
198
199    @property
200    def new_snapshots(self) -> t.List[Snapshot]:
201        """Gets only new snapshots in the plan/environment."""
202        return list(self.context_diff.new_snapshots.values())
203
204    @property
205    def environment(self) -> Environment:
206        """The environment of the plan."""
207        expiration_ts = (
208            to_timestamp(self.environment_ttl, relative_base=now())
209            if self.is_dev and self.environment_ttl is not None
210            else None
211        )
212        return Environment(
213            name=self.context_diff.environment,
214            snapshots=[snapshot.table_info for snapshot in self.snapshots],
215            start_at=self.start,
216            end_at=self._end,
217            plan_id=self.plan_id,
218            previous_plan_id=self.context_diff.previous_plan_id,
219            expiration_ts=expiration_ts,
220        )
221
222    @property
223    def restatements(self) -> t.Set[str]:
224        return self._restatements
225
226    def is_new_snapshot(self, snapshot: Snapshot) -> bool:
227        """Returns True if the given snapshot is a new snapshot in this plan."""
228        return snapshot.snapshot_id in self.context_diff.new_snapshots
229
230    def apply(self) -> None:
231        """Runs apply if an apply function was passed in."""
232        if not self._apply:
233            raise SQLMeshError(f"Plan was not initialized with an applier.")
234        validate_date_range(self.start, self.end)
235        self._apply(self)
236
237    def set_choice(self, snapshot: Snapshot, choice: SnapshotChangeCategory) -> None:
238        """Sets a snapshot version based on the user choice.
239
240        Args:
241            snapshot: The target snapshot.
242            choice: The user decision on how to version the target snapshot and its children.
243        """
244        if self.forward_only:
245            raise PlanError("Choice setting is not supported by a forward-only plan.")
246        if not self.is_new_snapshot(snapshot):
247            raise SQLMeshError(
248                f"A choice can't be changed for the existing version of model '{snapshot.name}'."
249            )
250
251        snapshot.change_category = choice
252        if choice in (
253            SnapshotChangeCategory.BREAKING,
254            SnapshotChangeCategory.NON_BREAKING,
255        ):
256            snapshot.set_version()
257        else:
258            snapshot.set_version(snapshot.previous_version)
259
260        for child in self.indirectly_modified[snapshot.name]:
261            child_snapshot = self.context_diff.snapshots[child]
262
263            if choice == SnapshotChangeCategory.BREAKING:
264                child_snapshot.set_version()
265            else:
266                child_snapshot.set_version(child_snapshot.previous_version)
267            snapshot.indirect_versions[child] = child_snapshot.all_versions
268
269            # If any other snapshot specified breaking this child, then that child
270            # needs to be backfilled as a part of the plan.
271            for upstream in self.directly_modified:
272                if child in upstream.indirect_versions:
273                    data_version = upstream.indirect_versions[child][-1]
274                    if data_version.is_new_version:
275                        child_snapshot.set_version()
276                        break
277
278        # Invalidate caches.
279        self._categorized = None
280        self._uncategorized = None
281
282    def snapshot_change_category(self, snapshot: Snapshot) -> SnapshotChangeCategory:
283        """
284        Determines the SnapshotChangeCategory for a modified snapshot using its available history.
285
286        A snapshot may be modified (directly or indirectly) multiple times. Each time
287        it is directly changed, the categorization is stored in its history. Look
288        through the snapshot's history to find where it deviated from the previous
289        snapshot and then find the most conservative categorization recorded.
290
291        Args:
292            snapshot: The snapshot within this plan
293        """
294        if snapshot not in self.snapshots:
295            raise SQLMeshError(f"Snapshot {snapshot.snapshot_id} does not exist in this plan.")
296
297        if not snapshot.version:
298            raise SQLMeshError(f"Snapshot {snapshot.snapshot_id} has not be categorized yet.")
299
300        if snapshot.name not in self.context_diff.modified_snapshots:
301            raise SQLMeshError(f"Snapshot {snapshot.snapshot_id} has not been modified.")
302
303        current, previous = self.context_diff.modified_snapshots[snapshot.name]
304        if current.version == previous.version:
305            # Versions match, so no further history to check
306            return SnapshotChangeCategory.FORWARD_ONLY
307        elif previous.data_version in current.all_versions:
308            # Previous snapshot in the current snapshot's history. Get all versions
309            # since the two matched.
310            index = current.all_versions.index(previous.data_version)
311            versions = current.all_versions[index + 1 :]
312        elif current.data_version in previous.all_versions:
313            # Snapshot is a revert. Look through the previous snapshot's history
314            # and get all versions since it matched the current snapshot.
315            index = previous.all_versions.index(current.data_version)
316            versions = previous.all_versions[index:]
317        else:
318            # Insufficient history, so err on the side of safety
319            return SnapshotChangeCategory.BREAKING
320
321        change_categories = [
322            version.change_category for version in versions if version.change_category
323        ]
324        # Return the most conservative categorization found in the snapshot's history
325        return min(change_categories, key=lambda x: x.value)
326
327    @property
328    def _missing_intervals(self) -> t.Dict[str, Intervals]:
329        if self.__missing_intervals is None:
330            previous_ids = [
331                SnapshotId(
332                    name=snapshot.name,
333                    identifier=snapshot.previous_version.fingerprint.to_identifier(),
334                )
335                for snapshot in self.snapshots
336                if snapshot.previous_version
337            ]
338
339            previous_snapshots = (
340                list(self._state_reader.get_snapshots(previous_ids).values())
341                if previous_ids
342                else []
343            )
344
345            end = self.end
346            self.__missing_intervals = {
347                snapshot.version_get_or_generate(): missing
348                for snapshot, missing in self._state_reader.missing_intervals(
349                    previous_snapshots + list(self.snapshots),
350                    start=self._start or scheduler.earliest_start_date(self.snapshots),
351                    end=end,
352                    latest=end,
353                    restatements=self.restatements,
354                ).items()
355            }
356
357        return self.__missing_intervals
358
359    def _add_restatements(self, restate_models: t.Iterable[str]) -> None:
360        for table in restate_models:
361            downstream = self._dag.downstream(table)
362            if table in self.context_diff.snapshots:
363                downstream.append(table)
364
365            snapshots = self.context_diff.snapshots
366            downstream = [d for d in downstream if snapshots[d].is_materialized]
367
368            if not downstream:
369                raise PlanError(
370                    f"Cannot restate from '{table}'. Either such model doesn't exist or no other model references it."
371                )
372            self._restatements.update(downstream)
373
374    def _build_directly_and_indirectly_modified(self) -> t.Tuple[t.List[Snapshot], SnapshotMapping]:
375        """Builds collections of directly and inderectly modified snapshots.
376
377        Returns:
378            The tuple in which the first element contains a list of added and directly modified
379            snapshots while the second element contains a mapping of indirectly modified snapshots.
380        """
381        directly_modified = []
382        all_indirectly_modified = set()
383
384        for model_name, snapshot in self.context_diff.snapshots.items():
385            if model_name in self.context_diff.modified_snapshots:
386                if self.context_diff.directly_modified(model_name):
387                    directly_modified.append(snapshot)
388                else:
389                    all_indirectly_modified.add(model_name)
390            elif model_name in self.context_diff.added:
391                directly_modified.append(snapshot)
392
393        indirectly_modified: SnapshotMapping = defaultdict(set)
394        for snapshot in directly_modified:
395            for downstream in self._dag.downstream(snapshot.name):
396                if downstream in all_indirectly_modified:
397                    indirectly_modified[snapshot.name].add(downstream)
398
399        return (
400            directly_modified,
401            indirectly_modified,
402        )
403
    def _categorize_snapshots(self) -> None:
        """Automatically versions and categorizes the snapshots that don't need a user decision.

        Snapshots are updated in place and nothing is returned. For each snapshot
        in the context diff:

        * In a plan that is not forward-only, the upstream models are first checked
          for paused forward-only versions.
        * A new, modified snapshot in a forward-only plan reuses its previous version
          (FORWARD_ONLY), except for seed models which get a new version (NON_BREAKING).
        * A new, directly modified snapshot is categorized via ``categorize_change``
          when auto categorization is enabled and a category could be determined.
        * An indirectly modified snapshot that still has no version and has no
          directly modified, unversioned upstream gets a new version.
        * A new snapshot of an added model gets a new version.
        """
        for model_name, snapshot in self.context_diff.snapshots.items():
            upstream_model_names = self._dag.upstream(model_name)

            if not self.forward_only:
                self._ensure_no_paused_forward_only_upstream(model_name, upstream_model_names)

            if model_name in self.context_diff.modified_snapshots:
                is_directly_modified = self.context_diff.directly_modified(model_name)

                if self.is_new_snapshot(snapshot):
                    if self.forward_only:
                        # In case of the forward only plan any modifications result in reuse of the
                        # previous version for non-seed models.
                        # New snapshots of seed models are considered non-breaking ones.
                        if not snapshot.is_seed_kind:
                            snapshot.set_version(snapshot.previous_version)
                            snapshot.change_category = SnapshotChangeCategory.FORWARD_ONLY
                        else:
                            snapshot.set_version()
                            snapshot.change_category = SnapshotChangeCategory.NON_BREAKING
                    elif self.auto_categorization_enabled and is_directly_modified:
                        new, old = self.context_diff.modified_snapshots[model_name]
                        change_category = categorize_change(
                            new, old, config=self.categorizer_config
                        )
                        # None means no category could be determined automatically;
                        # the snapshot is then left for a manual choice.
                        if change_category is not None:
                            self.set_choice(new, change_category)

                # set to breaking if an indirect child has no directly modified parents
                # that need a decision. this can happen when a revert to a parent causes
                # an indirectly modified snapshot to be created because of a new parent
                if (
                    not is_directly_modified
                    and not snapshot.version
                    and not any(
                        self.context_diff.directly_modified(upstream)
                        and not self.context_diff.snapshots[upstream].version
                        for upstream in upstream_model_names
                    )
                ):
                    snapshot.set_version()

            elif model_name in self.context_diff.added and self.is_new_snapshot(snapshot):
                # Newly added models always start with a fresh version.
                snapshot.set_version()
453
454    def _ensure_no_paused_forward_only_upstream(
455        self, model_name: str, upstream_model_names: t.Iterable[str]
456    ) -> None:
457        for upstream in upstream_model_names:
458            upstream_snapshot = self.context_diff.snapshots.get(upstream)
459            if (
460                upstream_snapshot
461                and upstream_snapshot.version
462                and upstream_snapshot.is_forward_only
463                and upstream_snapshot.is_paused
464            ):
465                raise PlanError(
466                    f"Model '{model_name}' depends on a paused version of model '{upstream}'. "
467                    "Possible remedies: "
468                    "1) make sure your codebase is up-to-date; "
469                    f"2) promote the current version of model '{upstream}' in the production environment; "
470                    "3) recreate this plan in a forward-only mode."
471                )
472
473    def _ensure_valid_date_range(
474        self, start: t.Optional[TimeLike], end: t.Optional[TimeLike]
475    ) -> None:
476        if (start or end) and not self.is_start_and_end_allowed:
477            raise PlanError(
478                "The start and end dates can't be set for a production plan without restatements."
479            )
480
481    def _ensure_no_forward_only_revert(self) -> None:
482        """Ensures that a previously superseded breaking / non-breaking snapshot is not being
483        used again to replace an existing forward-only snapshot with the same version.
484
485        In other words there is no going back to the original non-forward-only snapshot with
486        the same version once a forward-only change for that version has been introduced.
487        """
488        for name, (candidate, promoted) in self.context_diff.modified_snapshots.items():
489            if (
490                candidate.snapshot_id not in self.context_diff.new_snapshots
491                and promoted.is_forward_only
492                and not candidate.is_forward_only
493                and (
494                    promoted.version == candidate.version
495                    or candidate.data_version in promoted.previous_versions
496                )
497            ):
498                raise PlanError(
499                    f"Detected an existing version of model '{name}' that has been previously superseded by a forward-only change. "
500                    "To proceed with the change, restamp this model's definition to produce a new version."
501                )
502
503    def _ensure_no_forward_only_new_models(self) -> None:
504        if self.forward_only and self.context_diff.added:
505            raise PlanError("New models can't be added as part of the forward-only plan.")

Plan is the main class to represent user choices on how they want to backfill and version their models.

Arguments:
  • context_diff: The context diff that the plan is based on.
  • dag: The dag object to determine relationships.
  • state_reader: The state_reader to get metadata with.
  • start: The start time to backfill data.
  • end: The end time to backfill data.
  • apply: The callback to apply the plan.
  • restate_models: A list of models for which the data should be restated for the time range specified in this plan. Note: models defined outside SQLMesh (external) won't be a part of the restatement.
  • no_gaps: Whether to ensure that new snapshots for models that are already a part of the target environment have no data gaps when compared against previous snapshots for same models.
  • skip_backfill: Whether to skip the backfill step.
  • is_dev: Whether this plan is for development purposes.
  • forward_only: Whether the purpose of the plan is to make forward only changes.
  • environment_ttl: The period of time that a development environment should exist before being deleted.
  • categorizer_config: Auto categorization settings.
  • auto_categorization_enabled: Whether to apply auto categorization.
Plan( context_diff: sqlmesh.core.context_diff.ContextDiff, dag: sqlmesh.utils.dag.DAG, state_reader: sqlmesh.core.state_sync.base.StateReader, start: Union[datetime.date, datetime.datetime, str, int, float, NoneType] = None, end: Union[datetime.date, datetime.datetime, str, int, float, NoneType] = None, apply: Optional[Callable[[sqlmesh.core.plan.definition.Plan], NoneType]] = None, restate_models: Optional[Iterable[str]] = None, no_gaps: bool = False, skip_backfill: bool = False, is_dev: bool = False, forward_only: bool = False, environment_ttl: Optional[str] = None, categorizer_config: Optional[sqlmesh.core.config.categorizer.CategorizerConfig] = None, auto_categorization_enabled: bool = True)
    def __init__(
        self,
        context_diff: ContextDiff,
        dag: DAG,
        state_reader: StateReader,
        start: t.Optional[TimeLike] = None,
        end: t.Optional[TimeLike] = None,
        apply: t.Optional[t.Callable[[Plan], None]] = None,
        restate_models: t.Optional[t.Iterable[str]] = None,
        no_gaps: bool = False,
        skip_backfill: bool = False,
        is_dev: bool = False,
        forward_only: bool = False,
        environment_ttl: t.Optional[str] = None,
        categorizer_config: t.Optional[CategorizerConfig] = None,
        auto_categorization_enabled: bool = True,
    ):
        """Stores the plan settings, validates them and categorizes the snapshots.

        The arguments are described in the class docstring.

        Raises:
            PlanError: If restatements are requested while the context diff contains
                new snapshots, or if one of the validation helpers invoked below
                rejects the plan.
        """
        self.context_diff = context_diff
        # Remember whether the caller provided an explicit start / end.
        self.override_start = start is not None
        self.override_end = end is not None
        self.plan_id: str = random_id()
        self.no_gaps = no_gaps
        self.skip_backfill = skip_backfill
        self.is_dev = is_dev
        self.forward_only = forward_only
        self.environment_ttl = environment_ttl
        self.categorizer_config = categorizer_config or CategorizerConfig()
        self.auto_categorization_enabled = auto_categorization_enabled
        # Without an explicit start, only forward-only dev plans get a default (yesterday).
        self._start = start if start or not (is_dev and forward_only) else yesterday_ds()
        # Without an explicit end, only dev plans get a default (now).
        self._end = end if end or not is_dev else now()
        self._apply = apply
        self._dag = dag
        self._state_reader = state_reader
        # Lazily computed cache; reset to None whenever the date range changes.
        self.__missing_intervals: t.Optional[t.Dict[str, Intervals]] = None
        self._restatements: t.Set[str] = set()

        if restate_models and context_diff.new_snapshots:
            raise PlanError(
                "Model changes and restatements can't be a part of the same plan. "
                "Revert or apply changes before proceeding with restatements."
            )

        if not restate_models and is_dev and forward_only:
            # Add model names for new forward-only snapshots to the restatement list
            # in order to compute previews.
            restate_models = [
                s.name for s in context_diff.new_snapshots.values() if s.is_materialized
            ]

        self._add_restatements(restate_models or [])

        # The date range check reads the restatements, so it has to run after
        # they have been registered above.
        self._ensure_valid_date_range(self._start, self._end)
        self._ensure_no_forward_only_revert()
        self._ensure_no_forward_only_new_models()

        directly_indirectly_modified = self._build_directly_and_indirectly_modified()
        self.directly_modified = directly_indirectly_modified[0]
        self.indirectly_modified = directly_indirectly_modified[1]

        self._categorize_snapshots()

        # Caches for the categorized / uncategorized snapshot lists.
        self._categorized: t.Optional[t.List[Snapshot]] = None
        self._uncategorized: t.Optional[t.List[Snapshot]] = None

Returns the already categorized snapshots.

Returns the uncategorized snapshots.

start: Union[datetime.date, datetime.datetime, str, int, float]

Returns the start of the plan or the earliest date of all snapshots.

def set_start( self, new_start: Union[datetime.date, datetime.datetime, str, int, float]) -> None:
158    def set_start(self, new_start: TimeLike) -> None:
159        self._start = new_start
160        self.__missing_intervals = None
end: Union[datetime.date, datetime.datetime, str, int, float]

Returns the end of the plan or now.

is_start_and_end_allowed: bool

Indicates whether this plan allows setting the start and end dates.

Returns a list of missing intervals.

Gets all the snapshots in the plan/environment.

Gets only new snapshots in the plan/environment.

The environment of the plan.

def is_new_snapshot(self, snapshot: sqlmesh.core.snapshot.definition.Snapshot) -> bool:
226    def is_new_snapshot(self, snapshot: Snapshot) -> bool:
227        """Returns True if the given snapshot is a new snapshot in this plan."""
228        return snapshot.snapshot_id in self.context_diff.new_snapshots

Returns True if the given snapshot is a new snapshot in this plan.

def apply(self) -> None:
230    def apply(self) -> None:
231        """Runs apply if an apply function was passed in."""
232        if not self._apply:
233            raise SQLMeshError(f"Plan was not initialized with an applier.")
234        validate_date_range(self.start, self.end)
235        self._apply(self)

Runs apply if an apply function was passed in.

def set_choice( self, snapshot: sqlmesh.core.snapshot.definition.Snapshot, choice: sqlmesh.core.snapshot.definition.SnapshotChangeCategory) -> None:
237    def set_choice(self, snapshot: Snapshot, choice: SnapshotChangeCategory) -> None:
238        """Sets a snapshot version based on the user choice.
239
240        Args:
241            snapshot: The target snapshot.
242            choice: The user decision on how to version the target snapshot and its children.
243        """
244        if self.forward_only:
245            raise PlanError("Choice setting is not supported by a forward-only plan.")
246        if not self.is_new_snapshot(snapshot):
247            raise SQLMeshError(
248                f"A choice can't be changed for the existing version of model '{snapshot.name}'."
249            )
250
251        snapshot.change_category = choice
252        if choice in (
253            SnapshotChangeCategory.BREAKING,
254            SnapshotChangeCategory.NON_BREAKING,
255        ):
256            snapshot.set_version()
257        else:
258            snapshot.set_version(snapshot.previous_version)
259
260        for child in self.indirectly_modified[snapshot.name]:
261            child_snapshot = self.context_diff.snapshots[child]
262
263            if choice == SnapshotChangeCategory.BREAKING:
264                child_snapshot.set_version()
265            else:
266                child_snapshot.set_version(child_snapshot.previous_version)
267            snapshot.indirect_versions[child] = child_snapshot.all_versions
268
269            # If any other snapshot specified breaking this child, then that child
270            # needs to be backfilled as a part of the plan.
271            for upstream in self.directly_modified:
272                if child in upstream.indirect_versions:
273                    data_version = upstream.indirect_versions[child][-1]
274                    if data_version.is_new_version:
275                        child_snapshot.set_version()
276                        break
277
278        # Invalidate caches.
279        self._categorized = None
280        self._uncategorized = None

Sets a snapshot version based on the user choice.

Arguments:
  • snapshot: The target snapshot.
  • choice: The user decision on how to version the target snapshot and its children.
def snapshot_change_category( self, snapshot: sqlmesh.core.snapshot.definition.Snapshot) -> sqlmesh.core.snapshot.definition.SnapshotChangeCategory:
282    def snapshot_change_category(self, snapshot: Snapshot) -> SnapshotChangeCategory:
283        """
284        Determines the SnapshotChangeCategory for a modified snapshot using its available history.
285
286        A snapshot may be modified (directly or indirectly) multiple times. Each time
287        it is directly changed, the categorization is stored in its history. Look
288        through the snapshot's history to find where it deviated from the previous
289        snapshot and then find the most conservative categorization recorded.
290
291        Args:
292            snapshot: The snapshot within this plan
293        """
294        if snapshot not in self.snapshots:
295            raise SQLMeshError(f"Snapshot {snapshot.snapshot_id} does not exist in this plan.")
296
297        if not snapshot.version:
298            raise SQLMeshError(f"Snapshot {snapshot.snapshot_id} has not be categorized yet.")
299
300        if snapshot.name not in self.context_diff.modified_snapshots:
301            raise SQLMeshError(f"Snapshot {snapshot.snapshot_id} has not been modified.")
302
303        current, previous = self.context_diff.modified_snapshots[snapshot.name]
304        if current.version == previous.version:
305            # Versions match, so no further history to check
306            return SnapshotChangeCategory.FORWARD_ONLY
307        elif previous.data_version in current.all_versions:
308            # Previous snapshot in the current snapshot's history. Get all versions
309            # since the two matched.
310            index = current.all_versions.index(previous.data_version)
311            versions = current.all_versions[index + 1 :]
312        elif current.data_version in previous.all_versions:
313            # Snapshot is a revert. Look through the previous snapshot's history
314            # and get all versions since it matched the current snapshot.
315            index = previous.all_versions.index(current.data_version)
316            versions = previous.all_versions[index:]
317        else:
318            # Insufficient history, so err on the side of safety
319            return SnapshotChangeCategory.BREAKING
320
321        change_categories = [
322            version.change_category for version in versions if version.change_category
323        ]
324        # Return the most conservative categorization found in the snapshot's history
325        return min(change_categories, key=lambda x: x.value)

Determines the SnapshotChangeCategory for a modified snapshot using its available history.

A snapshot may be modified (directly or indirectly) multiple times. Each time it is directly changed, the categorization is stored in its history. Look through the snapshot's history to find where it deviated from the previous snapshot and then find the most conservative categorization recorded.

Arguments:
  • snapshot: The snapshot within this plan
class PlanStatus(builtins.str, enum.Enum):
class PlanStatus(str, Enum):
    """String-valued status of a plan: started, finished or failed."""

    STARTED = "started"
    FINISHED = "finished"
    FAILED = "failed"

    @property
    def is_started(self) -> bool:
        """Whether this status is ``STARTED``."""
        return self is PlanStatus.STARTED

    @property
    def is_failed(self) -> bool:
        """Whether this status is ``FAILED``."""
        return self is PlanStatus.FAILED

    @property
    def is_finished(self) -> bool:
        """Whether this status is ``FINISHED``."""
        return self is PlanStatus.FINISHED

An enumeration.

STARTED = <PlanStatus.STARTED: 'started'>
FINISHED = <PlanStatus.FINISHED: 'finished'>
FAILED = <PlanStatus.FAILED: 'failed'>
Inherited Members
enum.Enum
name
value
builtins.str
encode
replace
split
rsplit
join
capitalize
casefold
title
center
count
expandtabs
find
partition
index
ljust
lower
lstrip
rfind
rindex
rjust
rstrip
rpartition
splitlines
strip
swapcase
translate
upper
startswith
endswith
removeprefix
removesuffix
isascii
islower
isupper
istitle
isspace
isdecimal
isdigit
isnumeric
isalpha
isalnum
isidentifier
isprintable
zfill
format
format_map
maketrans
class MissingIntervals(sqlmesh.utils.pydantic.PydanticModel):
class MissingIntervals(PydanticModel, frozen=True):
    """Immutable record pairing a snapshot name with its missing intervals."""

    # Name of the snapshot the intervals belong to.
    snapshot_name: str
    # The missing intervals; iterated as (start, end) pairs once merged.
    intervals: Intervals

    @property
    def merged_intervals(self) -> Intervals:
        """The intervals after being passed through ``merge_intervals``."""
        return merge_intervals(self.intervals)

    def format_missing_range(self) -> str:
        """Renders the merged intervals as a comma-separated list of "(start, end)" pairs.

        Each merged interval goes through ``make_inclusive`` and both of its
        bounds are formatted with ``to_ds``.
        """
        rendered = []
        for start, end in self.merged_intervals:
            first, last = make_inclusive(start, end)
            rendered.append(f"({to_ds(first)}, {to_ds(last)})")
        return ", ".join(rendered)
def format_missing_range(self) -> str:
534    def format_missing_range(self) -> str:
535        intervals = [make_inclusive(start, end) for start, end in self.merged_intervals]
536        return ", ".join(f"({to_ds(start)}, {to_ds(end)})" for start, end in intervals)
Inherited Members
pydantic.main.BaseModel
BaseModel
parse_obj
parse_raw
parse_file
from_orm
construct
copy
schema
schema_json
validate
update_forward_refs
sqlmesh.utils.pydantic.PydanticModel
Config
dict
json
missing_required_fields
extra_fields
all_fields
required_fields