sqlmesh.core.plan.definition
from __future__ import annotations

import typing as t
from collections import defaultdict
from enum import Enum

from sqlmesh.core import scheduler
from sqlmesh.core.config import CategorizerConfig
from sqlmesh.core.context_diff import ContextDiff
from sqlmesh.core.environment import Environment
from sqlmesh.core.snapshot import (
    Intervals,
    Snapshot,
    SnapshotChangeCategory,
    SnapshotId,
    categorize_change,
    merge_intervals,
)
from sqlmesh.core.state_sync import StateReader
from sqlmesh.utils import random_id
from sqlmesh.utils.dag import DAG
from sqlmesh.utils.date import (
    TimeLike,
    make_inclusive,
    now,
    to_ds,
    to_timestamp,
    validate_date_range,
    yesterday_ds,
)
from sqlmesh.utils.errors import PlanError, SQLMeshError
from sqlmesh.utils.pydantic import PydanticModel

# Maps the name of a directly modified model to the names of the models that are
# indirectly modified because of it.
SnapshotMapping = t.Dict[str, t.Set[str]]


class Plan:
    """Plan is the main class to represent user choices on how they want to backfill and version their models.

    Args:
        context_diff: The context diff that the plan is based on.
        dag: The dag object to determine relationships.
        state_reader: The state_reader to get metadata with.
        start: The start time to backfill data.
        end: The end time to backfill data.
        apply: The callback to apply the plan.
        restate_models: A list of models for which the data should be restated for the time range
            specified in this plan. Note: models defined outside SQLMesh (external) won't be a part
            of the restatement.
        no_gaps: Whether to ensure that new snapshots for models that are already a
            part of the target environment have no data gaps when compared against previous
            snapshots for same models.
        skip_backfill: Whether to skip the backfill step.
        is_dev: Whether this plan is for development purposes.
        forward_only: Whether the purpose of the plan is to make forward only changes.
        environment_ttl: The period of time that a development environment should exist before being deleted.
        categorizer_config: Auto categorization settings.
        auto_categorization_enabled: Whether to apply auto categorization.

    Raises:
        PlanError: If restatements are combined with model changes, if a start / end date is
            provided for a plan that doesn't allow it, or if one of the forward-only
            consistency checks fails.
    """

    def __init__(
        self,
        context_diff: ContextDiff,
        dag: DAG,
        state_reader: StateReader,
        start: t.Optional[TimeLike] = None,
        end: t.Optional[TimeLike] = None,
        apply: t.Optional[t.Callable[[Plan], None]] = None,
        restate_models: t.Optional[t.Iterable[str]] = None,
        no_gaps: bool = False,
        skip_backfill: bool = False,
        is_dev: bool = False,
        forward_only: bool = False,
        environment_ttl: t.Optional[str] = None,
        categorizer_config: t.Optional[CategorizerConfig] = None,
        auto_categorization_enabled: bool = True,
    ):
        self.context_diff = context_diff
        self.override_start = start is not None
        self.override_end = end is not None
        self.plan_id: str = random_id()
        self.no_gaps = no_gaps
        self.skip_backfill = skip_backfill
        self.is_dev = is_dev
        self.forward_only = forward_only
        self.environment_ttl = environment_ttl
        self.categorizer_config = categorizer_config or CategorizerConfig()
        self.auto_categorization_enabled = auto_categorization_enabled
        # A forward-only development plan without an explicit start begins yesterday.
        self._start = start if start or not (is_dev and forward_only) else yesterday_ds()
        # A development plan without an explicit end ends now.
        self._end = end if end or not is_dev else now()
        self._apply = apply
        self._dag = dag
        self._state_reader = state_reader
        self.__missing_intervals: t.Optional[t.Dict[str, Intervals]] = None
        self._restatements: t.Set[str] = set()

        # Caches behind the `categorized` / `uncategorized` properties. They are declared
        # before `_categorize_snapshots()` runs because `set_choice()` (invoked by the auto
        # categorization) resets them.
        self._categorized: t.Optional[t.List[Snapshot]] = None
        self._uncategorized: t.Optional[t.List[Snapshot]] = None

        if restate_models and context_diff.new_snapshots:
            raise PlanError(
                "Model changes and restatements can't be a part of the same plan. "
                "Revert or apply changes before proceeding with restatements."
            )

        if not restate_models and is_dev and forward_only:
            # Add model names for new forward-only snapshots to the restatement list
            # in order to compute previews.
            restate_models = [
                s.name for s in context_diff.new_snapshots.values() if s.is_materialized
            ]

        self._add_restatements(restate_models or [])

        self._ensure_valid_date_range(self._start, self._end)
        self._ensure_no_forward_only_revert()
        self._ensure_no_forward_only_new_models()

        (
            self.directly_modified,
            self.indirectly_modified,
        ) = self._build_directly_and_indirectly_modified()

        self._categorize_snapshots()

    @property
    def categorized(self) -> t.List[Snapshot]:
        """Returns the already categorized snapshots."""
        if self._categorized is None:
            self._categorized = [s for s in self.directly_modified if s.version]
        return self._categorized

    @property
    def uncategorized(self) -> t.List[Snapshot]:
        """Returns the uncategorized snapshots."""
        if self._uncategorized is None:
            self._uncategorized = [s for s in self.directly_modified if not s.version]
        return self._uncategorized

    @property
    def start(self) -> TimeLike:
        """Returns the start of the plan or the earliest date of all snapshots."""
        return self._start or (
            min(
                start
                for intervals_per_model in self._missing_intervals.values()
                for start, _ in intervals_per_model
            )
            if self._missing_intervals
            else yesterday_ds()
        )

    @start.setter
    def start(self, new_start: TimeLike) -> None:
        self._ensure_valid_date_range(new_start, self._end)
        self.set_start(new_start)

    def set_start(self, new_start: TimeLike) -> None:
        """Sets the plan start without validating it and drops the cached missing intervals.

        Args:
            new_start: The new start of the plan.
        """
        self._start = new_start
        self.__missing_intervals = None

    @property
    def end(self) -> TimeLike:
        """Returns the end of the plan or now."""
        return self._end or now()

    @end.setter
    def end(self, new_end: TimeLike) -> None:
        self._ensure_valid_date_range(self._start, new_end)
        self._end = new_end
        self.__missing_intervals = None

    @property
    def is_start_and_end_allowed(self) -> bool:
        """Indicates whether this plan allows to set the start and end dates."""
        return self.is_dev or bool(self.restatements)

    @property
    def requires_backfill(self) -> bool:
        """Indicates whether applying this plan involves a backfill.

        True when the backfill is not skipped and there are either restatements
        or missing intervals.
        """
        return not self.skip_backfill and (bool(self.restatements) or bool(self.missing_intervals))

    @property
    def missing_intervals(self) -> t.List[MissingIntervals]:
        """Returns a list of missing intervals."""
        intervals_by_version = self._missing_intervals
        result = []
        for snapshot in self.snapshots:
            # Generate the version once per snapshot and reuse it for the lookup.
            version = snapshot.version_get_or_generate()
            if version in intervals_by_version:
                result.append(
                    MissingIntervals(
                        snapshot_name=snapshot.name,
                        intervals=intervals_by_version[version],
                    )
                )
        return result

    @property
    def snapshots(self) -> t.List[Snapshot]:
        """Gets all the snapshots in the plan/environment."""
        return list(self.context_diff.snapshots.values())

    @property
    def new_snapshots(self) -> t.List[Snapshot]:
        """Gets only new snapshots in the plan/environment."""
        return list(self.context_diff.new_snapshots.values())

    @property
    def environment(self) -> Environment:
        """The environment of the plan."""
        # Only development environments with a configured TTL expire.
        expiration_ts = (
            to_timestamp(self.environment_ttl, relative_base=now())
            if self.is_dev and self.environment_ttl is not None
            else None
        )
        return Environment(
            name=self.context_diff.environment,
            snapshots=[snapshot.table_info for snapshot in self.snapshots],
            start_at=self.start,
            # Uses the raw `_end` (which may be None) rather than the `end` property,
            # which would substitute the current time.
            end_at=self._end,
            plan_id=self.plan_id,
            previous_plan_id=self.context_diff.previous_plan_id,
            expiration_ts=expiration_ts,
        )

    @property
    def restatements(self) -> t.Set[str]:
        """Returns the names of the models that are restated as a part of this plan."""
        return self._restatements

    def is_new_snapshot(self, snapshot: Snapshot) -> bool:
        """Returns True if the given snapshot is a new snapshot in this plan."""
        return snapshot.snapshot_id in self.context_diff.new_snapshots

    def apply(self) -> None:
        """Runs apply if an apply function was passed in.

        Raises:
            SQLMeshError: If the plan was created without an apply callback.
        """
        if not self._apply:
            raise SQLMeshError("Plan was not initialized with an applier.")
        validate_date_range(self.start, self.end)
        self._apply(self)

    def set_choice(self, snapshot: Snapshot, choice: SnapshotChangeCategory) -> None:
        """Sets a snapshot version based on the user choice.

        Args:
            snapshot: The target snapshot.
            choice: The user decision on how to version the target snapshot and its children.

        Raises:
            PlanError: If this is a forward-only plan.
            SQLMeshError: If the target snapshot is not a new snapshot in this plan.
        """
        if self.forward_only:
            raise PlanError("Choice setting is not supported by a forward-only plan.")
        if not self.is_new_snapshot(snapshot):
            raise SQLMeshError(
                f"A choice can't be changed for the existing version of model '{snapshot.name}'."
            )

        snapshot.change_category = choice
        if choice in (
            SnapshotChangeCategory.BREAKING,
            SnapshotChangeCategory.NON_BREAKING,
        ):
            snapshot.set_version()
        else:
            snapshot.set_version(snapshot.previous_version)

        for child in self.indirectly_modified[snapshot.name]:
            child_snapshot = self.context_diff.snapshots[child]

            # Only a breaking change gives the indirectly modified children a new version.
            if choice == SnapshotChangeCategory.BREAKING:
                child_snapshot.set_version()
            else:
                child_snapshot.set_version(child_snapshot.previous_version)
            snapshot.indirect_versions[child] = child_snapshot.all_versions

            # If any other snapshot specified breaking this child, then that child
            # needs to be backfilled as a part of the plan.
            for upstream in self.directly_modified:
                if child in upstream.indirect_versions:
                    data_version = upstream.indirect_versions[child][-1]
                    if data_version.is_new_version:
                        child_snapshot.set_version()
                        break

        # Invalidate caches.
        self._categorized = None
        self._uncategorized = None

    def snapshot_change_category(self, snapshot: Snapshot) -> SnapshotChangeCategory:
        """
        Determines the SnapshotChangeCategory for a modified snapshot using its available history.

        A snapshot may be modified (directly or indirectly) multiple times. Each time
        it is directly changed, the categorization is stored in its history. Look
        through the snapshot's history to find where it deviated from the previous
        snapshot and then find the most conservative categorization recorded.

        Args:
            snapshot: The snapshot within this plan

        Returns:
            The most conservative category recorded in the relevant history, FORWARD_ONLY
            when the current and previous versions match, or BREAKING when the history is
            insufficient to decide.

        Raises:
            SQLMeshError: If the snapshot is not a part of this plan, has not been
                categorized yet or has not been modified.
        """
        if snapshot not in self.snapshots:
            raise SQLMeshError(f"Snapshot {snapshot.snapshot_id} does not exist in this plan.")

        if not snapshot.version:
            raise SQLMeshError(f"Snapshot {snapshot.snapshot_id} has not be categorized yet.")

        if snapshot.name not in self.context_diff.modified_snapshots:
            raise SQLMeshError(f"Snapshot {snapshot.snapshot_id} has not been modified.")

        current, previous = self.context_diff.modified_snapshots[snapshot.name]
        if current.version == previous.version:
            # Versions match, so no further history to check
            return SnapshotChangeCategory.FORWARD_ONLY
        elif previous.data_version in current.all_versions:
            # Previous snapshot in the current snapshot's history. Get all versions
            # since the two matched.
            index = current.all_versions.index(previous.data_version)
            versions = current.all_versions[index + 1 :]
        elif current.data_version in previous.all_versions:
            # Snapshot is a revert. Look through the previous snapshot's history
            # and get all versions since it matched the current snapshot.
            index = previous.all_versions.index(current.data_version)
            versions = previous.all_versions[index:]
        else:
            # Insufficient history, so err on the side of safety
            return SnapshotChangeCategory.BREAKING

        change_categories = [
            version.change_category for version in versions if version.change_category
        ]
        if not change_categories:
            # None of the inspected versions recorded a category. Treat this the same way
            # as insufficient history instead of letting min() fail on an empty sequence.
            return SnapshotChangeCategory.BREAKING

        # Return the most conservative categorization found in the snapshot's history
        return min(change_categories, key=lambda x: x.value)

    @property
    def _missing_intervals(self) -> t.Dict[str, Intervals]:
        """Returns the missing intervals keyed by snapshot version.

        The result is computed on first access and cached until the start or the end
        of the plan changes.
        """
        if self.__missing_intervals is None:
            previous_ids = [
                SnapshotId(
                    name=snapshot.name,
                    identifier=snapshot.previous_version.fingerprint.to_identifier(),
                )
                for snapshot in self.snapshots
                if snapshot.previous_version
            ]

            # Skip the state lookup entirely when no snapshot has a previous version.
            previous_snapshots = (
                list(self._state_reader.get_snapshots(previous_ids).values())
                if previous_ids
                else []
            )

            end = self.end
            self.__missing_intervals = {
                snapshot.version_get_or_generate(): missing
                for snapshot, missing in self._state_reader.missing_intervals(
                    previous_snapshots + list(self.snapshots),
                    start=self._start or scheduler.earliest_start_date(self.snapshots),
                    end=end,
                    latest=end,
                    restatements=self.restatements,
                ).items()
            }

        return self.__missing_intervals

    def _add_restatements(self, restate_models: t.Iterable[str]) -> None:
        """Registers the given models and their materialized downstream models for restatement.

        Args:
            restate_models: The names of the models to restate from.

        Raises:
            PlanError: If nothing materialized can be restated starting from one of the models.
        """
        snapshots = self.context_diff.snapshots

        for table in restate_models:
            # Copy the result so that appending below never alters a list owned by the DAG.
            downstream = list(self._dag.downstream(table))
            if table in snapshots:
                downstream.append(table)

            downstream = [d for d in downstream if snapshots[d].is_materialized]

            if not downstream:
                raise PlanError(
                    f"Cannot restate from '{table}'. Either such model doesn't exist or no other model references it."
                )
            self._restatements.update(downstream)

    def _build_directly_and_indirectly_modified(self) -> t.Tuple[t.List[Snapshot], SnapshotMapping]:
        """Builds collections of directly and indirectly modified snapshots.

        Returns:
            The tuple in which the first element contains a list of added and directly modified
            snapshots while the second element contains a mapping of indirectly modified snapshots.
        """
        directly_modified = []
        all_indirectly_modified = set()

        for model_name, snapshot in self.context_diff.snapshots.items():
            if model_name in self.context_diff.modified_snapshots:
                if self.context_diff.directly_modified(model_name):
                    directly_modified.append(snapshot)
                else:
                    all_indirectly_modified.add(model_name)
            elif model_name in self.context_diff.added:
                # Added models are treated as directly modified.
                directly_modified.append(snapshot)

        indirectly_modified: SnapshotMapping = defaultdict(set)
        for snapshot in directly_modified:
            for downstream in self._dag.downstream(snapshot.name):
                if downstream in all_indirectly_modified:
                    indirectly_modified[snapshot.name].add(downstream)

        return (
            directly_modified,
            indirectly_modified,
        )

    def _categorize_snapshots(self) -> None:
        """Automatically categorizes snapshots that can be automatically categorized.

        Versions and change categories are assigned to the snapshots in place; nothing
        is returned.
        """
        for model_name, snapshot in self.context_diff.snapshots.items():
            upstream_model_names = self._dag.upstream(model_name)

            if not self.forward_only:
                self._ensure_no_paused_forward_only_upstream(model_name, upstream_model_names)

            if model_name in self.context_diff.modified_snapshots:
                is_directly_modified = self.context_diff.directly_modified(model_name)

                if self.is_new_snapshot(snapshot):
                    if self.forward_only:
                        # In case of the forward only plan any modifications result in reuse of the
                        # previous version for non-seed models.
                        # New snapshots of seed models are considered non-breaking ones.
                        if not snapshot.is_seed_kind:
                            snapshot.set_version(snapshot.previous_version)
                            snapshot.change_category = SnapshotChangeCategory.FORWARD_ONLY
                        else:
                            snapshot.set_version()
                            snapshot.change_category = SnapshotChangeCategory.NON_BREAKING
                    elif self.auto_categorization_enabled and is_directly_modified:
                        new, old = self.context_diff.modified_snapshots[model_name]
                        change_category = categorize_change(
                            new, old, config=self.categorizer_config
                        )
                        if change_category is not None:
                            self.set_choice(new, change_category)

                # set to breaking if an indirect child has no directly modified parents
                # that need a decision. this can happen when a revert to a parent causes
                # an indirectly modified snapshot to be created because of a new parent
                if (
                    not is_directly_modified
                    and not snapshot.version
                    and not any(
                        self.context_diff.directly_modified(upstream)
                        and not self.context_diff.snapshots[upstream].version
                        for upstream in upstream_model_names
                    )
                ):
                    snapshot.set_version()

            elif model_name in self.context_diff.added and self.is_new_snapshot(snapshot):
                snapshot.set_version()

    def _ensure_no_paused_forward_only_upstream(
        self, model_name: str, upstream_model_names: t.Iterable[str]
    ) -> None:
        """Ensures that the given model has no upstream model with a paused forward-only version.

        Args:
            model_name: The name of the model that is being checked.
            upstream_model_names: The names of the models upstream of the target model.

        Raises:
            PlanError: If a versioned upstream snapshot is both forward-only and paused.
        """
        for upstream in upstream_model_names:
            upstream_snapshot = self.context_diff.snapshots.get(upstream)
            if (
                upstream_snapshot
                and upstream_snapshot.version
                and upstream_snapshot.is_forward_only
                and upstream_snapshot.is_paused
            ):
                raise PlanError(
                    f"Model '{model_name}' depends on a paused version of model '{upstream}'. "
                    "Possible remedies: "
                    "1) make sure your codebase is up-to-date; "
                    f"2) promote the current version of model '{upstream}' in the production environment; "
                    "3) recreate this plan in a forward-only mode."
                )

    def _ensure_valid_date_range(
        self, start: t.Optional[TimeLike], end: t.Optional[TimeLike]
    ) -> None:
        """Ensures that a start / end date is only provided when the plan allows it.

        Args:
            start: The candidate start of the plan.
            end: The candidate end of the plan.

        Raises:
            PlanError: If either date is set while `is_start_and_end_allowed` is False.
        """
        if (start or end) and not self.is_start_and_end_allowed:
            raise PlanError(
                "The start and end dates can't be set for a production plan without restatements."
            )

    def _ensure_no_forward_only_revert(self) -> None:
        """Ensures that a previously superseded breaking / non-breaking snapshot is not being
        used again to replace an existing forward-only snapshot with the same version.

        In other words there is no going back to the original non-forward-only snapshot with
        the same version once a forward-only change for that version has been introduced.

        Raises:
            PlanError: If such a revert is detected.
        """
        for name, (candidate, promoted) in self.context_diff.modified_snapshots.items():
            if (
                candidate.snapshot_id not in self.context_diff.new_snapshots
                and promoted.is_forward_only
                and not candidate.is_forward_only
                and (
                    promoted.version == candidate.version
                    or candidate.data_version in promoted.previous_versions
                )
            ):
                raise PlanError(
                    f"Detected an existing version of model '{name}' that has been previously superseded by a forward-only change. "
                    "To proceed with the change, restamp this model's definition to produce a new version."
                )

    def _ensure_no_forward_only_new_models(self) -> None:
        """Ensures that a forward-only plan doesn't add new models.

        Raises:
            PlanError: If this is a forward-only plan and the context diff contains added models.
        """
        if self.forward_only and self.context_diff.added:
            raise PlanError("New models can't be added as part of the forward-only plan.")


class PlanStatus(str, Enum):
    """The status of a plan."""

    STARTED = "started"
    FINISHED = "finished"
    FAILED = "failed"

    @property
    def is_started(self) -> bool:
        return self == PlanStatus.STARTED

    @property
    def is_failed(self) -> bool:
        return self == PlanStatus.FAILED

    @property
    def is_finished(self) -> bool:
        return self == PlanStatus.FINISHED


class MissingIntervals(PydanticModel, frozen=True):
    """The intervals that are missing for a single snapshot."""

    snapshot_name: str
    intervals: Intervals

    @property
    def merged_intervals(self) -> Intervals:
        """Returns the intervals after passing them through `merge_intervals`."""
        return merge_intervals(self.intervals)

    def format_missing_range(self) -> str:
        """Formats the merged intervals as a comma-separated list of `(start, end)` pairs."""
        intervals = [make_inclusive(start, end) for start, end in self.merged_intervals]
        return ", ".join(f"({to_ds(start)}, {to_ds(end)})" for start, end in intervals)
38class Plan: 39 """Plan is the main class to represent user choices on how they want to backfill and version their models. 40 41 Args: 42 context_diff: The context diff that the plan is based on. 43 dag: The dag object to determine relationships. 44 state_reader: The state_reader to get metadata with. 45 start: The start time to backfill data. 46 end: The end time to backfill data. 47 apply: The callback to apply the plan. 48 restate_models: A list of models for which the data should be restated for the time range 49 specified in this plan. Note: models defined outside SQLMesh (external) won't be a part 50 of the restatement. 51 no_gaps: Whether to ensure that new snapshots for models that are already a 52 part of the target environment have no data gaps when compared against previous 53 snapshots for same models. 54 skip_backfill: Whether to skip the backfill step. 55 is_dev: Whether this plan is for development purposes. 56 forward_only: Whether the purpose of the plan is to make forward only changes. 57 environment_ttl: The period of time that a development environment should exist before being deleted. 58 categorizer_config: Auto categorization settings. 59 auto_categorization_enabled: Whether to apply auto categorization. 
60 """ 61 62 def __init__( 63 self, 64 context_diff: ContextDiff, 65 dag: DAG, 66 state_reader: StateReader, 67 start: t.Optional[TimeLike] = None, 68 end: t.Optional[TimeLike] = None, 69 apply: t.Optional[t.Callable[[Plan], None]] = None, 70 restate_models: t.Optional[t.Iterable[str]] = None, 71 no_gaps: bool = False, 72 skip_backfill: bool = False, 73 is_dev: bool = False, 74 forward_only: bool = False, 75 environment_ttl: t.Optional[str] = None, 76 categorizer_config: t.Optional[CategorizerConfig] = None, 77 auto_categorization_enabled: bool = True, 78 ): 79 self.context_diff = context_diff 80 self.override_start = start is not None 81 self.override_end = end is not None 82 self.plan_id: str = random_id() 83 self.no_gaps = no_gaps 84 self.skip_backfill = skip_backfill 85 self.is_dev = is_dev 86 self.forward_only = forward_only 87 self.environment_ttl = environment_ttl 88 self.categorizer_config = categorizer_config or CategorizerConfig() 89 self.auto_categorization_enabled = auto_categorization_enabled 90 self._start = start if start or not (is_dev and forward_only) else yesterday_ds() 91 self._end = end if end or not is_dev else now() 92 self._apply = apply 93 self._dag = dag 94 self._state_reader = state_reader 95 self.__missing_intervals: t.Optional[t.Dict[str, Intervals]] = None 96 self._restatements: t.Set[str] = set() 97 98 if restate_models and context_diff.new_snapshots: 99 raise PlanError( 100 "Model changes and restatements can't be a part of the same plan. " 101 "Revert or apply changes before proceeding with restatements." 102 ) 103 104 if not restate_models and is_dev and forward_only: 105 # Add model names for new forward-only snapshots to the restatement list 106 # in order to compute previews. 
107 restate_models = [ 108 s.name for s in context_diff.new_snapshots.values() if s.is_materialized 109 ] 110 111 self._add_restatements(restate_models or []) 112 113 self._ensure_valid_date_range(self._start, self._end) 114 self._ensure_no_forward_only_revert() 115 self._ensure_no_forward_only_new_models() 116 117 directly_indirectly_modified = self._build_directly_and_indirectly_modified() 118 self.directly_modified = directly_indirectly_modified[0] 119 self.indirectly_modified = directly_indirectly_modified[1] 120 121 self._categorize_snapshots() 122 123 self._categorized: t.Optional[t.List[Snapshot]] = None 124 self._uncategorized: t.Optional[t.List[Snapshot]] = None 125 126 @property 127 def categorized(self) -> t.List[Snapshot]: 128 """Returns the already categorized snapshots.""" 129 if self._categorized is None: 130 self._categorized = [s for s in self.directly_modified if s.version] 131 return self._categorized 132 133 @property 134 def uncategorized(self) -> t.List[Snapshot]: 135 """Returns the uncategorized snapshots.""" 136 if self._uncategorized is None: 137 self._uncategorized = [s for s in self.directly_modified if not s.version] 138 return self._uncategorized 139 140 @property 141 def start(self) -> TimeLike: 142 """Returns the start of the plan or the earliest date of all snapshots.""" 143 return self._start or ( 144 min( 145 start 146 for intervals_per_model in self._missing_intervals.values() 147 for start, _ in intervals_per_model 148 ) 149 if self._missing_intervals 150 else yesterday_ds() 151 ) 152 153 @start.setter 154 def start(self, new_start: TimeLike) -> None: 155 self._ensure_valid_date_range(new_start, self._end) 156 self.set_start(new_start) 157 158 def set_start(self, new_start: TimeLike) -> None: 159 self._start = new_start 160 self.__missing_intervals = None 161 162 @property 163 def end(self) -> TimeLike: 164 """Returns the end of the plan or now.""" 165 return self._end or now() 166 167 @end.setter 168 def end(self, new_end: 
TimeLike) -> None: 169 self._ensure_valid_date_range(self._start, new_end) 170 self._end = new_end 171 self.__missing_intervals = None 172 173 @property 174 def is_start_and_end_allowed(self) -> bool: 175 """Indicates whether this plan allows to set the start and end dates.""" 176 return self.is_dev or bool(self.restatements) 177 178 @property 179 def requires_backfill(self) -> bool: 180 return not self.skip_backfill and (bool(self.restatements) or bool(self.missing_intervals)) 181 182 @property 183 def missing_intervals(self) -> t.List[MissingIntervals]: 184 """Returns a list of missing intervals.""" 185 return [ 186 MissingIntervals( 187 snapshot_name=snapshot.name, 188 intervals=self._missing_intervals[snapshot.version_get_or_generate()], 189 ) 190 for snapshot in self.snapshots 191 if snapshot.version_get_or_generate() in self._missing_intervals 192 ] 193 194 @property 195 def snapshots(self) -> t.List[Snapshot]: 196 """Gets all the snapshots in the plan/environment.""" 197 return list(self.context_diff.snapshots.values()) 198 199 @property 200 def new_snapshots(self) -> t.List[Snapshot]: 201 """Gets only new snapshots in the plan/environment.""" 202 return list(self.context_diff.new_snapshots.values()) 203 204 @property 205 def environment(self) -> Environment: 206 """The environment of the plan.""" 207 expiration_ts = ( 208 to_timestamp(self.environment_ttl, relative_base=now()) 209 if self.is_dev and self.environment_ttl is not None 210 else None 211 ) 212 return Environment( 213 name=self.context_diff.environment, 214 snapshots=[snapshot.table_info for snapshot in self.snapshots], 215 start_at=self.start, 216 end_at=self._end, 217 plan_id=self.plan_id, 218 previous_plan_id=self.context_diff.previous_plan_id, 219 expiration_ts=expiration_ts, 220 ) 221 222 @property 223 def restatements(self) -> t.Set[str]: 224 return self._restatements 225 226 def is_new_snapshot(self, snapshot: Snapshot) -> bool: 227 """Returns True if the given snapshot is a new snapshot 
in this plan.""" 228 return snapshot.snapshot_id in self.context_diff.new_snapshots 229 230 def apply(self) -> None: 231 """Runs apply if an apply function was passed in.""" 232 if not self._apply: 233 raise SQLMeshError(f"Plan was not initialized with an applier.") 234 validate_date_range(self.start, self.end) 235 self._apply(self) 236 237 def set_choice(self, snapshot: Snapshot, choice: SnapshotChangeCategory) -> None: 238 """Sets a snapshot version based on the user choice. 239 240 Args: 241 snapshot: The target snapshot. 242 choice: The user decision on how to version the target snapshot and its children. 243 """ 244 if self.forward_only: 245 raise PlanError("Choice setting is not supported by a forward-only plan.") 246 if not self.is_new_snapshot(snapshot): 247 raise SQLMeshError( 248 f"A choice can't be changed for the existing version of model '{snapshot.name}'." 249 ) 250 251 snapshot.change_category = choice 252 if choice in ( 253 SnapshotChangeCategory.BREAKING, 254 SnapshotChangeCategory.NON_BREAKING, 255 ): 256 snapshot.set_version() 257 else: 258 snapshot.set_version(snapshot.previous_version) 259 260 for child in self.indirectly_modified[snapshot.name]: 261 child_snapshot = self.context_diff.snapshots[child] 262 263 if choice == SnapshotChangeCategory.BREAKING: 264 child_snapshot.set_version() 265 else: 266 child_snapshot.set_version(child_snapshot.previous_version) 267 snapshot.indirect_versions[child] = child_snapshot.all_versions 268 269 # If any other snapshot specified breaking this child, then that child 270 # needs to be backfilled as a part of the plan. 271 for upstream in self.directly_modified: 272 if child in upstream.indirect_versions: 273 data_version = upstream.indirect_versions[child][-1] 274 if data_version.is_new_version: 275 child_snapshot.set_version() 276 break 277 278 # Invalidate caches. 
279 self._categorized = None 280 self._uncategorized = None 281 282 def snapshot_change_category(self, snapshot: Snapshot) -> SnapshotChangeCategory: 283 """ 284 Determines the SnapshotChangeCategory for a modified snapshot using its available history. 285 286 A snapshot may be modified (directly or indirectly) multiple times. Each time 287 it is directly changed, the categorization is stored in its history. Look 288 through the snapshot's history to find where it deviated from the previous 289 snapshot and then find the most conservative categorization recorded. 290 291 Args: 292 snapshot: The snapshot within this plan 293 """ 294 if snapshot not in self.snapshots: 295 raise SQLMeshError(f"Snapshot {snapshot.snapshot_id} does not exist in this plan.") 296 297 if not snapshot.version: 298 raise SQLMeshError(f"Snapshot {snapshot.snapshot_id} has not be categorized yet.") 299 300 if snapshot.name not in self.context_diff.modified_snapshots: 301 raise SQLMeshError(f"Snapshot {snapshot.snapshot_id} has not been modified.") 302 303 current, previous = self.context_diff.modified_snapshots[snapshot.name] 304 if current.version == previous.version: 305 # Versions match, so no further history to check 306 return SnapshotChangeCategory.FORWARD_ONLY 307 elif previous.data_version in current.all_versions: 308 # Previous snapshot in the current snapshot's history. Get all versions 309 # since the two matched. 310 index = current.all_versions.index(previous.data_version) 311 versions = current.all_versions[index + 1 :] 312 elif current.data_version in previous.all_versions: 313 # Snapshot is a revert. Look through the previous snapshot's history 314 # and get all versions since it matched the current snapshot. 
315 index = previous.all_versions.index(current.data_version) 316 versions = previous.all_versions[index:] 317 else: 318 # Insufficient history, so err on the side of safety 319 return SnapshotChangeCategory.BREAKING 320 321 change_categories = [ 322 version.change_category for version in versions if version.change_category 323 ] 324 # Return the most conservative categorization found in the snapshot's history 325 return min(change_categories, key=lambda x: x.value) 326 327 @property 328 def _missing_intervals(self) -> t.Dict[str, Intervals]: 329 if self.__missing_intervals is None: 330 previous_ids = [ 331 SnapshotId( 332 name=snapshot.name, 333 identifier=snapshot.previous_version.fingerprint.to_identifier(), 334 ) 335 for snapshot in self.snapshots 336 if snapshot.previous_version 337 ] 338 339 previous_snapshots = ( 340 list(self._state_reader.get_snapshots(previous_ids).values()) 341 if previous_ids 342 else [] 343 ) 344 345 end = self.end 346 self.__missing_intervals = { 347 snapshot.version_get_or_generate(): missing 348 for snapshot, missing in self._state_reader.missing_intervals( 349 previous_snapshots + list(self.snapshots), 350 start=self._start or scheduler.earliest_start_date(self.snapshots), 351 end=end, 352 latest=end, 353 restatements=self.restatements, 354 ).items() 355 } 356 357 return self.__missing_intervals 358 359 def _add_restatements(self, restate_models: t.Iterable[str]) -> None: 360 for table in restate_models: 361 downstream = self._dag.downstream(table) 362 if table in self.context_diff.snapshots: 363 downstream.append(table) 364 365 snapshots = self.context_diff.snapshots 366 downstream = [d for d in downstream if snapshots[d].is_materialized] 367 368 if not downstream: 369 raise PlanError( 370 f"Cannot restate from '{table}'. Either such model doesn't exist or no other model references it." 
371 ) 372 self._restatements.update(downstream) 373 374 def _build_directly_and_indirectly_modified(self) -> t.Tuple[t.List[Snapshot], SnapshotMapping]: 375 """Builds collections of directly and inderectly modified snapshots. 376 377 Returns: 378 The tuple in which the first element contains a list of added and directly modified 379 snapshots while the second element contains a mapping of indirectly modified snapshots. 380 """ 381 directly_modified = [] 382 all_indirectly_modified = set() 383 384 for model_name, snapshot in self.context_diff.snapshots.items(): 385 if model_name in self.context_diff.modified_snapshots: 386 if self.context_diff.directly_modified(model_name): 387 directly_modified.append(snapshot) 388 else: 389 all_indirectly_modified.add(model_name) 390 elif model_name in self.context_diff.added: 391 directly_modified.append(snapshot) 392 393 indirectly_modified: SnapshotMapping = defaultdict(set) 394 for snapshot in directly_modified: 395 for downstream in self._dag.downstream(snapshot.name): 396 if downstream in all_indirectly_modified: 397 indirectly_modified[snapshot.name].add(downstream) 398 399 return ( 400 directly_modified, 401 indirectly_modified, 402 ) 403 404 def _categorize_snapshots(self) -> None: 405 """Automatically categorizes snapshots that can be automatically categorized and 406 returns a list of added and directly modified snapshots as well as the mapping of 407 indirectly modified snapshots. 
408 """ 409 for model_name, snapshot in self.context_diff.snapshots.items(): 410 upstream_model_names = self._dag.upstream(model_name) 411 412 if not self.forward_only: 413 self._ensure_no_paused_forward_only_upstream(model_name, upstream_model_names) 414 415 if model_name in self.context_diff.modified_snapshots: 416 is_directly_modified = self.context_diff.directly_modified(model_name) 417 418 if self.is_new_snapshot(snapshot): 419 if self.forward_only: 420 # In case of the forward only plan any modifications result in reuse of the 421 # previous version for non-seed models. 422 # New snapshots of seed models are considered non-breaking ones. 423 if not snapshot.is_seed_kind: 424 snapshot.set_version(snapshot.previous_version) 425 snapshot.change_category = SnapshotChangeCategory.FORWARD_ONLY 426 else: 427 snapshot.set_version() 428 snapshot.change_category = SnapshotChangeCategory.NON_BREAKING 429 elif self.auto_categorization_enabled and is_directly_modified: 430 new, old = self.context_diff.modified_snapshots[model_name] 431 change_category = categorize_change( 432 new, old, config=self.categorizer_config 433 ) 434 if change_category is not None: 435 self.set_choice(new, change_category) 436 437 # set to breaking if an indirect child has no directly modified parents 438 # that need a decision. 
this can happen when a revert to a parent causes 439 # an indirectly modified snapshot to be created because of a new parent 440 if ( 441 not is_directly_modified 442 and not snapshot.version 443 and not any( 444 self.context_diff.directly_modified(upstream) 445 and not self.context_diff.snapshots[upstream].version 446 for upstream in upstream_model_names 447 ) 448 ): 449 snapshot.set_version() 450 451 elif model_name in self.context_diff.added and self.is_new_snapshot(snapshot): 452 snapshot.set_version() 453 454 def _ensure_no_paused_forward_only_upstream( 455 self, model_name: str, upstream_model_names: t.Iterable[str] 456 ) -> None: 457 for upstream in upstream_model_names: 458 upstream_snapshot = self.context_diff.snapshots.get(upstream) 459 if ( 460 upstream_snapshot 461 and upstream_snapshot.version 462 and upstream_snapshot.is_forward_only 463 and upstream_snapshot.is_paused 464 ): 465 raise PlanError( 466 f"Model '{model_name}' depends on a paused version of model '{upstream}'. " 467 "Possible remedies: " 468 "1) make sure your codebase is up-to-date; " 469 f"2) promote the current version of model '{upstream}' in the production environment; " 470 "3) recreate this plan in a forward-only mode." 471 ) 472 473 def _ensure_valid_date_range( 474 self, start: t.Optional[TimeLike], end: t.Optional[TimeLike] 475 ) -> None: 476 if (start or end) and not self.is_start_and_end_allowed: 477 raise PlanError( 478 "The start and end dates can't be set for a production plan without restatements." 479 ) 480 481 def _ensure_no_forward_only_revert(self) -> None: 482 """Ensures that a previously superseded breaking / non-breaking snapshot is not being 483 used again to replace an existing forward-only snapshot with the same version. 484 485 In other words there is no going back to the original non-forward-only snapshot with 486 the same version once a forward-only change for that version has been introduced. 
487 """ 488 for name, (candidate, promoted) in self.context_diff.modified_snapshots.items(): 489 if ( 490 candidate.snapshot_id not in self.context_diff.new_snapshots 491 and promoted.is_forward_only 492 and not candidate.is_forward_only 493 and ( 494 promoted.version == candidate.version 495 or candidate.data_version in promoted.previous_versions 496 ) 497 ): 498 raise PlanError( 499 f"Detected an existing version of model '{name}' that has been previously superseded by a forward-only change. " 500 "To proceed with the change, restamp this model's definition to produce a new version." 501 ) 502 503 def _ensure_no_forward_only_new_models(self) -> None: 504 if self.forward_only and self.context_diff.added: 505 raise PlanError("New models can't be added as part of the forward-only plan.")
Plan is the main class to represent user choices on how they want to backfill and version their models.
Arguments:
- context_diff: The context diff that the plan is based on.
- dag: The dag object to determine relationships.
- state_reader: The state_reader to get metadata with.
- start: The start time to backfill data.
- end: The end time to backfill data.
- apply: The callback to apply the plan.
- restate_models: A list of models for which the data should be restated for the time range specified in this plan. Note: models defined outside SQLMesh (external) won't be a part of the restatement.
- no_gaps: Whether to ensure that new snapshots for models that are already a part of the target environment have no data gaps when compared against previous snapshots for the same models.
- skip_backfill: Whether to skip the backfill step.
- is_dev: Whether this plan is for development purposes.
- forward_only: Whether the purpose of the plan is to make forward-only changes.
- environment_ttl: The period of time that a development environment should exist before being deleted.
- categorizer_config: Auto categorization settings.
- auto_categorization_enabled: Whether to apply auto categorization.
def __init__(
    self,
    context_diff: ContextDiff,
    dag: DAG,
    state_reader: StateReader,
    start: t.Optional[TimeLike] = None,
    end: t.Optional[TimeLike] = None,
    apply: t.Optional[t.Callable[[Plan], None]] = None,
    restate_models: t.Optional[t.Iterable[str]] = None,
    no_gaps: bool = False,
    skip_backfill: bool = False,
    is_dev: bool = False,
    forward_only: bool = False,
    environment_ttl: t.Optional[str] = None,
    categorizer_config: t.Optional[CategorizerConfig] = None,
    auto_categorization_enabled: bool = True,
):
    """Stores the plan options, registers restatements, validates the plan and
    categorizes its snapshots.

    When no start is given for a forward-only development plan, the start falls back
    to `yesterday_ds()`. When no end is given for a development plan, the end falls
    back to `now()`.

    Raises:
        PlanError: If restatements are requested while the context diff contains
            new snapshots. The `_ensure_*` validators invoked here may raise as well.
    """
    # Record whether the caller explicitly provided the date range.
    self.override_start = start is not None
    self.override_end = end is not None

    # Plain option storage.
    self.context_diff = context_diff
    self.plan_id: str = random_id()
    self.no_gaps = no_gaps
    self.skip_backfill = skip_backfill
    self.is_dev = is_dev
    self.forward_only = forward_only
    self.environment_ttl = environment_ttl
    self.categorizer_config = categorizer_config or CategorizerConfig()
    self.auto_categorization_enabled = auto_categorization_enabled

    # Default the start only for forward-only development plans without a start.
    if not start and is_dev and forward_only:
        self._start = yesterday_ds()
    else:
        self._start = start

    # Default the end only for development plans without an end.
    if not end and is_dev:
        self._end = now()
    else:
        self._end = end

    self._apply = apply
    self._dag = dag
    self._state_reader = state_reader
    self.__missing_intervals: t.Optional[t.Dict[str, Intervals]] = None
    self._restatements: t.Set[str] = set()

    if restate_models:
        if context_diff.new_snapshots:
            raise PlanError(
                "Model changes and restatements can't be a part of the same plan. "
                "Revert or apply changes before proceeding with restatements."
            )
    elif is_dev and forward_only:
        # Add model names for new forward-only snapshots to the restatement list
        # in order to compute previews.
        restate_models = [
            new_snapshot.name
            for new_snapshot in context_diff.new_snapshots.values()
            if new_snapshot.is_materialized
        ]

    self._add_restatements(restate_models or [])

    # Validation runs after the restatements have been registered.
    self._ensure_valid_date_range(self._start, self._end)
    self._ensure_no_forward_only_revert()
    self._ensure_no_forward_only_new_models()

    (
        self.directly_modified,
        self.indirectly_modified,
    ) = self._build_directly_and_indirectly_modified()

    self._categorize_snapshots()

    # Lazily populated caches; reset to None to force recomputation.
    self._categorized: t.Optional[t.List[Snapshot]] = None
    self._uncategorized: t.Optional[t.List[Snapshot]] = None
Returns the already categorized snapshots.
Returns the start of the plan or the earliest date of all snapshots.
Returns a list of missing intervals.
Gets all the snapshots in the plan/environment.
Gets only new snapshots in the plan/environment.
def is_new_snapshot(self, snapshot: Snapshot) -> bool:
    """Checks whether the given snapshot is among the new snapshots of this plan.

    Args:
        snapshot: The snapshot to look up.

    Returns:
        True if the snapshot's ID is present in the context diff's new snapshots.
    """
    candidate_id = snapshot.snapshot_id
    return candidate_id in self.context_diff.new_snapshots
Returns True if the given snapshot is a new snapshot in this plan.
def apply(self) -> None:
    """Runs the apply callback that this plan was initialized with.

    The plan's start and end are passed through `validate_date_range` before the
    callback is invoked with this plan as its only argument.

    Raises:
        SQLMeshError: If no apply callback was provided when the plan was created.
    """
    if not self._apply:
        # Plain string literal: the message has no placeholders to interpolate.
        raise SQLMeshError("Plan was not initialized with an applier.")
    validate_date_range(self.start, self.end)
    self._apply(self)
Runs apply if an apply function was passed in.
def set_choice(self, snapshot: Snapshot, choice: SnapshotChangeCategory) -> None:
    """Applies the user's categorization to a snapshot and its indirectly modified children.

    Args:
        snapshot: The target snapshot.
        choice: The user decision on how to version the target snapshot and its children.

    Raises:
        PlanError: If this is a forward-only plan.
        SQLMeshError: If the snapshot is not a new snapshot in this plan.
    """
    if self.forward_only:
        raise PlanError("Choice setting is not supported by a forward-only plan.")
    if not self.is_new_snapshot(snapshot):
        raise SQLMeshError(
            f"A choice can't be changed for the existing version of model '{snapshot.name}'."
        )

    is_breaking = choice == SnapshotChangeCategory.BREAKING
    versions_directly = choice in (
        SnapshotChangeCategory.BREAKING,
        SnapshotChangeCategory.NON_BREAKING,
    )

    snapshot.change_category = choice
    if versions_directly:
        snapshot.set_version()
    else:
        snapshot.set_version(snapshot.previous_version)

    for child_name in self.indirectly_modified[snapshot.name]:
        child = self.context_diff.snapshots[child_name]

        # Only a breaking choice versions the child without reusing its previous version.
        if is_breaking:
            child.set_version()
        else:
            child.set_version(child.previous_version)
        snapshot.indirect_versions[child_name] = child.all_versions

        # If any other snapshot specified breaking this child, then that child
        # needs to be backfilled as a part of the plan.
        if any(
            child_name in parent.indirect_versions
            and parent.indirect_versions[child_name][-1].is_new_version
            for parent in self.directly_modified
        ):
            child.set_version()

    # Invalidate caches.
    self._categorized = None
    self._uncategorized = None
Sets a snapshot version based on the user choice.
Arguments:
- snapshot: The target snapshot.
- choice: The user decision on how to version the target snapshot and its children.
def snapshot_change_category(self, snapshot: Snapshot) -> SnapshotChangeCategory:
    """
    Determines the SnapshotChangeCategory for a modified snapshot using its available history.

    A snapshot may be modified (directly or indirectly) multiple times. Each time
    it is directly changed, the categorization is stored in its history. Look
    through the snapshot's history to find where it deviated from the previous
    snapshot and then find the most conservative categorization recorded.

    Args:
        snapshot: The snapshot within this plan

    Returns:
        FORWARD_ONLY when the current and previous versions match; BREAKING when the
        two histories can't be related or when no categorization is recorded in the
        relevant part of the history; otherwise the recorded category with the
        smallest `value`.

    Raises:
        SQLMeshError: If the snapshot is not part of this plan, has no version yet,
            or is not among the modified snapshots.
    """
    if snapshot not in self.snapshots:
        raise SQLMeshError(f"Snapshot {snapshot.snapshot_id} does not exist in this plan.")

    if not snapshot.version:
        raise SQLMeshError(f"Snapshot {snapshot.snapshot_id} has not be categorized yet.")

    if snapshot.name not in self.context_diff.modified_snapshots:
        raise SQLMeshError(f"Snapshot {snapshot.snapshot_id} has not been modified.")

    current, previous = self.context_diff.modified_snapshots[snapshot.name]
    if current.version == previous.version:
        # Versions match, so no further history to check
        return SnapshotChangeCategory.FORWARD_ONLY
    elif previous.data_version in current.all_versions:
        # Previous snapshot in the current snapshot's history. Get all versions
        # since the two matched.
        index = current.all_versions.index(previous.data_version)
        versions = current.all_versions[index + 1 :]
    elif current.data_version in previous.all_versions:
        # Snapshot is a revert. Look through the previous snapshot's history
        # and get all versions since it matched the current snapshot.
        # NOTE(review): unlike the branch above, this slice includes the matched
        # version itself (index rather than index + 1) -- confirm this is intended.
        index = previous.all_versions.index(current.data_version)
        versions = previous.all_versions[index:]
    else:
        # Insufficient history, so err on the side of safety
        return SnapshotChangeCategory.BREAKING

    change_categories = [
        version.change_category for version in versions if version.change_category
    ]
    if not change_categories:
        # No categorization was recorded for any of the inspected versions. min() would
        # raise ValueError on an empty sequence, so treat this like insufficient history.
        return SnapshotChangeCategory.BREAKING

    # Return the most conservative categorization found in the snapshot's history
    return min(change_categories, key=lambda category: category.value)
Determines the SnapshotChangeCategory for a modified snapshot using its available history.
A snapshot may be modified (directly or indirectly) multiple times. Each time it is directly changed, the categorization is stored in its history. Look through the snapshot's history to find where it deviated from the previous snapshot and then find the most conservative categorization recorded.
Arguments:
- snapshot: The snapshot within this plan
class PlanStatus(str, Enum):
    """String-valued enumeration of the states a plan can be in."""

    STARTED = "started"
    FINISHED = "finished"
    FAILED = "failed"

    @property
    def is_started(self) -> bool:
        """True only for the STARTED member."""
        return self is PlanStatus.STARTED

    @property
    def is_failed(self) -> bool:
        """True only for the FAILED member."""
        return self is PlanStatus.FAILED

    @property
    def is_finished(self) -> bool:
        """True only for the FINISHED member."""
        return self is PlanStatus.FINISHED
An enumeration.
Inherited Members
- enum.Enum
- name
- value
- builtins.str
- encode
- replace
- split
- rsplit
- join
- capitalize
- casefold
- title
- center
- count
- expandtabs
- find
- partition
- index
- ljust
- lower
- lstrip
- rfind
- rindex
- rjust
- rstrip
- rpartition
- splitlines
- strip
- swapcase
- translate
- upper
- startswith
- endswith
- removeprefix
- removesuffix
- isascii
- islower
- isupper
- istitle
- isspace
- isdecimal
- isdigit
- isnumeric
- isalpha
- isalnum
- isidentifier
- isprintable
- zfill
- format
- format_map
- maketrans
class MissingIntervals(PydanticModel, frozen=True):
    """Immutable pairing of a snapshot name with a collection of intervals."""

    # Name of the snapshot the intervals belong to.
    snapshot_name: str
    # The intervals as provided, before merging.
    intervals: Intervals

    @property
    def merged_intervals(self) -> Intervals:
        """The stored intervals passed through `merge_intervals`."""
        return merge_intervals(self.intervals)

    def format_missing_range(self) -> str:
        """Renders the merged intervals as a comma-separated list of "(start, end)" pairs.

        Each merged interval goes through `make_inclusive` and both of its bounds are
        formatted with `to_ds`.
        """
        rendered = []
        for interval_start, interval_end in self.merged_intervals:
            first, last = make_inclusive(interval_start, interval_end)
            rendered.append(f"({to_ds(first)}, {to_ds(last)})")
        return ", ".join(rendered)
Inherited Members
- pydantic.main.BaseModel
- BaseModel
- parse_obj
- parse_raw
- parse_file
- from_orm
- construct
- copy
- schema
- schema_json
- validate
- update_forward_refs