sqlmesh.core.snapshot.definition
from __future__ import annotations

import typing as t
import zlib
from collections import defaultdict
from enum import IntEnum

from croniter import croniter_range
from pydantic import validator
from sqlglot import exp

from sqlmesh.core import constants as c
from sqlmesh.core.audit import Audit
from sqlmesh.core.model import (
    Model,
    PythonModel,
    SeedModel,
    SqlModel,
    kind,
    parse_model_name,
)
from sqlmesh.core.model.meta import HookCall
from sqlmesh.utils.date import (
    TimeLike,
    make_inclusive,
    now,
    now_timestamp,
    to_datetime,
    to_timestamp,
)
from sqlmesh.utils.errors import SQLMeshError
from sqlmesh.utils.pydantic import PydanticModel

# A half-open [start_ts, end_ts) pair of epoch millisecond timestamps.
Interval = t.Tuple[int, int]
Intervals = t.List[Interval]


class SnapshotChangeCategory(IntEnum):
    """
    Values are ordered by decreasing severity and that ordering is required.

    BREAKING: The change requires that the modified snapshot and its downstream dependencies be rebuilt
    NON_BREAKING: The change requires that only the modified snapshot be rebuilt
    FORWARD_ONLY: The change is applied without rebuilding any existing data
    """

    BREAKING = 1
    NON_BREAKING = 2
    FORWARD_ONLY = 3


class SnapshotFingerprint(PydanticModel, frozen=True):
    data_hash: str
    metadata_hash: str
    parent_data_hash: str = "0"
    parent_metadata_hash: str = "0"

    def to_version(self) -> str:
        """Derives a version from the data-affecting hashes only."""
        return _hash([self.data_hash, self.parent_data_hash])

    def to_identifier(self) -> str:
        """Derives a unique identifier from all four component hashes."""
        return _hash(
            [
                self.data_hash,
                self.metadata_hash,
                self.parent_data_hash,
                self.parent_metadata_hash,
            ]
        )


class SnapshotId(PydanticModel, frozen=True):
    name: str
    identifier: str

    @property
    def snapshot_id(self) -> SnapshotId:
        """Helper method to return self."""
        return self


class SnapshotNameVersion(PydanticModel, frozen=True):
    name: str
    version: str


class SnapshotDataVersion(PydanticModel, frozen=True):
    fingerprint: SnapshotFingerprint
    version: str
    change_category: t.Optional[SnapshotChangeCategory]

    @property
    def data_version(self) -> SnapshotDataVersion:
        return self

    @property
    def is_new_version(self) -> bool:
        """Returns whether or not this version is new and requires a backfill."""
        return self.fingerprint.to_version() == self.version


class QualifiedViewName(PydanticModel, frozen=True):
    catalog: t.Optional[str]
    schema_name: t.Optional[str]
    table: str

    def for_environment(self, environment: str) -> str:
        """Returns the dotted view name for the given environment, omitting absent parts."""
        return ".".join(
            p
            for p in (
                self.catalog,
                self.schema_for_environment(environment),
                self.table,
            )
            if p is not None
        )

    def schema_for_environment(self, environment: str) -> str:
        """Returns the schema name, suffixed with __<environment> outside of prod."""
        schema = self.schema_name or "default"
        if environment.lower() != c.PROD:
            schema = f"{schema}__{environment}"
        return schema


class SnapshotInfoMixin:
    name: str
    fingerprint: SnapshotFingerprint
    physical_schema: str
    previous_versions: t.Tuple[SnapshotDataVersion, ...] = ()

    def is_temporary_table(self, is_dev: bool) -> bool:
        """Provided whether the snapshot is used in a development mode or not, returns True
        if the snapshot targets a temporary table or a clone and False otherwise.
        """
        return is_dev and not self.is_new_version

    @property
    def identifier(self) -> str:
        return self.fingerprint.to_identifier()

    @property
    def snapshot_id(self) -> SnapshotId:
        return SnapshotId(name=self.name, identifier=self.identifier)

    @property
    def qualified_view_name(self) -> QualifiedViewName:
        (catalog, schema, table) = parse_model_name(self.name)
        return QualifiedViewName(catalog=catalog, schema_name=schema, table=table)

    @property
    def previous_version(self) -> t.Optional[SnapshotDataVersion]:
        """Helper method to get the previous data version."""
        if self.previous_versions:
            return self.previous_versions[-1]
        return None

    @property
    def data_version(self) -> SnapshotDataVersion:
        raise NotImplementedError

    @property
    def is_new_version(self) -> bool:
        raise NotImplementedError

    @property
    def is_forward_only(self) -> bool:
        # Forward-only: the data hash changed relative to the previous version,
        # yet the snapshot reuses an existing version (no backfill).
        return not self.data_hash_matches(self.previous_version) and not self.is_new_version

    @property
    def all_versions(self) -> t.Tuple[SnapshotDataVersion, ...]:
        """Returns previous versions with the current version trimmed to DATA_VERSION_LIMIT."""
        return (*self.previous_versions, self.data_version)[-c.DATA_VERSION_LIMIT :]

    def data_hash_matches(self, other: t.Optional[SnapshotInfoMixin | SnapshotDataVersion]) -> bool:
        return other is not None and self.fingerprint.data_hash == other.fingerprint.data_hash

    def _table_name(self, version: str, is_dev: bool, for_read: bool) -> str:
        """Full table name pointing to the materialized location of the snapshot.

        Args:
            version: The snapshot version.
            is_dev: Whether the table name will be used in development mode.
            for_read: Whether the table name will be used for reading by a different snapshot.
        """
        if is_dev and for_read:
            # If this snapshot is used for reading, return a temporary table
            # only if this snapshot captures direct changes applied to its model.
            version = self.fingerprint.to_version() if self.is_forward_only else version
            is_temp = self.is_temporary_table(True) and self.is_forward_only
        elif is_dev:
            version = self.fingerprint.to_version()
            is_temp = self.is_temporary_table(True)
        else:
            is_temp = False

        return table_name(
            self.physical_schema,
            self.name,
            version,
            is_temp=is_temp,
        )


class SnapshotTableInfo(PydanticModel, SnapshotInfoMixin, frozen=True):
    name: str
    fingerprint: SnapshotFingerprint
    version: str
    physical_schema: str
    parents: t.Tuple[SnapshotId, ...]
    previous_versions: t.Tuple[SnapshotDataVersion, ...] = ()
    change_category: t.Optional[SnapshotChangeCategory]
    is_materialized: bool
    is_embedded_kind: bool

    def table_name(self, is_dev: bool = False, for_read: bool = False) -> str:
        """Full table name pointing to the materialized location of the snapshot.

        Args:
            is_dev: Whether the table name will be used in development mode.
            for_read: Whether the table name will be used for reading by a different snapshot.
        """
        return self._table_name(self.version, is_dev, for_read)

    @property
    def table_info(self) -> SnapshotTableInfo:
        """Helper method to return self."""
        return self

    @property
    def data_version(self) -> SnapshotDataVersion:
        return SnapshotDataVersion(
            fingerprint=self.fingerprint,
            version=self.version,
            change_category=self.change_category,
        )

    @property
    def is_new_version(self) -> bool:
        """Returns whether or not this version is new and requires a backfill."""
        return self.fingerprint.to_version() == self.version


class Snapshot(PydanticModel, SnapshotInfoMixin):
    """A snapshot represents a model at a certain point in time.

    Snapshots are used to encapsulate everything needed to evaluate a model.
    They are standalone objects that hold all state and dynamic content necessary
    to render a model's query including things like macros. Snapshots also store intervals
    (timestamp ranges for what data we've processed).

    Models can be dynamically rendered due to macros. Rendering a model to its full extent
    requires storing variables and macro definitions. We store all of the macro definitions and
    global variable references in `python_env` in raw text to avoid pickling. The helper methods
    to achieve this are defined in utils.metaprogramming.

    Args:
        name: The snapshot name which is the same as the model name and should be unique per model.
        fingerprint: A unique hash of the model definition so that models can be reused across environments.
        physical_schema: The physical schema that the snapshot is stored in.
        model: Model object that the snapshot encapsulates.
        parents: The list of parent snapshots (upstream dependencies).
        audits: The list of audits used by the model.
        intervals: List of [start, end) intervals showing which time ranges a snapshot has data for.
        dev_intervals: List of [start, end) intervals processed against the temporary (dev) table.
        created_ts: Epoch millis timestamp when a snapshot was first created.
        updated_ts: Epoch millis timestamp when a snapshot was last updated.
        ttl: The time-to-live of a snapshot determines when it should be deleted after it's no longer referenced
            in any environment.
        previous: The snapshot data version that this snapshot was based on. If this snapshot is new, then previous will be None.
        version: User specified version for a snapshot that is used for physical storage.
            By default, the version is the fingerprint, but not all changes to models require a backfill.
            If a user passes a previous version, that will be used instead and no backfill will be required.
        change_category: User specified change category indicating which models require backfill from model changes made in this snapshot.
        unpaused_ts: The timestamp which indicates when this snapshot was unpaused. Unpaused means that
            this snapshot is evaluated on a recurring basis. None indicates that this snapshot is paused.
    """

    name: str
    fingerprint: SnapshotFingerprint
    physical_schema: str
    model: Model
    parents: t.Tuple[SnapshotId, ...]
    audits: t.Tuple[Audit, ...]
    intervals: Intervals
    dev_intervals: Intervals
    created_ts: int
    updated_ts: int
    ttl: str
    previous_versions: t.Tuple[SnapshotDataVersion, ...] = ()
    indirect_versions: t.Dict[str, t.Tuple[SnapshotDataVersion, ...]] = {}
    version: t.Optional[str] = None
    change_category: t.Optional[SnapshotChangeCategory] = None
    unpaused_ts: t.Optional[int] = None

    @validator("ttl")
    @classmethod
    def _time_delta_must_be_positive(cls, v: str) -> str:
        current_time = now()
        if to_datetime(v, current_time) < current_time:
            raise ValueError(
                "Must be positive. Use the 'in' keyword to denote a positive time interval. For example, 'in 7 days'."
            )
        return v

    @staticmethod
    def merge_snapshots(
        targets: t.Iterable[SnapshotIdLike],
        snapshots: t.Dict[SnapshotId, Snapshot],
    ) -> t.List[Snapshot]:
        """Merge target snapshots with others so that each target snapshot has intervals from all other snapshots with the same version.

        Args:
            targets: Iterable of snapshot-like objects
            snapshots: Dictionary of snapshot ids to snapshot.

        Returns:
            List of target snapshots with merged intervals.

        Raises:
            SQLMeshError: If a target snapshot id is not present in `snapshots`.
        """
        merged = []
        snapshots_by_name_version = defaultdict(list)

        for s in snapshots.values():
            snapshots_by_name_version[(s.name, s.version)].append(s)

        for snapshot_like in targets:
            snapshot_id = snapshot_like.snapshot_id
            snapshot = snapshots.get(snapshot_id)
            if not snapshot:
                raise SQLMeshError(f"The snapshot {snapshot_id} was not found")

            # Work on a copy so the input snapshots are not mutated.
            snapshot = snapshot.copy()
            snapshot.intervals = []

            for other in snapshots_by_name_version[(snapshot.name, snapshot.version)]:
                snapshot.merge_intervals(other)

            merged.append(snapshot)

        return merged

    @classmethod
    def from_model(
        cls,
        model: Model,
        *,
        physical_schema: str,
        models: t.Dict[str, Model],
        ttl: str = c.DEFAULT_SNAPSHOT_TTL,
        version: t.Optional[str] = None,
        audits: t.Optional[t.Dict[str, Audit]] = None,
        cache: t.Optional[t.Dict[str, SnapshotFingerprint]] = None,
    ) -> Snapshot:
        """Creates a new snapshot for a model.

        Args:
            model: Model to snapshot.
            physical_schema: The schema of the snapshot which represents where it is stored.
            models: Dictionary of all models in the graph to make the fingerprint dependent on parent changes.
                If no dictionary is passed in the fingerprint will not be dependent on a model's parents.
            ttl: A TTL to determine how long orphaned (snapshots that are not promoted anywhere) should live.
            version: The version that a snapshot is associated with. Usually set during the planning phase.
            audits: Available audits by name.
            cache: Cache of model name to fingerprints.

        Returns:
            The newly created snapshot.
        """
        created_ts = now_timestamp()

        audits = audits or {}

        return cls(
            name=model.name,
            fingerprint=fingerprint_from_model(
                model,
                physical_schema=physical_schema,
                models=models,
                audits=audits,
                cache=cache,
            ),
            physical_schema=physical_schema,
            model=model,
            parents=tuple(
                SnapshotId(
                    name=name,
                    identifier=fingerprint_from_model(
                        models[name],
                        physical_schema=physical_schema,
                        models=models,
                        audits=audits,
                        cache=cache,
                    ).to_identifier(),
                )
                for name in _parents_from_model(model, models)
            ),
            audits=tuple(model.referenced_audits(audits)),
            intervals=[],
            dev_intervals=[],
            created_ts=created_ts,
            updated_ts=created_ts,
            ttl=ttl,
            version=version,
        )

    def __eq__(self, other: t.Any) -> bool:
        # Equality is defined by the fingerprint only; intervals/timestamps are ignored.
        return isinstance(other, Snapshot) and self.fingerprint == other.fingerprint

    def __hash__(self) -> int:
        return hash((self.__class__, self.fingerprint))

    def add_interval(self, start: TimeLike, end: TimeLike, is_dev: bool = False) -> None:
        """Add a newly processed time interval to the snapshot.

        The actual stored intervals are [start_ts, end_ts) or start epoch timestamp inclusive and end epoch
        timestamp exclusive. This allows merging of ranges to be easier.

        Args:
            start: The start date/time of the interval (inclusive)
            end: The end date/time of the interval. If end is a date, then it is considered inclusive.
                If it is a datetime object, then it is exclusive.
            is_dev: Indicates whether the given interval is being added while in development mode.
        """
        is_temp_table = self.is_temporary_table(is_dev)
        intervals = self.dev_intervals if is_temp_table else self.intervals

        intervals.append(self._inclusive_exclusive(start, end))

        if len(intervals) < 2:
            return

        merged_intervals = merge_intervals(intervals)
        if is_temp_table:
            self.dev_intervals = merged_intervals
        else:
            self.intervals = merged_intervals

    def remove_interval(self, start: TimeLike, end: TimeLike) -> None:
        """Remove an interval from the snapshot.

        Args:
            start: Start interval to remove.
            end: End interval to remove.
        """
        interval = self._inclusive_exclusive(start, end)
        self.intervals = remove_interval(self.intervals, *interval)
        self.dev_intervals = remove_interval(self.dev_intervals, *interval)

    def _inclusive_exclusive(self, start: TimeLike, end: TimeLike) -> t.Tuple[int, int]:
        """Normalizes a user-supplied inclusive range to a [start_ts, end_ts) interval
        aligned to the model's cron schedule."""
        start_dt, end_dt = make_inclusive(start, end)
        start_ts = to_timestamp(self.model.cron_floor(start_dt))
        end_ts = to_timestamp(self.model.cron_next(end_dt))

        if start_ts >= end_ts:
            raise ValueError("`end` must be >= `start`")
        return (start_ts, end_ts)

    def merge_intervals(self, other: Snapshot) -> None:
        """Inherits intervals from the target snapshot.

        Args:
            other: The target snapshot to inherit intervals from.
        """
        for start, end in other.intervals:
            self.add_interval(start, end)

    def missing_intervals(
        self, start: TimeLike, end: TimeLike, latest: t.Optional[TimeLike] = None
    ) -> Intervals:
        """Find all missing intervals between [start, end].

        Although the inputs are inclusive, the returned stored intervals are
        [start_ts, end_ts) or start epoch timestamp inclusive and end epoch
        timestamp exclusive.

        Args:
            start: The start date/time of the interval (inclusive)
            end: The end date/time of the interval (inclusive)
            latest: The latest datetime to use for non-incremental (full/view/seed) models.
                Defaults to now.

        Returns:
            A list of all the missing intervals as epoch timestamps.
        """
        if self.is_embedded_kind:
            return []

        start_dt, end_dt = make_inclusive(start, self.model.cron_floor(end))

        if self.is_full_kind or self.is_view_kind or self.is_seed_kind:
            latest_dt = to_datetime(self.model.cron_floor(latest or now()))
            latest_ts = to_timestamp(latest_dt)
            # if the latest ts is stored in the last interval, nothing is missing
            # else returns the latest ts with the exclusive end ts.
            if self.intervals and self.intervals[-1][1] >= latest_ts:
                return []
            return [(to_timestamp(self.model.cron_prev(latest_dt)), latest_ts)]

        missing = []
        dates = list(croniter_range(start_dt, end_dt, self.model.normalized_cron()))
        size = len(dates)

        for i in range(size):
            current_ts = to_timestamp(dates[i])
            end_ts = (
                to_timestamp(dates[i + 1])
                if i + 1 < size
                else to_timestamp(self.model.cron_next(current_ts))
            )

            for low, high in self.intervals:
                if current_ts < low:
                    missing.append((current_ts, end_ts))
                    break
                elif current_ts < high:
                    # Covered by an existing interval.
                    break
            else:
                missing.append((current_ts, end_ts))

        return missing

    def set_version(
        self,
        version: t.Optional[str | SnapshotDataVersion | SnapshotTableInfo | Snapshot] = None,
    ) -> None:
        """Set the version of this snapshot.

        If no version is passed, the fingerprint of the snapshot will be used.

        Args:
            version: Either a string or a TableInfo to use.
        """
        if isinstance(version, (SnapshotDataVersion, SnapshotTableInfo, Snapshot)):
            self.version = version.data_version.version
        else:
            self.version = version or self.fingerprint.to_version()

    def set_unpaused_ts(self, unpaused_dt: t.Optional[TimeLike]) -> None:
        """Sets the timestamp for when this snapshot was unpaused.

        Args:
            unpaused_dt: The datetime object of when this snapshot was unpaused.
        """
        self.unpaused_ts = to_timestamp(self.model.cron_floor(unpaused_dt)) if unpaused_dt else None

    def table_name(self, is_dev: bool = False, for_read: bool = False) -> str:
        """Full table name pointing to the materialized location of the snapshot.

        Args:
            is_dev: Whether the table name will be used in development mode.
            for_read: Whether the table name will be used for reading by a different snapshot.
        """
        self._ensure_version()
        assert self.version
        return self._table_name(self.version, is_dev, for_read)

    def table_name_for_mapping(self, is_dev: bool = False) -> str:
        """Full table name used by a child snapshot for table mapping during evaluation.

        Args:
            is_dev: Whether the table name will be used in development mode.
        """
        self._ensure_version()
        assert self.version

        if is_dev and self.is_forward_only:
            # If this snapshot is unpaused we shouldn't be using a temporary
            # table for mapping purposes.
            is_dev = self.is_paused

        return self._table_name(self.version, is_dev, True)

    def version_get_or_generate(self) -> str:
        """Helper method to get the version or generate it from the fingerprint."""
        return self.version or self.fingerprint.to_version()

    @property
    def table_info(self) -> SnapshotTableInfo:
        """Helper method to get the SnapshotTableInfo from the Snapshot."""
        self._ensure_version()
        return SnapshotTableInfo(
            physical_schema=self.physical_schema,
            name=self.name,
            fingerprint=self.fingerprint,
            version=self.version,
            parents=self.parents,
            previous_versions=self.previous_versions,
            change_category=self.change_category,
            is_materialized=self.is_materialized,
            is_embedded_kind=self.is_embedded_kind,
        )

    @property
    def data_version(self) -> SnapshotDataVersion:
        self._ensure_version()
        return SnapshotDataVersion(
            fingerprint=self.fingerprint,
            version=self.version,
            change_category=self.change_category,
        )

    @property
    def is_new_version(self) -> bool:
        """Returns whether or not this version is new and requires a backfill."""
        self._ensure_version()
        return self.fingerprint.to_version() == self.version

    @property
    def is_full_kind(self) -> bool:
        return self.model.kind.is_full

    @property
    def is_view_kind(self) -> bool:
        return self.model.kind.is_view

    @property
    def is_incremental_by_time_range_kind(self) -> bool:
        return self.model.kind.is_incremental_by_time_range

    @property
    def is_incremental_by_unique_key_kind(self) -> bool:
        return self.model.kind.is_incremental_by_unique_key

    @property
    def is_embedded_kind(self) -> bool:
        return self.model.kind.is_embedded

    @property
    def is_seed_kind(self) -> bool:
        return self.model.kind.is_seed

    @property
    def is_materialized(self) -> bool:
        return self.model.kind.is_materialized

    @property
    def is_paused(self) -> bool:
        return self.unpaused_ts is None

    def _ensure_version(self) -> None:
        if not self.version:
            raise SQLMeshError(f"Snapshot {self.snapshot_id} has not been versioned yet.")


SnapshotIdLike = t.Union[SnapshotId, SnapshotTableInfo, Snapshot]
SnapshotInfoLike = t.Union[SnapshotTableInfo, Snapshot]
SnapshotNameVersionLike = t.Union[SnapshotNameVersion, SnapshotTableInfo, Snapshot]


def table_name(physical_schema: str, name: str, version: str, is_temp: bool = False) -> str:
    """Builds the physical table name for a snapshot version, e.g. schema.db__model__version."""
    # Fixed typo: temp_suffx -> temp_suffix (local variable only; output unchanged).
    temp_suffix = "__temp" if is_temp else ""
    return f"{physical_schema}.{name.replace('.', '__')}__{version}{temp_suffix}"


def fingerprint_from_model(
    model: Model,
    *,
    models: t.Dict[str, Model],
    physical_schema: str = "",
    audits: t.Optional[t.Dict[str, Audit]] = None,
    cache: t.Optional[t.Dict[str, SnapshotFingerprint]] = None,
) -> SnapshotFingerprint:
    """Helper function to generate a fingerprint based on a model's query and environment.

    This method tries to remove non meaningful differences to avoid ever changing fingerprints.
    The fingerprint is made up of two parts split by an underscore -- query_metadata. The query hash is
    determined purely by the rendered query and the metadata by everything else.

    Args:
        model: Model to fingerprint.
        physical_schema: The physical_schema of the snapshot which represents where it is stored.
        models: Dictionary of all models in the graph to make the fingerprint dependent on parent changes.
            If no dictionary is passed in the fingerprint will not be dependent on a model's parents.
        audits: Available audits by name.
        cache: Cache of model name to fingerprints.

    Returns:
        The fingerprint.
    """
    cache = {} if cache is None else cache

    if model.name not in cache:
        # Recursively fingerprint parents first so that a parent change propagates here.
        parents = [
            fingerprint_from_model(
                models[table],
                models=models,
                physical_schema=physical_schema,
                audits=audits,
                cache=cache,
            )
            for table in model.depends_on
            if table in models
        ]

        # Sorting makes the combined hash independent of dependency iteration order.
        parent_data_hash = _hash(sorted(p.to_version() for p in parents))

        parent_metadata_hash = _hash(
            sorted(h for p in parents for h in (p.metadata_hash, p.parent_metadata_hash))
        )

        cache[model.name] = SnapshotFingerprint(
            data_hash=_model_data_hash(model, physical_schema),
            metadata_hash=_model_metadata_hash(model, audits or {}),
            parent_data_hash=parent_data_hash,
            parent_metadata_hash=parent_metadata_hash,
        )

    return cache[model.name]


def _model_data_hash(model: Model, physical_schema: str) -> str:
    """Hashes everything about a model that affects the data it produces."""

    def serialize_hooks(hooks: t.List[HookCall]) -> t.Iterable[str]:
        serialized = []
        for hook in hooks:
            if isinstance(hook, exp.Expression):
                serialized.append(hook.sql())
            else:
                name, args = hook
                serialized.append(
                    f"{name}:"
                    + ",".join(
                        f"{k}={v.sql(identify=True, comments=False)}"
                        for k, v in sorted(args.items())
                    )
                )
        return serialized

    data = [
        str(model.sorted_python_env),
        model.kind.name,
        model.cron,
        model.storage_format,
        physical_schema,
        *(model.partitioned_by or []),
        *(expression.sql(identify=True, comments=False) for expression in model.expressions or []),
        *serialize_hooks(model.pre),
        *serialize_hooks(model.post),
        model.stamp,
    ]

    if isinstance(model, SqlModel):
        data.append(model.query.sql(identify=True, comments=False))

        for macro_name, macro in sorted(model.jinja_macros.root_macros.items(), key=lambda x: x[0]):
            data.append(macro_name)
            data.append(macro.definition)

        for package in model.jinja_macros.packages.values():
            for macro_name, macro in sorted(package.items(), key=lambda x: x[0]):
                data.append(macro_name)
                data.append(macro.definition)
    elif isinstance(model, PythonModel):
        data.append(model.entrypoint)
        for column_name, column_type in model.columns_to_types.items():
            data.append(column_name)
            data.append(str(column_type))
    elif isinstance(model, SeedModel):
        data.append(str(model.kind.batch_size))
        data.append(model.seed.content)
        for column_name, column_type in (model.columns_to_types_ or {}).items():
            data.append(column_name)
            data.append(column_type.sql())

    if isinstance(model.kind, kind.IncrementalByTimeRangeKind):
        data.append(model.kind.time_column.column)
        data.append(model.kind.time_column.format)
    elif isinstance(model.kind, kind.IncrementalByUniqueKeyKind):
        data.extend(model.kind.unique_key)

    return _hash(data)


def _model_metadata_hash(model: Model, audits: t.Dict[str, Audit]) -> str:
    """Hashes model attributes that do not affect produced data (owner, docs, audits, comments)."""
    metadata = [
        model.dialect,
        model.owner,
        model.description,
        str(to_timestamp(model.start)) if model.start else None,
        str(model.batch_size) if model.batch_size is not None else None,
    ]

    for audit_name, audit_args in sorted(model.audits, key=lambda a: a[0]):
        if audit_name not in audits:
            continue

        audit = audits[audit_name]
        metadata.extend(
            [
                audit.name,
                audit.render_query(model, **t.cast(t.Dict[str, t.Any], audit_args)).sql(
                    identify=True, comments=True
                ),
                audit.dialect,
                str(audit.skip),
                str(audit.blocking),
            ]
        )

    # Add comments from the model query.
    for e, _, _ in model.render_query().walk():
        if e.comments:
            metadata.extend(e.comments)

    return _hash(metadata)


def _hash(data: t.Iterable[t.Optional[str]]) -> str:
    """CRC32 of the ';'-joined items; None items hash as the empty string."""
    return str(zlib.crc32(";".join("" if d is None else d for d in data).encode("utf-8")))


def _parents_from_model(
    model: Model,
    models: t.Dict[str, Model],
) -> t.Set[str]:
    """Returns the model's known parents, expanding through embedded-kind parents transitively."""
    parent_tables = set()
    for table in model.depends_on:
        if table in models:
            parent_tables.add(table)
            if models[table].kind.is_embedded:
                parent_tables.update(_parents_from_model(models[table], models))

    return parent_tables


def merge_intervals(intervals: Intervals) -> Intervals:
    """Merge a list of intervals.

    Args:
        intervals: A list of intervals to merge together.

    Returns:
        A new list of sorted and merged intervals.
    """
    intervals = sorted(intervals)

    if not intervals:
        # Guard against an empty input instead of raising an IndexError below.
        return []

    merged = [intervals[0]]

    for interval in intervals[1:]:
        current = merged[-1]

        if interval[0] <= current[1]:
            # Overlapping or adjacent: extend the last merged interval.
            merged[-1] = (current[0], max(current[1], interval[1]))
        else:
            merged.append(interval)

    return merged


def remove_interval(intervals: Intervals, remove_start: int, remove_end: int) -> Intervals:
    """Remove an interval from a list of intervals.

    Args:
        intervals: A list of exclusive intervals.
        remove_start: The inclusive start to remove.
        remove_end: The exclusive end to remove.

    Returns:
        A new list of intervals.
    """
    modified: Intervals = []

    for start, end in intervals:
        if remove_start > start and remove_end < end:
            # Removal range is strictly inside: split into two pieces.
            modified.extend(
                (
                    (start, remove_start),
                    (remove_end, end),
                )
            )
        elif remove_start > start:
            modified.append((start, min(remove_start, end)))
        elif remove_end < end:
            modified.append((max(remove_end, start), end))

    return modified


def to_table_mapping(snapshots: t.Iterable[Snapshot], is_dev: bool) -> t.Dict[str, str]:
    """Maps each versioned, non-embedded snapshot name to its evaluation table name."""
    return {
        snapshot.name: snapshot.table_name_for_mapping(is_dev=is_dev)
        for snapshot in snapshots
        if snapshot.version and not snapshot.is_embedded_kind
    }
class SnapshotChangeCategory(IntEnum):
    """
    Values are ordered by decreasing severity and that ordering is required.

    BREAKING: The change requires that the modified snapshot and its downstream dependencies be rebuilt
    NON_BREAKING: The change requires that only the modified snapshot be rebuilt
    FORWARD_ONLY: The change is applied without rebuilding any existing data
    """

    BREAKING = 1
    NON_BREAKING = 2
    FORWARD_ONLY = 3
Values are ordered by decreasing severity and that ordering is required.
BREAKING: The change requires that snapshot modified and downstream dependencies be rebuilt NON_BREAKING: The change requires that only the snapshot modified be rebuilt FORWARD_ONLY: The change requires no rebuilding
Inherited Members
- enum.Enum
- name
- value
- builtins.int
- conjugate
- bit_length
- bit_count
- to_bytes
- from_bytes
- as_integer_ratio
- real
- imag
- numerator
- denominator
class SnapshotFingerprint(PydanticModel, frozen=True):
    """Fingerprint of a snapshot, split into data-affecting and metadata-only hashes.

    Parent hashes default to "0" for models with no fingerprinted parents.
    """

    data_hash: str
    metadata_hash: str
    parent_data_hash: str = "0"
    parent_metadata_hash: str = "0"

    def to_version(self) -> str:
        # Only the data-affecting hashes participate in the version.
        return _hash((self.data_hash, self.parent_data_hash))

    def to_identifier(self) -> str:
        # The identifier folds in every component hash.
        components = (
            self.data_hash,
            self.metadata_hash,
            self.parent_data_hash,
            self.parent_metadata_hash,
        )
        return _hash(components)
Inherited Members
- pydantic.main.BaseModel
- BaseModel
- parse_obj
- parse_raw
- parse_file
- from_orm
- construct
- copy
- schema
- schema_json
- validate
- update_forward_refs
class SnapshotId(PydanticModel, frozen=True):
    """Unique identity of a snapshot: the model name plus its fingerprint identifier."""

    name: str
    identifier: str

    @property
    def snapshot_id(self) -> SnapshotId:
        """Shared accessor so snapshot-like objects can all yield a SnapshotId; returns self."""
        return self
Inherited Members
- pydantic.main.BaseModel
- BaseModel
- parse_obj
- parse_raw
- parse_file
- from_orm
- construct
- copy
- schema
- schema_json
- validate
- update_forward_refs
Inherited Members
- pydantic.main.BaseModel
- BaseModel
- parse_obj
- parse_raw
- parse_file
- from_orm
- construct
- copy
- schema
- schema_json
- validate
- update_forward_refs
class SnapshotDataVersion(PydanticModel, frozen=True):
    """A (fingerprint, version) pair recording which version a snapshot's data was stored under."""

    fingerprint: SnapshotFingerprint
    version: str
    change_category: t.Optional[SnapshotChangeCategory]

    @property
    def data_version(self) -> SnapshotDataVersion:
        """Shared accessor with snapshot-like objects; returns self."""
        return self

    @property
    def is_new_version(self) -> bool:
        """Returns whether or not this version is new and requires a backfill."""
        fingerprint_version = self.fingerprint.to_version()
        return fingerprint_version == self.version
Inherited Members
- pydantic.main.BaseModel
- BaseModel
- parse_obj
- parse_raw
- parse_file
- from_orm
- construct
- copy
- schema
- schema_json
- validate
- update_forward_refs
class QualifiedViewName(PydanticModel, frozen=True):
    """Catalog/schema/table triple identifying where a snapshot's view is published."""

    catalog: t.Optional[str]
    schema_name: t.Optional[str]
    table: str

    def for_environment(self, environment: str) -> str:
        """Dotted, fully qualified view name for the environment, skipping absent parts."""
        parts = (self.catalog, self.schema_for_environment(environment), self.table)
        return ".".join(part for part in parts if part is not None)

    def schema_for_environment(self, environment: str) -> str:
        """Schema name, suffixed with __<environment> for every non-prod environment."""
        base = self.schema_name or "default"
        if environment.lower() == c.PROD:
            return base
        return f"{base}__{environment}"
Inherited Members
- pydantic.main.BaseModel
- BaseModel
- parse_obj
- parse_raw
- parse_file
- from_orm
- construct
- copy
- schema
- schema_json
- validate
- update_forward_refs
class SnapshotInfoMixin:
    """Identity, versioning and physical-table-name helpers shared by Snapshot
    and SnapshotTableInfo."""

    name: str
    fingerprint: SnapshotFingerprint
    physical_schema: str
    previous_versions: t.Tuple[SnapshotDataVersion, ...] = ()

    def is_temporary_table(self, is_dev: bool) -> bool:
        """Provided whether the snapshot is used in a development mode or not, returns True
        if the snapshot targets a temporary table or a clone and False otherwise.
        """
        return is_dev and not self.is_new_version

    @property
    def identifier(self) -> str:
        # Unique identifier derived from all four fingerprint components.
        return self.fingerprint.to_identifier()

    @property
    def snapshot_id(self) -> SnapshotId:
        return SnapshotId(name=self.name, identifier=self.identifier)

    @property
    def qualified_view_name(self) -> QualifiedViewName:
        (catalog, schema, table) = parse_model_name(self.name)
        return QualifiedViewName(catalog=catalog, schema_name=schema, table=table)

    @property
    def previous_version(self) -> t.Optional[SnapshotDataVersion]:
        """Helper method to get the previous data version."""
        if self.previous_versions:
            return self.previous_versions[-1]
        return None

    @property
    def data_version(self) -> SnapshotDataVersion:
        # Implemented by subclasses (Snapshot, SnapshotTableInfo).
        raise NotImplementedError

    @property
    def is_new_version(self) -> bool:
        # Implemented by subclasses (Snapshot, SnapshotTableInfo).
        raise NotImplementedError

    @property
    def is_forward_only(self) -> bool:
        # Forward-only: the data hash differs from the previous version's,
        # yet this snapshot reuses an existing version (i.e. it is not new).
        return not self.data_hash_matches(self.previous_version) and not self.is_new_version

    @property
    def all_versions(self) -> t.Tuple[SnapshotDataVersion, ...]:
        """Returns previous versions with the current version trimmed to DATA_VERSION_LIMIT."""
        return (*self.previous_versions, self.data_version)[-c.DATA_VERSION_LIMIT :]

    def data_hash_matches(self, other: t.Optional[SnapshotInfoMixin | SnapshotDataVersion]) -> bool:
        # Only the data hashes are compared; metadata differences don't count.
        return other is not None and self.fingerprint.data_hash == other.fingerprint.data_hash

    def _table_name(self, version: str, is_dev: bool, for_read: bool) -> str:
        """Full table name pointing to the materialized location of the snapshot.

        Args:
            version: The snapshot version.
            is_dev: Whether the table name will be used in development mode.
            for_read: Whether the table name will be used for reading by a different snapshot.
        """
        if is_dev and for_read:
            # If this snapshot is used for reading, return a temporary table
            # only if this snapshot captures direct changes applied to its model.
            version = self.fingerprint.to_version() if self.is_forward_only else version
            is_temp = self.is_temporary_table(True) and self.is_forward_only
        elif is_dev:
            # Writing in dev mode always targets the fingerprint-derived version.
            version = self.fingerprint.to_version()
            is_temp = self.is_temporary_table(True)
        else:
            is_temp = False

        return table_name(
            self.physical_schema,
            self.name,
            version,
            is_temp=is_temp,
        )
132 def is_temporary_table(self, is_dev: bool) -> bool: 133 """Provided whether the snapshot is used in a development mode or not, returns True 134 if the snapshot targets a temporary table or a clone and False otherwise. 135 """ 136 return is_dev and not self.is_new_version
Provided whether the snapshot is used in a development mode or not, returns True if the snapshot targets a temporary table or a clone and False otherwise.
Helper method to get the previous data version.
Returns previous versions with the current version trimmed to DATA_VERSION_LIMIT.
class SnapshotTableInfo(PydanticModel, SnapshotInfoMixin, frozen=True):
    """Immutable description of a snapshot's physical table state."""

    name: str
    fingerprint: SnapshotFingerprint
    version: str
    physical_schema: str
    parents: t.Tuple[SnapshotId, ...]
    previous_versions: t.Tuple[SnapshotDataVersion, ...] = ()
    change_category: t.Optional[SnapshotChangeCategory]
    is_materialized: bool
    is_embedded_kind: bool

    def table_name(self, is_dev: bool = False, for_read: bool = False) -> str:
        """Full table name pointing to the materialized location of the snapshot.

        Args:
            is_dev: Whether the table name will be used in development mode.
            for_read: Whether the table name will be used for reading by a different snapshot.
        """
        return self._table_name(self.version, is_dev, for_read)

    @property
    def table_info(self) -> SnapshotTableInfo:
        """Identity accessor: a table info's table info is itself."""
        return self

    @property
    def data_version(self) -> SnapshotDataVersion:
        """The (fingerprint, version, change category) triple for this snapshot."""
        return SnapshotDataVersion(
            fingerprint=self.fingerprint,
            version=self.version,
            change_category=self.change_category,
        )

    @property
    def is_new_version(self) -> bool:
        """Returns whether or not this version is new and requires a backfill."""
        return self.version == self.fingerprint.to_version()
216 def table_name(self, is_dev: bool = False, for_read: bool = False) -> str: 217 """Full table name pointing to the materialized location of the snapshot. 218 219 Args: 220 is_dev: Whether the table name will be used in development mode. 221 for_read: Whether the table name will be used for reading by a different snapshot. 222 """ 223 return self._table_name(self.version, is_dev, for_read)
Full table name pointing to the materialized location of the snapshot.
Arguments:
- is_dev: Whether the table name will be used in development mode.
- for_read: Whether the table name will be used for reading by a different snapshot.
Inherited Members
- pydantic.main.BaseModel
- BaseModel
- parse_obj
- parse_raw
- parse_file
- from_orm
- construct
- copy
- schema
- schema_json
- validate
- update_forward_refs
class Snapshot(PydanticModel, SnapshotInfoMixin):
    """A snapshot represents a model at a certain point in time.

    Snapshots are used to encapsulate everything needed to evaluate a model.
    They are standalone objects that hold all state and dynamic content necessary
    to render a model's query including things like macros. Snapshots also store intervals
    (timestamp ranges for what data we've processed).

    Models can be dynamically rendered due to macros. Rendering a model to its full extent
    requires storing variables and macro definitions. We store all of the macro definitions and
    global variable references in `python_env` in raw text to avoid pickling. The helper methods
    to achieve this are defined in utils.metaprogramming.

    Args:
        name: The snapshot name which is the same as the model name and should be unique per model.
        fingerprint: A unique hash of the model definition so that models can be reused across environments.
        physical_schema: The physical schema that the snapshot is stored in.
        model: Model object that the snapshot encapsulates.
        parents: The list of parent snapshots (upstream dependencies).
        audits: The list of audits used by the model.
        intervals: List of [start, end) intervals showing which time ranges a snapshot has data for.
        dev_intervals: List of [start, end) intervals showing which time ranges a snapshot has data
            for in temporary (development) tables.
        created_ts: Epoch millis timestamp when a snapshot was first created.
        updated_ts: Epoch millis timestamp when a snapshot was last updated.
        ttl: The time-to-live of a snapshot determines when it should be deleted after it's no longer referenced
            in any environment.
        previous_versions: The snapshot data versions that this snapshot was based on. If this snapshot is new,
            then this will be empty.
        indirect_versions: Data versions keyed by snapshot name. NOTE(review): semantics are not evident
            from this module alone -- presumably versions of indirectly affected snapshots; confirm with callers.
        version: User specified version for a snapshot that is used for physical storage.
            By default, the version is the fingerprint, but not all changes to models require a backfill.
            If a user passes a previous version, that will be used instead and no backfill will be required.
        change_category: User specified change category indicating which models require backfill from model changes made in this snapshot.
        unpaused_ts: The timestamp which indicates when this snapshot was unpaused. Unpaused means that
            this snapshot is evaluated on a recurring basis. None indicates that this snapshot is paused.
    """

    name: str
    fingerprint: SnapshotFingerprint
    physical_schema: str
    model: Model
    parents: t.Tuple[SnapshotId, ...]
    audits: t.Tuple[Audit, ...]
    intervals: Intervals
    dev_intervals: Intervals
    created_ts: int
    updated_ts: int
    ttl: str
    previous_versions: t.Tuple[SnapshotDataVersion, ...] = ()
    indirect_versions: t.Dict[str, t.Tuple[SnapshotDataVersion, ...]] = {}
    version: t.Optional[str] = None
    change_category: t.Optional[SnapshotChangeCategory] = None
    unpaused_ts: t.Optional[int] = None

    @validator("ttl")
    @classmethod
    def _time_delta_must_be_positive(cls, v: str) -> str:
        # The TTL must describe a moment strictly in the future relative to "now";
        # otherwise a snapshot would be eligible for cleanup immediately.
        current_time = now()
        if to_datetime(v, current_time) < current_time:
            raise ValueError(
                "Must be positive. Use the 'in' keyword to denote a positive time interval. For example, 'in 7 days'."
            )
        return v

    @staticmethod
    def merge_snapshots(
        targets: t.Iterable[SnapshotIdLike],
        snapshots: t.Dict[SnapshotId, Snapshot],
    ) -> t.List[Snapshot]:
        """Merge target snapshots with others so that each target snapshot has intervals from all other snapshots with the same version.

        Args:
            targets: Iterable of snapshot-like objects
            snapshots: Dictionary of snapshot ids to snapshot.

        Returns:
            List of target snapshots with merged intervals.
        """
        merged = []
        snapshots_by_name_version = defaultdict(list)

        for s in snapshots.values():
            snapshots_by_name_version[(s.name, s.version)].append(s)

        for snapshot_like in targets:
            snapshot_id = snapshot_like.snapshot_id
            snapshot = snapshots.get(snapshot_id)
            if not snapshot:
                raise SQLMeshError(f"The snapshot {snapshot_id} was not found")

            # Work on a copy with cleared intervals so the merge rebuilds them
            # from every same-(name, version) peer without mutating the input.
            snapshot = snapshot.copy()
            snapshot.intervals = []

            for other in snapshots_by_name_version[(snapshot.name, snapshot.version)]:
                snapshot.merge_intervals(other)

            merged.append(snapshot)

        return merged

    @classmethod
    def from_model(
        cls,
        model: Model,
        *,
        physical_schema: str,
        models: t.Dict[str, Model],
        ttl: str = c.DEFAULT_SNAPSHOT_TTL,
        version: t.Optional[str] = None,
        audits: t.Optional[t.Dict[str, Audit]] = None,
        cache: t.Optional[t.Dict[str, SnapshotFingerprint]] = None,
    ) -> Snapshot:
        """Creates a new snapshot for a model.

        Args:
            model: Model to snapshot.
            physical_schema: The schema of the snapshot which represents where it is stored.
            models: Dictionary of all models in the graph to make the fingerprint dependent on parent changes.
                If no dictionary is passed in the fingerprint will not be dependent on a model's parents.
            ttl: A TTL to determine how long orphaned (snapshots that are not promoted anywhere) should live.
            version: The version that a snapshot is associated with. Usually set during the planning phase.
            audits: Available audits by name.
            cache: Cache of model name to fingerprints.

        Returns:
            The newly created snapshot.
        """
        created_ts = now_timestamp()

        audits = audits or {}

        return cls(
            name=model.name,
            fingerprint=fingerprint_from_model(
                model,
                physical_schema=physical_schema,
                models=models,
                audits=audits,
                cache=cache,
            ),
            physical_schema=physical_schema,
            model=model,
            # Each parent's identifier is its own full fingerprint identifier,
            # computed with the same shared cache to avoid repeated work.
            parents=tuple(
                SnapshotId(
                    name=name,
                    identifier=fingerprint_from_model(
                        models[name],
                        physical_schema=physical_schema,
                        models=models,
                        audits=audits,
                        cache=cache,
                    ).to_identifier(),
                )
                for name in _parents_from_model(model, models)
            ),
            audits=tuple(model.referenced_audits(audits)),
            intervals=[],
            dev_intervals=[],
            created_ts=created_ts,
            updated_ts=created_ts,
            ttl=ttl,
            version=version,
        )

    def __eq__(self, other: t.Any) -> bool:
        # Equality is defined purely by fingerprint; intervals, timestamps and
        # version are intentionally excluded.
        return isinstance(other, Snapshot) and self.fingerprint == other.fingerprint

    def __hash__(self) -> int:
        return hash((self.__class__, self.fingerprint))

    def add_interval(self, start: TimeLike, end: TimeLike, is_dev: bool = False) -> None:
        """Add a newly processed time interval to the snapshot.

        The actual stored intervals are [start_ts, end_ts) or start epoch timestamp inclusive and end epoch
        timestamp exclusive. This allows merging of ranges to be easier.

        Args:
            start: The start date/time of the interval (inclusive)
            end: The end date/time of the interval. If end is a date, then it is considered inclusive.
                If it is a datetime object, then it is exclusive.
            is_dev: Indicates whether the given interval is being added while in development mode.
        """
        # Dev-mode intervals for temporary tables are tracked separately from
        # the main interval list.
        is_temp_table = self.is_temporary_table(is_dev)
        intervals = self.dev_intervals if is_temp_table else self.intervals

        intervals.append(self._inclusive_exclusive(start, end))

        if len(intervals) < 2:
            return

        # Collapse overlapping/adjacent ranges after each insertion.
        merged_intervals = merge_intervals(intervals)
        if is_temp_table:
            self.dev_intervals = merged_intervals
        else:
            self.intervals = merged_intervals

    def remove_interval(self, start: TimeLike, end: TimeLike) -> None:
        """Remove an interval from the snapshot.

        Args:
            start: Start interval to remove.
            end: End interval to remove.
        """
        interval = self._inclusive_exclusive(start, end)
        self.intervals = remove_interval(self.intervals, *interval)
        self.dev_intervals = remove_interval(self.dev_intervals, *interval)

    def _inclusive_exclusive(self, start: TimeLike, end: TimeLike) -> t.Tuple[int, int]:
        # Convert an inclusive [start, end] range into a [start_ts, end_ts)
        # pair aligned to the model's cron schedule.
        start_dt, end_dt = make_inclusive(start, end)
        start_ts = to_timestamp(self.model.cron_floor(start_dt))
        end_ts = to_timestamp(self.model.cron_next(end_dt))

        if start_ts >= end_ts:
            raise ValueError("`end` must be >= `start`")
        return (start_ts, end_ts)

    def merge_intervals(self, other: Snapshot) -> None:
        """Inherits intervals from the target snapshot.

        Args:
            other: The target snapshot to inherit intervals from.
        """
        for start, end in other.intervals:
            self.add_interval(start, end)

    def missing_intervals(
        self, start: TimeLike, end: TimeLike, latest: t.Optional[TimeLike] = None
    ) -> Intervals:
        """Find all missing intervals between [start, end].

        Although the inputs are inclusive, the returned stored intervals are
        [start_ts, end_ts) or start epoch timestamp inclusive and end epoch
        timestamp exclusive.

        Args:
            start: The start date/time of the interval (inclusive)
            end: The end date/time of the interval (inclusive)

        Returns:
            A list of all the missing intervals as epoch timestamps.
        """
        if self.is_embedded_kind:
            return []

        start_dt, end_dt = make_inclusive(start, self.model.cron_floor(end))

        if self.is_full_kind or self.is_view_kind or self.is_seed_kind:
            latest_dt = to_datetime(self.model.cron_floor(latest or now()))
            latest_ts = to_timestamp(latest_dt)
            # if the latest ts is stored in the last interval, nothing is missing
            # else returns the latest ts with the exclusive end ts.
            if self.intervals and self.intervals[-1][1] >= latest_ts:
                return []
            return [(to_timestamp(self.model.cron_prev(latest_dt)), latest_ts)]

        missing = []
        dates = list(croniter_range(start_dt, end_dt, self.model.normalized_cron()))
        size = len(dates)

        for i in range(size):
            current_ts = to_timestamp(dates[i])
            end_ts = (
                to_timestamp(dates[i + 1])
                if i + 1 < size
                else to_timestamp(self.model.cron_next(current_ts))
            )

            # Check the candidate interval against each stored interval:
            # - before a stored interval starts -> missing;
            # - inside a stored interval -> covered;
            # - past all stored intervals (no break) -> missing (for/else).
            for low, high in self.intervals:
                if current_ts < low:
                    missing.append((current_ts, end_ts))
                    break
                elif current_ts < high:
                    break
            else:
                missing.append((current_ts, end_ts))

        return missing

    def set_version(
        self,
        version: t.Optional[str | SnapshotDataVersion | SnapshotTableInfo | Snapshot] = None,
    ) -> None:
        """Set the version of this snapshot.

        If no version is passed, the fingerprint of the snapshot will be used.

        Args:
            version: Either a string or a TableInfo to use.
        """
        if isinstance(version, (SnapshotDataVersion, SnapshotTableInfo, Snapshot)):
            self.version = version.data_version.version
        else:
            self.version = version or self.fingerprint.to_version()

    def set_unpaused_ts(self, unpaused_dt: t.Optional[TimeLike]) -> None:
        """Sets the timestamp for when this snapshot was unpaused.

        Args:
            unpaused_dt: The datetime object of when this snapshot was unpaused.
        """
        self.unpaused_ts = to_timestamp(self.model.cron_floor(unpaused_dt)) if unpaused_dt else None

    def table_name(self, is_dev: bool = False, for_read: bool = False) -> str:
        """Full table name pointing to the materialized location of the snapshot.

        Args:
            is_dev: Whether the table name will be used in development mode.
            for_read: Whether the table name will be used for reading by a different snapshot.
        """
        self._ensure_version()
        assert self.version
        return self._table_name(self.version, is_dev, for_read)

    def table_name_for_mapping(self, is_dev: bool = False) -> str:
        """Full table name used by a child snapshot for table mapping during evaluation.

        Args:
            is_dev: Whether the table name will be used in development mode.
        """
        self._ensure_version()
        assert self.version

        if is_dev and self.is_forward_only:
            # If this snapshot is unpaused we shouldn't be using a temporary
            # table for mapping purposes.
            is_dev = self.is_paused

        return self._table_name(self.version, is_dev, True)

    def version_get_or_generate(self) -> str:
        """Helper method to get the version or generate it from the fingerprint."""
        return self.version or self.fingerprint.to_version()

    @property
    def table_info(self) -> SnapshotTableInfo:
        """Helper method to get the SnapshotTableInfo from the Snapshot."""
        self._ensure_version()
        return SnapshotTableInfo(
            physical_schema=self.physical_schema,
            name=self.name,
            fingerprint=self.fingerprint,
            version=self.version,
            parents=self.parents,
            previous_versions=self.previous_versions,
            change_category=self.change_category,
            is_materialized=self.is_materialized,
            is_embedded_kind=self.is_embedded_kind,
        )

    @property
    def data_version(self) -> SnapshotDataVersion:
        self._ensure_version()
        return SnapshotDataVersion(
            fingerprint=self.fingerprint,
            version=self.version,
            change_category=self.change_category,
        )

    @property
    def is_new_version(self) -> bool:
        """Returns whether or not this version is new and requires a backfill."""
        self._ensure_version()
        return self.fingerprint.to_version() == self.version

    @property
    def is_full_kind(self) -> bool:
        return self.model.kind.is_full

    @property
    def is_view_kind(self) -> bool:
        return self.model.kind.is_view

    @property
    def is_incremental_by_time_range_kind(self) -> bool:
        return self.model.kind.is_incremental_by_time_range

    @property
    def is_incremental_by_unique_key_kind(self) -> bool:
        return self.model.kind.is_incremental_by_unique_key

    # @property
    # def is_snapshot_kind(self) -> bool:
    #     return self.model.kind.is_snapshot

    @property
    def is_embedded_kind(self) -> bool:
        return self.model.kind.is_embedded

    @property
    def is_seed_kind(self) -> bool:
        return self.model.kind.is_seed

    @property
    def is_materialized(self) -> bool:
        return self.model.kind.is_materialized

    @property
    def is_paused(self) -> bool:
        # A snapshot with no unpaused timestamp is considered paused.
        return self.unpaused_ts is None

    def _ensure_version(self) -> None:
        # Guard for operations that require a concrete version to be assigned.
        if not self.version:
            raise SQLMeshError(f"Snapshot {self.snapshot_id} has not been versioned yet.")
A snapshot represents a model at a certain point in time.
Snapshots are used to encapsulate everything needed to evaluate a model. They are standalone objects that hold all state and dynamic content necessary to render a model's query including things like macros. Snapshots also store intervals (timestamp ranges for what data we've processed).
Models can be dynamically rendered due to macros. Rendering a model to its full extent requires storing variables and macro definitions. We store all of the macro definitions and global variable references in `python_env` in raw text to avoid pickling. The helper methods to achieve this are defined in `utils.metaprogramming`.
Arguments:
- name: The snapshot name which is the same as the model name and should be unique per model.
- fingerprint: A unique hash of the model definition so that models can be reused across environments.
- physical_schema: The physical schema that the snapshot is stored in.
- model: Model object that the snapshot encapsulates.
- parents: The list of parent snapshots (upstream dependencies).
- audits: The list of audits used by the model.
- intervals: List of [start, end) intervals showing which time ranges a snapshot has data for.
- created_ts: Epoch millis timestamp when a snapshot was first created.
- updated_ts: Epoch millis timestamp when a snapshot was last updated.
- ttl: The time-to-live of a snapshot determines when it should be deleted after it's no longer referenced in any environment.
- previous: The snapshot data version that this snapshot was based on. If this snapshot is new, then previous will be None.
- version: User specified version for a snapshot that is used for physical storage. By default, the version is the fingerprint, but not all changes to models require a backfill. If a user passes a previous version, that will be used instead and no backfill will be required.
- change_category: User specified change category indicating which models require backfill from model changes made in this snapshot.
- unpaused_ts: The timestamp which indicates when this snapshot was unpaused. Unpaused means that this snapshot is evaluated on a recurring basis. None indicates that this snapshot is paused.
306 @staticmethod 307 def merge_snapshots( 308 targets: t.Iterable[SnapshotIdLike], 309 snapshots: t.Dict[SnapshotId, Snapshot], 310 ) -> t.List[Snapshot]: 311 """Merge target snapshots with others so that each target snapshot has intervals from all other snapshots with the same version. 312 313 Args: 314 targets: Iterable of snapshot-like objects 315 snapshots: Dictionary of snapshot ids to snapshot. 316 317 Returns: 318 List of target snapshots with merged intervals. 319 """ 320 merged = [] 321 snapshots_by_name_version = defaultdict(list) 322 323 for s in snapshots.values(): 324 snapshots_by_name_version[(s.name, s.version)].append(s) 325 326 for snapshot_like in targets: 327 snapshot_id = snapshot_like.snapshot_id 328 snapshot = snapshots.get(snapshot_id) 329 if not snapshot: 330 raise SQLMeshError(f"The snapshot {snapshot_id} was not found") 331 332 snapshot = snapshot.copy() 333 snapshot.intervals = [] 334 335 for other in snapshots_by_name_version[(snapshot.name, snapshot.version)]: 336 snapshot.merge_intervals(other) 337 338 merged.append(snapshot) 339 340 return merged
Merge target snapshots with others so that each target snapshot has intervals from all other snapshots with the same version.
Arguments:
- targets: Iterable of snapshot-like objects
- snapshots: Dictionary of snapshot ids to snapshot.
Returns:
List of target snapshots with merged intervals.
342 @classmethod 343 def from_model( 344 cls, 345 model: Model, 346 *, 347 physical_schema: str, 348 models: t.Dict[str, Model], 349 ttl: str = c.DEFAULT_SNAPSHOT_TTL, 350 version: t.Optional[str] = None, 351 audits: t.Optional[t.Dict[str, Audit]] = None, 352 cache: t.Optional[t.Dict[str, SnapshotFingerprint]] = None, 353 ) -> Snapshot: 354 """Creates a new snapshot for a model. 355 356 Args: 357 model: Model to snapshot. 358 physical_schema: The schema of the snapshot which represents where it is stored. 359 models: Dictionary of all models in the graph to make the fingerprint dependent on parent changes. 360 If no dictionary is passed in the fingerprint will not be dependent on a model's parents. 361 ttl: A TTL to determine how long orphaned (snapshots that are not promoted anywhere) should live. 362 version: The version that a snapshot is associated with. Usually set during the planning phase. 363 audits: Available audits by name. 364 cache: Cache of model name to fingerprints. 365 366 Returns: 367 The newly created snapshot. 368 """ 369 created_ts = now_timestamp() 370 371 audits = audits or {} 372 373 return cls( 374 name=model.name, 375 fingerprint=fingerprint_from_model( 376 model, 377 physical_schema=physical_schema, 378 models=models, 379 audits=audits, 380 cache=cache, 381 ), 382 physical_schema=physical_schema, 383 model=model, 384 parents=tuple( 385 SnapshotId( 386 name=name, 387 identifier=fingerprint_from_model( 388 models[name], 389 physical_schema=physical_schema, 390 models=models, 391 audits=audits, 392 cache=cache, 393 ).to_identifier(), 394 ) 395 for name in _parents_from_model(model, models) 396 ), 397 audits=tuple(model.referenced_audits(audits)), 398 intervals=[], 399 dev_intervals=[], 400 created_ts=created_ts, 401 updated_ts=created_ts, 402 ttl=ttl, 403 version=version, 404 )
Creates a new snapshot for a model.
Arguments:
- model: Model to snapshot.
- physical_schema: The schema of the snapshot which represents where it is stored.
- models: Dictionary of all models in the graph to make the fingerprint dependent on parent changes. If no dictionary is passed in the fingerprint will not be dependent on a model's parents.
- ttl: A TTL to determine how long orphaned (snapshots that are not promoted anywhere) should live.
- version: The version that a snapshot is associated with. Usually set during the planning phase.
- audits: Available audits by name.
- cache: Cache of model name to fingerprints.
Returns:
The newly created snapshot.
412 def add_interval(self, start: TimeLike, end: TimeLike, is_dev: bool = False) -> None: 413 """Add a newly processed time interval to the snapshot. 414 415 The actual stored intervals are [start_ts, end_ts) or start epoch timestamp inclusive and end epoch 416 timestamp exclusive. This allows merging of ranges to be easier. 417 418 Args: 419 start: The start date/time of the interval (inclusive) 420 end: The end date/time of the interval. If end is a date, then it is considered inclusive. 421 If it is a datetime object, then it is exclusive. 422 is_dev: Indicates whether the given interval is being added while in development mode. 423 """ 424 is_temp_table = self.is_temporary_table(is_dev) 425 intervals = self.dev_intervals if is_temp_table else self.intervals 426 427 intervals.append(self._inclusive_exclusive(start, end)) 428 429 if len(intervals) < 2: 430 return 431 432 merged_intervals = merge_intervals(intervals) 433 if is_temp_table: 434 self.dev_intervals = merged_intervals 435 else: 436 self.intervals = merged_intervals
Add a newly processed time interval to the snapshot.
The actual stored intervals are [start_ts, end_ts) or start epoch timestamp inclusive and end epoch timestamp exclusive. This allows merging of ranges to be easier.
Arguments:
- start: The start date/time of the interval (inclusive)
- end: The end date/time of the interval. If end is a date, then it is considered inclusive. If it is a datetime object, then it is exclusive.
- is_dev: Indicates whether the given interval is being added while in development mode.
438 def remove_interval(self, start: TimeLike, end: TimeLike) -> None: 439 """Remove an interval from the snapshot. 440 441 Args: 442 start: Start interval to remove. 443 end: End interval to remove. 444 """ 445 interval = self._inclusive_exclusive(start, end) 446 self.intervals = remove_interval(self.intervals, *interval) 447 self.dev_intervals = remove_interval(self.dev_intervals, *interval)
Remove an interval from the snapshot.
Arguments:
- start: Start interval to remove.
- end: End interval to remove.
458 def merge_intervals(self, other: Snapshot) -> None: 459 """Inherits intervals from the target snapshot. 460 461 Args: 462 other: The target snapshot to inherit intervals from. 463 """ 464 for start, end in other.intervals: 465 self.add_interval(start, end)
Inherits intervals from the target snapshot.
Arguments:
- other: The target snapshot to inherit intervals from.
467 def missing_intervals( 468 self, start: TimeLike, end: TimeLike, latest: t.Optional[TimeLike] = None 469 ) -> Intervals: 470 """Find all missing intervals between [start, end]. 471 472 Although the inputs are inclusive, the returned stored intervals are 473 [start_ts, end_ts) or start epoch timestamp inclusive and end epoch 474 timestamp exclusive. 475 476 Args: 477 start: The start date/time of the interval (inclusive) 478 end: The end date/time of the interval (inclusive) 479 480 Returns: 481 A list of all the missing intervals as epoch timestamps. 482 """ 483 if self.is_embedded_kind: 484 return [] 485 486 start_dt, end_dt = make_inclusive(start, self.model.cron_floor(end)) 487 488 if self.is_full_kind or self.is_view_kind or self.is_seed_kind: 489 latest_dt = to_datetime(self.model.cron_floor(latest or now())) 490 latest_ts = to_timestamp(latest_dt) 491 # if the latest ts is stored in the last interval, nothing is missing 492 # else returns the latest ts with the exclusive end ts. 493 if self.intervals and self.intervals[-1][1] >= latest_ts: 494 return [] 495 return [(to_timestamp(self.model.cron_prev(latest_dt)), latest_ts)] 496 497 missing = [] 498 dates = list(croniter_range(start_dt, end_dt, self.model.normalized_cron())) 499 size = len(dates) 500 501 for i in range(size): 502 current_ts = to_timestamp(dates[i]) 503 end_ts = ( 504 to_timestamp(dates[i + 1]) 505 if i + 1 < size 506 else to_timestamp(self.model.cron_next(current_ts)) 507 ) 508 509 for low, high in self.intervals: 510 if current_ts < low: 511 missing.append((current_ts, end_ts)) 512 break 513 elif current_ts < high: 514 break 515 else: 516 missing.append((current_ts, end_ts)) 517 518 return missing
Find all missing intervals between [start, end].
Although the inputs are inclusive, the returned stored intervals are [start_ts, end_ts) or start epoch timestamp inclusive and end epoch timestamp exclusive.
Arguments:
- start: The start date/time of the interval (inclusive)
- end: The end date/time of the interval (inclusive)
Returns:
A list of all the missing intervals as epoch timestamps.
520 def set_version( 521 self, 522 version: t.Optional[str | SnapshotDataVersion | SnapshotTableInfo | Snapshot] = None, 523 ) -> None: 524 """Set the version of this snapshot. 525 526 If no version is passed, the fingerprint of the snapshot will be used. 527 528 Args: 529 version: Either a string or a TableInfo to use. 530 """ 531 if isinstance(version, (SnapshotDataVersion, SnapshotTableInfo, Snapshot)): 532 self.version = version.data_version.version 533 else: 534 self.version = version or self.fingerprint.to_version()
Set the version of this snapshot.
If no version is passed, the fingerprint of the snapshot will be used.
Arguments:
- version: Either a string or a TableInfo to use.
536 def set_unpaused_ts(self, unpaused_dt: t.Optional[TimeLike]) -> None: 537 """Sets the timestamp for when this snapshot was unpaused. 538 539 Args: 540 unpaused_dt: The datetime object of when this snapshot was unpaused. 541 """ 542 self.unpaused_ts = to_timestamp(self.model.cron_floor(unpaused_dt)) if unpaused_dt else None
Sets the timestamp for when this snapshot was unpaused.
Arguments:
- unpaused_dt: The datetime object of when this snapshot was unpaused.
544 def table_name(self, is_dev: bool = False, for_read: bool = False) -> str: 545 """Full table name pointing to the materialized location of the snapshot. 546 547 Args: 548 is_dev: Whether the table name will be used in development mode. 549 for_read: Whether the table name will be used for reading by a different snapshot. 550 """ 551 self._ensure_version() 552 assert self.version 553 return self._table_name(self.version, is_dev, for_read)
Full table name pointing to the materialized location of the snapshot.
Arguments:
- is_dev: Whether the table name will be used in development mode.
- for_read: Whether the table name will be used for reading by a different snapshot.
def table_name_for_mapping(self, is_dev: bool = False) -> str:
    """Full table name used by a child snapshot for table mapping during evaluation.

    Args:
        is_dev: Whether the table name will be used in development mode.
    """
    self._ensure_version()
    assert self.version

    use_dev = is_dev
    if use_dev and self.is_forward_only:
        # An unpaused forward-only snapshot shouldn't be mapped to a
        # temporary table, so dev mode only holds while it is paused.
        use_dev = self.is_paused

    return self._table_name(self.version, use_dev, True)
Full table name used by a child snapshot for table mapping during evaluation.
Arguments:
- is_dev: Whether the table name will be used in development mode.
def version_get_or_generate(self) -> str:
    """Helper method to get the version or generate it from the fingerprint."""
    if self.version:
        return self.version
    return self.fingerprint.to_version()
Helper method to get the version or generate it from the fingerprint.
Inherited Members
- pydantic.main.BaseModel
- BaseModel
- parse_obj
- parse_raw
- parse_file
- from_orm
- construct
- copy
- schema
- schema_json
- validate
- update_forward_refs
def fingerprint_from_model(
    model: Model,
    *,
    models: t.Dict[str, Model],
    physical_schema: str = "",
    audits: t.Optional[t.Dict[str, Audit]] = None,
    cache: t.Optional[t.Dict[str, SnapshotFingerprint]] = None,
) -> SnapshotFingerprint:
    """Helper function to generate a fingerprint based on a model's query and environment.

    This method tries to remove non meaningful differences to avoid ever changing
    fingerprints. The fingerprint is made up of two parts split by an underscore --
    query_metadata. The query hash is determined purely by the rendered query and
    the metadata by everything else.

    Args:
        model: Model to fingerprint.
        physical_schema: The physical_schema of the snapshot which represents where it is stored.
        models: Dictionary of all models in the graph to make the fingerprint dependent on parent changes.
            If no dictionary is passed in the fingerprint will not be dependent on a model's parents.
        audits: Available audits by name.
        cache: Cache of model name to fingerprints.

    Returns:
        The fingerprint.
    """
    if cache is None:
        cache = {}

    cached = cache.get(model.name)
    if cached is not None:
        return cached

    # Recursively fingerprint every upstream model that is present in the
    # graph so that a parent change propagates into this fingerprint.
    parent_fingerprints = [
        fingerprint_from_model(
            models[parent_name],
            models=models,
            physical_schema=physical_schema,
            audits=audits,
            cache=cache,
        )
        for parent_name in model.depends_on
        if parent_name in models
    ]

    fingerprint = SnapshotFingerprint(
        data_hash=_model_data_hash(model, physical_schema),
        metadata_hash=_model_metadata_hash(model, audits or {}),
        # Sorting makes the combined hashes independent of parent order.
        parent_data_hash=_hash(sorted(p.to_version() for p in parent_fingerprints)),
        parent_metadata_hash=_hash(
            sorted(
                h
                for p in parent_fingerprints
                for h in (p.metadata_hash, p.parent_metadata_hash)
            )
        ),
    )
    cache[model.name] = fingerprint
    return fingerprint
Helper function to generate a fingerprint based on a model's query and environment.
This method tries to remove non meaningful differences to avoid ever changing fingerprints. The fingerprint is made up of two parts split by an underscore -- query_metadata. The query hash is determined purely by the rendered query and the metadata by everything else.
Arguments:
- model: Model to fingerprint.
- physical_schema: The physical_schema of the snapshot which represents where it is stored.
- models: Dictionary of all models in the graph to make the fingerprint dependent on parent changes. If no dictionary is passed in the fingerprint will not be dependent on a model's parents.
- audits: Available audits by name.
- cache: Cache of model name to fingerprints.
Returns:
The fingerprint.
def merge_intervals(intervals: Intervals) -> Intervals:
    """Merge a list of intervals.

    Overlapping or touching intervals are collapsed into a single interval.

    Args:
        intervals: A list of intervals to merge together.

    Returns:
        A new list of sorted and merged intervals.
    """
    if not intervals:
        # Guard: the loop below seeds `merged` with the first interval and
        # would raise IndexError on an empty input.
        return []

    intervals = sorted(intervals)

    merged = [intervals[0]]

    for interval in intervals[1:]:
        current = merged[-1]

        if interval[0] <= current[1]:
            # Overlaps or touches the previous interval -- extend it.
            merged[-1] = (current[0], max(current[1], interval[1]))
        else:
            merged.append(interval)

    return merged
Merge a list of intervals.
Arguments:
- intervals: A list of intervals to merge together.
Returns:
A new list of sorted and merged intervals.
def remove_interval(intervals: Intervals, remove_start: int, remove_end: int) -> Intervals:
    """Remove an interval from a list of intervals.

    Args:
        intervals: A list of exclusive intervals.
        remove_start: The inclusive start to remove.
        remove_end: The exclusive end to remove.

    Returns:
        A new list of intervals.
    """
    result: Intervals = []

    for interval_start, interval_end in intervals:
        keeps_left = remove_start > interval_start
        keeps_right = remove_end < interval_end

        if keeps_left and keeps_right:
            # Removal falls strictly inside -- the interval splits in two.
            result.append((interval_start, remove_start))
            result.append((remove_end, interval_end))
        elif keeps_left:
            result.append((interval_start, min(remove_start, interval_end)))
        elif keeps_right:
            result.append((max(remove_end, interval_start), interval_end))
        # Otherwise the interval is fully covered by the removal and dropped.

    return result
Remove an interval from a list of intervals.
Arguments:
- intervals: A list of exclusive intervals.
- remove_start: The inclusive start to remove.
- remove_end: The exclusive end to remove.
Returns:
A new list of intervals.