sqlmesh.core.scheduler
from __future__ import annotations

import logging
import typing as t
from datetime import datetime

from sqlmesh.core.console import Console, get_console
from sqlmesh.core.snapshot import (
    Snapshot,
    SnapshotEvaluator,
    SnapshotId,
    SnapshotIdLike,
)
from sqlmesh.core.state_sync import StateSync
from sqlmesh.utils import format_exception
from sqlmesh.utils.concurrency import concurrent_apply_to_dag
from sqlmesh.utils.dag import DAG
from sqlmesh.utils.date import (
    TimeLike,
    now,
    to_datetime,
    validate_date_range,
    yesterday,
)

logger = logging.getLogger(__name__)
Interval = t.Tuple[datetime, datetime]
Batch = t.List[Interval]
SnapshotToBatches = t.Dict[Snapshot, Batch]
SchedulingUnit = t.Tuple[Snapshot, Interval]


class Scheduler:
    """Schedules and manages the evaluation of snapshots.

    The scheduler evaluates multiple snapshots with date intervals in the correct
    topological order. It consults the state sync to understand which intervals of
    each snapshot need to be backfilled.

    The scheduler comes equipped with a simple ThreadPoolExecutor based evaluation engine.

    Args:
        snapshots: A collection of snapshots.
        snapshot_evaluator: The snapshot evaluator to execute queries.
        state_sync: The state sync to pull saved snapshots.
        max_workers: The maximum number of parallel queries to run.
        console: The rich instance used for printing scheduling information.
    """

    def __init__(
        self,
        snapshots: t.Iterable[Snapshot],
        snapshot_evaluator: SnapshotEvaluator,
        state_sync: StateSync,
        max_workers: int = 1,
        console: t.Optional[Console] = None,
    ):
        # Materialize once: the collection is traversed twice below, and a one-shot
        # iterable (e.g. a generator) would otherwise leave the second mapping empty.
        snapshots = list(snapshots)
        self.snapshots = {s.snapshot_id: s for s in snapshots}
        self.snapshot_per_version = _resolve_one_snapshot_per_version(snapshots)
        self.snapshot_evaluator = snapshot_evaluator
        self.state_sync = state_sync
        self.max_workers = max_workers
        self.console: Console = console or get_console()

    def batches(
        self,
        start: t.Optional[TimeLike] = None,
        end: t.Optional[TimeLike] = None,
        latest: t.Optional[TimeLike] = None,
        is_dev: bool = False,
    ) -> SnapshotToBatches:
        """Returns the batches of intervals to evaluate for each snapshot.

        Args:
            start: The start of the run. Defaults to the min model start date.
            end: The end of the run. Defaults to now.
            latest: The latest datetime to use for non-incremental queries.
            is_dev: Indicates whether the evaluation happens in the development mode and temporary
                tables / table clones should be used where applicable.

        Returns:
            A dict mapping one snapshot per (name, version) to its list of intervals.
        """
        validate_date_range(start, end)

        return self._interval_params(
            self.snapshot_per_version.values(),
            start,
            end,
            latest,
            is_dev=is_dev,
        )

    def evaluate(
        self,
        snapshot: Snapshot,
        start: TimeLike,
        end: TimeLike,
        latest: TimeLike,
        is_dev: bool = False,
        **kwargs: t.Any,
    ) -> None:
        """Evaluate a snapshot and add the processed interval to the state sync.

        The snapshot is evaluated, then audited, and only afterwards is the interval
        recorded and the console progress advanced by one.

        Args:
            snapshot: Snapshot to evaluate.
            start: The start datetime to render.
            end: The end datetime to render.
            latest: The latest datetime to use for non-incremental queries.
            is_dev: Indicates whether the evaluation happens in the development mode and temporary
                tables / table clones should be used where applicable.
            kwargs: Additional kwargs to pass to the renderer.

        Raises:
            KeyError: If a parent of the snapshot is not known to this scheduler.
        """
        validate_date_range(start, end)

        # Name-keyed view of the snapshot and its direct parents for the evaluator.
        snapshots = {
            **{p_sid.name: self.snapshots[p_sid] for p_sid in snapshot.parents},
            snapshot.name: snapshot,
        }

        self.snapshot_evaluator.evaluate(
            snapshot,
            start,
            end,
            latest,
            snapshots=snapshots,
            is_dev=is_dev,
            **kwargs,
        )
        self.snapshot_evaluator.audit(
            snapshot=snapshot,
            start=start,
            end=end,
            latest=latest,
            snapshots=snapshots,
            is_dev=is_dev,
            **kwargs,
        )
        self.state_sync.add_interval(snapshot.snapshot_id, start, end, is_dev=is_dev)
        self.console.update_snapshot_progress(snapshot.name, 1)

    def run(
        self,
        start: t.Optional[TimeLike] = None,
        end: t.Optional[TimeLike] = None,
        latest: t.Optional[TimeLike] = None,
        is_dev: bool = False,
    ) -> bool:
        """Concurrently runs all snapshots in topological order.

        Args:
            start: The start of the run. Defaults to the min model start date.
            end: The end of the run. Defaults to now.
            latest: The latest datetime to use for non-incremental queries.
            is_dev: Indicates whether the evaluation happens in the development mode and temporary
                tables / table clones should be used where applicable.

        Returns:
            True if the execution was successful and False otherwise.
        """
        validate_date_range(start, end)

        latest = latest or now()
        batches = self.batches(start, end, latest, is_dev=is_dev)
        dag = self._dag(batches)

        # Register one progress entry per distinct snapshot; the DAG holds one node
        # per (snapshot, interval) pair, so snapshots repeat.
        visited = set()
        for snapshot, _ in dag.sorted():
            if snapshot in visited:
                continue
            visited.add(snapshot)
            intervals = batches[snapshot]
            self.console.start_snapshot_progress(snapshot.name, len(intervals))

        def evaluate_node(node: SchedulingUnit) -> None:
            assert latest
            snapshot, (start, end) = node
            self.evaluate(snapshot, start, end, latest, is_dev=is_dev)

        with self.snapshot_evaluator.concurrent_context():
            # Errors are collected rather than raised so they can be reported below.
            errors, skipped_intervals = concurrent_apply_to_dag(
                dag,
                evaluate_node,
                self.max_workers,
                raise_on_error=False,
            )

        self.console.stop_snapshot_progress(success=not errors)

        for error in errors:
            sid = error.node[0]
            formatted_exception = "".join(format_exception(error.__cause__ or error))
            self.console.log_error(f"FAILED processing snapshot {sid}\n{formatted_exception}")

        skipped_snapshots = {i[0] for i in skipped_intervals}
        for skipped in skipped_snapshots:
            self.console.log_status_update(f"SKIPPED snapshot {skipped}\n")

        return not errors

    def _interval_params(
        self,
        snapshots: t.Iterable[Snapshot],
        start: t.Optional[TimeLike] = None,
        end: t.Optional[TimeLike] = None,
        latest: t.Optional[TimeLike] = None,
        is_dev: bool = False,
    ) -> SnapshotToBatches:
        """Find the optimal date interval parameters based on what needs processing and maximal batch size.

        The scheduler's own snapshots are combined with the stored snapshots that share
        a version with the targets, and the result is used as the catalog from which
        the missing intervals of every target are computed.

        Args:
            snapshots: The target snapshots.
            start: Start of the interval. Defaults to the earliest start date of the targets.
            end: End of the interval. Defaults to now.
            latest: The latest datetime to use for non-incremental queries. Defaults to now.
            is_dev: Indicates whether the evaluation happens in the development mode.

        Returns:
            A dict mapping each target snapshot to its batched intervals.
        """
        # The targets are traversed several times below; materialize them so that a
        # one-shot iterable is not exhausted after the first pass.
        snapshots = list(snapshots)
        all_snapshots = {s.snapshot_id: s for s in self.snapshots.values()}

        # When in development mode only consider intervals of the current forward-only snapshot and ignore
        # intervals of all snapshots with the same version that came before it.
        same_version_snapshots = (
            [s for s in snapshots if not s.is_forward_only or not s.is_paused]
            if is_dev
            else snapshots
        )
        stored_snapshots = self.state_sync.get_snapshots_with_same_version(same_version_snapshots)
        all_snapshots.update({s.snapshot_id: s for s in stored_snapshots})

        return compute_interval_params(
            snapshots,
            snapshots=all_snapshots,
            start=start or earliest_start_date(snapshots),
            end=end or now(),
            latest=latest or now(),
        )

    def _dag(self, batches: SnapshotToBatches) -> DAG[SchedulingUnit]:
        """Builds a DAG of snapshot intervals to be evaluated.

        Every (snapshot, interval) node depends on all intervals of each parent that is
        known to the scheduler and has batches for its (name, version). Snapshots of the
        incremental-by-unique-key kind additionally depend on all of their own earlier
        intervals.

        Args:
            batches: The batches of snapshots and intervals to evaluate.

        Returns:
            A DAG of snapshot intervals to be evaluated.
        """

        intervals_per_snapshot_version = {
            (snapshot.name, snapshot.version_get_or_generate()): intervals
            for snapshot, intervals in batches.items()
        }

        dag = DAG[SchedulingUnit]()
        for snapshot, intervals in batches.items():
            if not intervals:
                continue
            upstream_dependencies = [
                (self.snapshots[p_sid], interval)
                for p_sid in snapshot.parents
                if p_sid in self.snapshots
                for interval in intervals_per_snapshot_version.get(
                    (
                        self.snapshots[p_sid].name,
                        self.snapshots[p_sid].version_get_or_generate(),
                    ),
                    [],
                )
            ]
            for i, interval in enumerate(intervals):
                dag.add((snapshot, interval), upstream_dependencies)
                if snapshot.is_incremental_by_unique_key_kind:
                    dag.add(
                        (snapshot, interval),
                        [(snapshot, _interval) for _interval in intervals[:i]],
                    )

        return dag


def compute_interval_params(
    target: t.Iterable[SnapshotIdLike],
    *,
    snapshots: t.Dict[SnapshotId, Snapshot],
    start: TimeLike,
    end: TimeLike,
    latest: TimeLike,
) -> SnapshotToBatches:
    """Find the optimal date interval parameters based on what needs processing and maximal batch size.

    For each merged target snapshot, the missing intervals between the effective start
    (the later of ``start`` and the snapshot's own or inferred start date) and ``end``
    are computed and then grouped into batches.

    If a snapshot's model specifies a batch size, consecutive intervals are merged into batches
    of at most that many intervals; without a batch size, each run of contiguous intervals
    becomes a single batch. For example, if a model is supposed to run daily and has 70 days
    to backfill with a batch size set to 30, there would be 2 jobs with 30 days and 1 job with 10.

    Args:
        target: A set of target snapshots for which intervals should be computed.
        snapshots: A catalog of all available snapshots (including the target ones).
        start: Start of the interval.
        end: End of the interval.
        latest: The latest datetime to use for non-incremental queries.

    Returns:
        A dict mapping each merged target snapshot to its batched intervals (possibly empty).
    """
    start_dt = to_datetime(start)

    merged_targets = Snapshot.merge_snapshots(target, snapshots)

    # start_date() rebuilds an id-keyed dict whenever it is handed a non-dict, so build
    # that catalog once here instead of once per target snapshot.
    catalog = {s.snapshot_id: s for s in snapshots.values()}

    snapshots_to_batches = {}

    for snapshot in merged_targets:
        model_start_dt = max(start_date(snapshot, catalog) or start_dt, start_dt)
        snapshots_to_batches[snapshot] = [
            (to_datetime(s), to_datetime(e))
            for s, e in snapshot.missing_intervals(model_start_dt, end, latest)
        ]

    return _batched_intervals(snapshots_to_batches)


def start_date(
    snapshot: Snapshot, snapshots: t.Dict[SnapshotId, Snapshot] | t.Iterable[Snapshot]
) -> t.Optional[datetime]:
    """Get the effective/inferred start date for a snapshot.

    Not all snapshots define a start date. In those cases, the model's start date
    is inferred as the earliest start date among its parents found in the catalog.

    Args:
        snapshot: snapshot to infer start date.
        snapshots: a catalog of available snapshots, either keyed by snapshot id or
            as a plain iterable (which is converted to an id-keyed dict).

    Returns:
        Start datetime object, or None if no start date could be determined.
    """
    if snapshot.model.start:
        return to_datetime(snapshot.model.start)

    if not isinstance(snapshots, dict):
        snapshots = {snapshot.snapshot_id: snapshot for snapshot in snapshots}

    earliest = None

    for parent in snapshot.parents:
        if parent not in snapshots:
            continue

        start_dt = start_date(snapshots[parent], snapshots)

        if not earliest:
            earliest = start_dt
        elif start_dt:
            earliest = min(earliest, start_dt)

    return earliest


def earliest_start_date(snapshots: t.Iterable[Snapshot]) -> datetime:
    """Get the earliest start date from a collection of snapshots.

    Args:
        snapshots: Snapshots to find earliest start date.

    Returns:
        The earliest start date or yesterday if none is found.
    """
    snapshots = list(snapshots)
    if not snapshots:
        return yesterday()

    # Hand start_date() a ready-made id-keyed dict so that it does not rebuild the
    # same mapping from the list for every snapshot.
    catalog = {s.snapshot_id: s for s in snapshots}
    return min(start_date(snapshot, catalog) or yesterday() for snapshot in snapshots)


def _batched_intervals(params: SnapshotToBatches) -> SnapshotToBatches:
    """Merge each snapshot's intervals into batches.

    Contiguous intervals (the start of one equals the end of the previous) are merged
    into a single (start, end) batch. A new batch starts when the current one already
    holds ``model.batch_size`` intervals (if a batch size is set) or when there is a gap.

    Args:
        params: Mapping of snapshots to their individual intervals.

    Returns:
        Mapping of the same snapshots to their merged batches.
    """
    batches = {}

    for snapshot, intervals in params.items():
        batch_size = snapshot.model.batch_size
        batches_for_snapshot = []
        next_batch: t.List[Interval] = []
        for interval in intervals:
            if (batch_size and len(next_batch) >= batch_size) or (
                next_batch and interval[0] != next_batch[-1][-1]
            ):
                batches_for_snapshot.append((next_batch[0][0], next_batch[-1][-1]))
                next_batch = []
            next_batch.append(interval)
        if next_batch:
            batches_for_snapshot.append((next_batch[0][0], next_batch[-1][-1]))
        batches[snapshot] = batches_for_snapshot

    return batches


def _resolve_one_snapshot_per_version(
    snapshots: t.Iterable[Snapshot],
) -> t.Dict[t.Tuple[str, str], Snapshot]:
    """Pick a single snapshot for every (name, version) pair.

    The first snapshot seen for a pair is kept unless a later one is unpaused and
    either the kept one is not unpaused or the later one was created more recently.

    Args:
        snapshots: The snapshots to choose from.

    Returns:
        Mapping of (name, version) to the chosen snapshot.
    """
    snapshot_per_version: t.Dict[t.Tuple[str, str], Snapshot] = {}
    for snapshot in snapshots:
        key = (snapshot.name, snapshot.version_get_or_generate())
        if key not in snapshot_per_version:
            snapshot_per_version[key] = snapshot
        else:
            prev_snapshot = snapshot_per_version[key]
            if snapshot.unpaused_ts and (
                not prev_snapshot.unpaused_ts or snapshot.created_ts > prev_snapshot.created_ts
            ):
                snapshot_per_version[key] = snapshot

    return snapshot_per_version
class Scheduler:
    """Schedules and manages the evaluation of snapshots.

    The scheduler evaluates multiple snapshots with date intervals in the correct
    topological order. It consults the state sync to understand which intervals of
    each snapshot need to be backfilled.

    The scheduler comes equipped with a simple ThreadPoolExecutor based evaluation engine.

    Args:
        snapshots: A collection of snapshots.
        snapshot_evaluator: The snapshot evaluator to execute queries.
        state_sync: The state sync to pull saved snapshots.
        max_workers: The maximum number of parallel queries to run.
        console: The rich instance used for printing scheduling information.
    """

    def __init__(
        self,
        snapshots: t.Iterable[Snapshot],
        snapshot_evaluator: SnapshotEvaluator,
        state_sync: StateSync,
        max_workers: int = 1,
        console: t.Optional[Console] = None,
    ):
        # Materialize once: the collection is traversed twice below, and a one-shot
        # iterable (e.g. a generator) would otherwise leave the second mapping empty.
        snapshots = list(snapshots)
        self.snapshots = {s.snapshot_id: s for s in snapshots}
        self.snapshot_per_version = _resolve_one_snapshot_per_version(snapshots)
        self.snapshot_evaluator = snapshot_evaluator
        self.state_sync = state_sync
        self.max_workers = max_workers
        self.console: Console = console or get_console()

    def batches(
        self,
        start: t.Optional[TimeLike] = None,
        end: t.Optional[TimeLike] = None,
        latest: t.Optional[TimeLike] = None,
        is_dev: bool = False,
    ) -> SnapshotToBatches:
        """Returns the batches of intervals to evaluate for each snapshot.

        Args:
            start: The start of the run. Defaults to the min model start date.
            end: The end of the run. Defaults to now.
            latest: The latest datetime to use for non-incremental queries.
            is_dev: Indicates whether the evaluation happens in the development mode and temporary
                tables / table clones should be used where applicable.

        Returns:
            A dict mapping one snapshot per (name, version) to its list of intervals.
        """
        validate_date_range(start, end)

        return self._interval_params(
            self.snapshot_per_version.values(),
            start,
            end,
            latest,
            is_dev=is_dev,
        )

    def evaluate(
        self,
        snapshot: Snapshot,
        start: TimeLike,
        end: TimeLike,
        latest: TimeLike,
        is_dev: bool = False,
        **kwargs: t.Any,
    ) -> None:
        """Evaluate a snapshot and add the processed interval to the state sync.

        The snapshot is evaluated, then audited, and only afterwards is the interval
        recorded and the console progress advanced by one.

        Args:
            snapshot: Snapshot to evaluate.
            start: The start datetime to render.
            end: The end datetime to render.
            latest: The latest datetime to use for non-incremental queries.
            is_dev: Indicates whether the evaluation happens in the development mode and temporary
                tables / table clones should be used where applicable.
            kwargs: Additional kwargs to pass to the renderer.

        Raises:
            KeyError: If a parent of the snapshot is not known to this scheduler.
        """
        validate_date_range(start, end)

        # Name-keyed view of the snapshot and its direct parents for the evaluator.
        snapshots = {
            **{p_sid.name: self.snapshots[p_sid] for p_sid in snapshot.parents},
            snapshot.name: snapshot,
        }

        self.snapshot_evaluator.evaluate(
            snapshot,
            start,
            end,
            latest,
            snapshots=snapshots,
            is_dev=is_dev,
            **kwargs,
        )
        self.snapshot_evaluator.audit(
            snapshot=snapshot,
            start=start,
            end=end,
            latest=latest,
            snapshots=snapshots,
            is_dev=is_dev,
            **kwargs,
        )
        self.state_sync.add_interval(snapshot.snapshot_id, start, end, is_dev=is_dev)
        self.console.update_snapshot_progress(snapshot.name, 1)

    def run(
        self,
        start: t.Optional[TimeLike] = None,
        end: t.Optional[TimeLike] = None,
        latest: t.Optional[TimeLike] = None,
        is_dev: bool = False,
    ) -> bool:
        """Concurrently runs all snapshots in topological order.

        Args:
            start: The start of the run. Defaults to the min model start date.
            end: The end of the run. Defaults to now.
            latest: The latest datetime to use for non-incremental queries.
            is_dev: Indicates whether the evaluation happens in the development mode and temporary
                tables / table clones should be used where applicable.

        Returns:
            True if the execution was successful and False otherwise.
        """
        validate_date_range(start, end)

        latest = latest or now()
        batches = self.batches(start, end, latest, is_dev=is_dev)
        dag = self._dag(batches)

        # Register one progress entry per distinct snapshot; the DAG holds one node
        # per (snapshot, interval) pair, so snapshots repeat.
        visited = set()
        for snapshot, _ in dag.sorted():
            if snapshot in visited:
                continue
            visited.add(snapshot)
            intervals = batches[snapshot]
            self.console.start_snapshot_progress(snapshot.name, len(intervals))

        def evaluate_node(node: SchedulingUnit) -> None:
            assert latest
            snapshot, (start, end) = node
            self.evaluate(snapshot, start, end, latest, is_dev=is_dev)

        with self.snapshot_evaluator.concurrent_context():
            # Errors are collected rather than raised so they can be reported below.
            errors, skipped_intervals = concurrent_apply_to_dag(
                dag,
                evaluate_node,
                self.max_workers,
                raise_on_error=False,
            )

        self.console.stop_snapshot_progress(success=not errors)

        for error in errors:
            sid = error.node[0]
            formatted_exception = "".join(format_exception(error.__cause__ or error))
            self.console.log_error(f"FAILED processing snapshot {sid}\n{formatted_exception}")

        skipped_snapshots = {i[0] for i in skipped_intervals}
        for skipped in skipped_snapshots:
            self.console.log_status_update(f"SKIPPED snapshot {skipped}\n")

        return not errors

    def _interval_params(
        self,
        snapshots: t.Iterable[Snapshot],
        start: t.Optional[TimeLike] = None,
        end: t.Optional[TimeLike] = None,
        latest: t.Optional[TimeLike] = None,
        is_dev: bool = False,
    ) -> SnapshotToBatches:
        """Find the optimal date interval parameters based on what needs processing and maximal batch size.

        The scheduler's own snapshots are combined with the stored snapshots that share
        a version with the targets, and the result is used as the catalog from which
        the missing intervals of every target are computed.

        Args:
            snapshots: The target snapshots.
            start: Start of the interval. Defaults to the earliest start date of the targets.
            end: End of the interval. Defaults to now.
            latest: The latest datetime to use for non-incremental queries. Defaults to now.
            is_dev: Indicates whether the evaluation happens in the development mode.

        Returns:
            A dict mapping each target snapshot to its batched intervals.
        """
        # The targets are traversed several times below; materialize them so that a
        # one-shot iterable is not exhausted after the first pass.
        snapshots = list(snapshots)
        all_snapshots = {s.snapshot_id: s for s in self.snapshots.values()}

        # When in development mode only consider intervals of the current forward-only snapshot and ignore
        # intervals of all snapshots with the same version that came before it.
        same_version_snapshots = (
            [s for s in snapshots if not s.is_forward_only or not s.is_paused]
            if is_dev
            else snapshots
        )
        stored_snapshots = self.state_sync.get_snapshots_with_same_version(same_version_snapshots)
        all_snapshots.update({s.snapshot_id: s for s in stored_snapshots})

        return compute_interval_params(
            snapshots,
            snapshots=all_snapshots,
            start=start or earliest_start_date(snapshots),
            end=end or now(),
            latest=latest or now(),
        )

    def _dag(self, batches: SnapshotToBatches) -> DAG[SchedulingUnit]:
        """Builds a DAG of snapshot intervals to be evaluated.

        Every (snapshot, interval) node depends on all intervals of each parent that is
        known to the scheduler and has batches for its (name, version). Snapshots of the
        incremental-by-unique-key kind additionally depend on all of their own earlier
        intervals.

        Args:
            batches: The batches of snapshots and intervals to evaluate.

        Returns:
            A DAG of snapshot intervals to be evaluated.
        """

        intervals_per_snapshot_version = {
            (snapshot.name, snapshot.version_get_or_generate()): intervals
            for snapshot, intervals in batches.items()
        }

        dag = DAG[SchedulingUnit]()
        for snapshot, intervals in batches.items():
            if not intervals:
                continue
            upstream_dependencies = [
                (self.snapshots[p_sid], interval)
                for p_sid in snapshot.parents
                if p_sid in self.snapshots
                for interval in intervals_per_snapshot_version.get(
                    (
                        self.snapshots[p_sid].name,
                        self.snapshots[p_sid].version_get_or_generate(),
                    ),
                    [],
                )
            ]
            for i, interval in enumerate(intervals):
                dag.add((snapshot, interval), upstream_dependencies)
                if snapshot.is_incremental_by_unique_key_kind:
                    dag.add(
                        (snapshot, interval),
                        [(snapshot, _interval) for _interval in intervals[:i]],
                    )

        return dag
Schedules and manages the evaluation of snapshots.
The scheduler evaluates multiple snapshots with date intervals in the correct topological order. It consults the state sync to understand which intervals of each snapshot need to be backfilled.
The scheduler comes equipped with a simple ThreadPoolExecutor based evaluation engine.
Arguments:
- snapshots: A collection of snapshots.
- snapshot_evaluator: The snapshot evaluator to execute queries.
- state_sync: The state sync to pull saved snapshots.
- max_workers: The maximum number of parallel queries to run.
- console: The rich instance used for printing scheduling information.
def __init__(
    self,
    snapshots: t.Iterable[Snapshot],
    snapshot_evaluator: SnapshotEvaluator,
    state_sync: StateSync,
    max_workers: int = 1,
    console: t.Optional[Console] = None,
):
    """Set up the scheduler state.

    Args:
        snapshots: A collection of snapshots.
        snapshot_evaluator: The snapshot evaluator to execute queries.
        state_sync: The state sync to pull saved snapshots.
        max_workers: The maximum number of parallel queries to run.
        console: The console used for printing scheduling information. Falls back
            to ``get_console()`` when not provided.
    """
    # Materialize once: the collection is traversed twice below, and a one-shot
    # iterable (e.g. a generator) would otherwise leave the second mapping empty.
    snapshots = list(snapshots)
    # Lookup of every provided snapshot by its id.
    self.snapshots = {s.snapshot_id: s for s in snapshots}
    # A single representative snapshot for every (name, version) pair.
    self.snapshot_per_version = _resolve_one_snapshot_per_version(snapshots)
    self.snapshot_evaluator = snapshot_evaluator
    self.state_sync = state_sync
    self.max_workers = max_workers
    self.console: Console = console or get_console()
def batches(
    self,
    start: t.Optional[TimeLike] = None,
    end: t.Optional[TimeLike] = None,
    latest: t.Optional[TimeLike] = None,
    is_dev: bool = False,
) -> SnapshotToBatches:
    """Compute the batches of intervals to evaluate for each snapshot.

    Args:
        start: The start of the run. Defaults to the min model start date.
        end: The end of the run. Defaults to now.
        latest: The latest datetime to use for non-incremental queries.
        is_dev: Indicates whether the evaluation happens in the development mode and temporary
            tables / table clones should be used where applicable.

    Returns:
        The mapping produced by ``_interval_params`` for the one-per-version snapshots.
    """
    validate_date_range(start, end)

    # Only one snapshot per (name, version) is scheduled.
    targets = self.snapshot_per_version.values()
    return self._interval_params(targets, start=start, end=end, latest=latest, is_dev=is_dev)
Returns the batches of intervals to evaluate, keyed by snapshot.
Arguments:
- start: The start of the run. Defaults to the min model start date.
- end: The end of the run. Defaults to now.
- latest: The latest datetime to use for non-incremental queries.
- is_dev: Indicates whether the evaluation happens in the development mode and temporary tables / table clones should be used where applicable.
def evaluate(
    self,
    snapshot: Snapshot,
    start: TimeLike,
    end: TimeLike,
    latest: TimeLike,
    is_dev: bool = False,
    **kwargs: t.Any,
) -> None:
    """Evaluate and audit a snapshot, then record the processed interval.

    Args:
        snapshot: Snapshot to evaluate.
        start: The start datetime to render.
        end: The end datetime to render.
        latest: The latest datetime to use for non-incremental queries.
        is_dev: Indicates whether the evaluation happens in the development mode and temporary
            tables / table clones should be used where applicable.
        kwargs: Additional kwargs to pass to the renderer.
    """
    validate_date_range(start, end)

    # Direct parents keyed by name, with the snapshot itself added last so that it
    # takes precedence over a parent with the same name.
    by_name = {parent_id.name: self.snapshots[parent_id] for parent_id in snapshot.parents}
    by_name[snapshot.name] = snapshot

    evaluator = self.snapshot_evaluator
    evaluator.evaluate(
        snapshot,
        start,
        end,
        latest,
        snapshots=by_name,
        is_dev=is_dev,
        **kwargs,
    )
    evaluator.audit(
        snapshot=snapshot,
        start=start,
        end=end,
        latest=latest,
        snapshots=by_name,
        is_dev=is_dev,
        **kwargs,
    )

    # Reached only when neither the evaluation nor the audit raised.
    self.state_sync.add_interval(snapshot.snapshot_id, start, end, is_dev=is_dev)
    self.console.update_snapshot_progress(snapshot.name, 1)
Evaluate a snapshot and add the processed interval to the state sync.
Arguments:
- snapshot: Snapshot to evaluate.
- start: The start datetime to render.
- end: The end datetime to render.
- latest: The latest datetime to use for non-incremental queries.
- is_dev: Indicates whether the evaluation happens in the development mode and temporary tables / table clones should be used where applicable.
- kwargs: Additional kwargs to pass to the renderer.
def run(
    self,
    start: t.Optional[TimeLike] = None,
    end: t.Optional[TimeLike] = None,
    latest: t.Optional[TimeLike] = None,
    is_dev: bool = False,
) -> bool:
    """Run every scheduled snapshot interval concurrently, in topological order.

    Args:
        start: The start of the run. Defaults to the min model start date.
        end: The end of the run. Defaults to now.
        latest: The latest datetime to use for non-incremental queries.
        is_dev: Indicates whether the evaluation happens in the development mode and temporary
            tables / table clones should be used where applicable.

    Returns:
        True if the execution was successful and False otherwise.
    """
    validate_date_range(start, end)

    latest = latest or now()
    pending = self.batches(start, end, latest, is_dev=is_dev)
    graph = self._dag(pending)

    # The graph has one node per (snapshot, interval); announce each snapshot once.
    announced = set()
    for target, _ in graph.sorted():
        if target not in announced:
            announced.add(target)
            self.console.start_snapshot_progress(target.name, len(pending[target]))

    def evaluate_node(unit: SchedulingUnit) -> None:
        assert latest
        unit_snapshot, (unit_start, unit_end) = unit
        self.evaluate(unit_snapshot, unit_start, unit_end, latest, is_dev=is_dev)

    with self.snapshot_evaluator.concurrent_context():
        # Failures are returned instead of raised so that they can be reported below.
        errors, skipped_intervals = concurrent_apply_to_dag(
            graph,
            evaluate_node,
            self.max_workers,
            raise_on_error=False,
        )

    succeeded = not errors
    self.console.stop_snapshot_progress(success=succeeded)

    for failure in errors:
        failed = failure.node[0]
        trace = "".join(format_exception(failure.__cause__ or failure))
        self.console.log_error(f"FAILED processing snapshot {failed}\n{trace}")

    # Several skipped intervals may belong to one snapshot; report each snapshot once.
    for skipped in {unit[0] for unit in skipped_intervals}:
        self.console.log_status_update(f"SKIPPED snapshot {skipped}\n")

    return succeeded
Concurrently runs all snapshots in topological order.
Arguments:
- start: The start of the run. Defaults to the min model start date.
- end: The end of the run. Defaults to now.
- latest: The latest datetime to use for non-incremental queries.
- is_dev: Indicates whether the evaluation happens in the development mode and temporary tables / table clones should be used where applicable.
Returns:
True if the execution was successful and False otherwise.
def compute_interval_params(
    target: t.Iterable[SnapshotIdLike],
    *,
    snapshots: t.Dict[SnapshotId, Snapshot],
    start: TimeLike,
    end: TimeLike,
    latest: TimeLike,
) -> SnapshotToBatches:
    """Find the optimal date interval parameters based on what needs processing and maximal batch size.

    For each merged target snapshot, the missing intervals between the effective start
    (the later of ``start`` and the snapshot's own or inferred start date) and ``end``
    are computed and then grouped into batches.

    If a snapshot's model specifies a batch size, consecutive intervals are merged into batches
    of at most that many intervals; without a batch size, each run of contiguous intervals
    becomes a single batch. For example, if a model is supposed to run daily and has 70 days
    to backfill with a batch size set to 30, there would be 2 jobs with 30 days and 1 job with 10.

    Args:
        target: A set of target snapshots for which intervals should be computed.
        snapshots: A catalog of all available snapshots (including the target ones).
        start: Start of the interval.
        end: End of the interval.
        latest: The latest datetime to use for non-incremental queries.

    Returns:
        A dict mapping each merged target snapshot to its batched intervals (possibly empty).
    """
    start_dt = to_datetime(start)

    merged_targets = Snapshot.merge_snapshots(target, snapshots)

    # start_date() rebuilds an id-keyed dict whenever it is handed a non-dict, so build
    # that catalog once here instead of once per target snapshot.
    catalog = {s.snapshot_id: s for s in snapshots.values()}

    snapshots_to_batches = {}

    for snapshot in merged_targets:
        # Never start earlier than the requested start.
        model_start_dt = max(start_date(snapshot, catalog) or start_dt, start_dt)
        snapshots_to_batches[snapshot] = [
            (to_datetime(s), to_datetime(e))
            for s, e in snapshot.missing_intervals(model_start_dt, end, latest)
        ]

    return _batched_intervals(snapshots_to_batches)
Find the optimal date interval parameters based on what needs processing and maximal batch size.
For each model name, find all dependencies and look for a stored snapshot from the metastore. If a snapshot is found, calculate the missing intervals that need to be processed given the passed in start and end intervals.
If a snapshot's model specifies a batch size, consecutive intervals are merged into batches of a size that is less than or equal to the configured one. If no batch size is specified, then it uses the intervals that correspond to the model's cron expression. For example, if a model is supposed to run daily and has 70 days to backfill with a batch size set to 30, there would be 2 jobs with 30 days and 1 job with 10.
Arguments:
- target: A set of target snapshots for which intervals should be computed.
- snapshots: A catalog of all available snapshots (including the target ones).
- start: Start of the interval.
- end: End of the interval.
- latest: The latest datetime to use for non-incremental queries.
Returns:
A dict containing all snapshots needing to be run with their associated interval params.
def start_date(
    snapshot: Snapshot, snapshots: t.Dict[SnapshotId, Snapshot] | t.Iterable[Snapshot]
) -> t.Optional[datetime]:
    """Resolve the explicit or inferred start date of a snapshot.

    A snapshot whose model declares a start uses it directly. Otherwise the start is
    the earliest one resolved (recursively) for its parents present in the catalog.

    Args:
        snapshot: snapshot to infer start date.
        snapshots: a catalog of available snapshots; a non-dict iterable is first
            converted into a dict keyed by snapshot id.

    Returns:
        Start datetime object, or None when neither the snapshot nor any of its
        known ancestors provides one.
    """
    own_start = snapshot.model.start
    if own_start:
        return to_datetime(own_start)

    if isinstance(snapshots, dict):
        catalog = snapshots
    else:
        catalog = {s.snapshot_id: s for s in snapshots}

    # Parents missing from the catalog are ignored.
    parent_starts = [
        start_date(catalog[parent_id], catalog)
        for parent_id in snapshot.parents
        if parent_id in catalog
    ]
    resolved = [parent_start for parent_start in parent_starts if parent_start]
    return min(resolved) if resolved else None
Get the effective/inferred start date for a snapshot.
Not all snapshots define a start date. In those cases, the model's start date can be inferred from its parents' start dates.
Arguments:
- snapshot: snapshot to infer start date.
- snapshots: a catalog of available snapshots.
Returns:
Start datetime object.
def earliest_start_date(snapshots: t.Iterable[Snapshot]) -> datetime:
    """Get the earliest start date from a collection of snapshots.

    Args:
        snapshots: Snapshots to find earliest start date.

    Returns:
        The earliest start date or yesterday if none is found.
    """
    snapshots = list(snapshots)
    if not snapshots:
        return yesterday()

    # Hand start_date() a ready-made id-keyed dict so that it does not rebuild the
    # same mapping from the list for every snapshot.
    catalog = {s.snapshot_id: s for s in snapshots}
    # A snapshot without a resolvable start date counts as starting yesterday.
    return min(start_date(snapshot, catalog) or yesterday() for snapshot in snapshots)
Get the earliest start date from a collection of snapshots.
Arguments:
- snapshots: Snapshots to find earliest start date.
Returns:
The earliest start date or yesterday if none is found.