Source listing: sqlmesh.core.model.definition

   1from __future__ import annotations
   2
   3import ast
   4import sys
   5import types
   6import typing as t
   7from difflib import unified_diff
   8from itertools import zip_longest
   9from pathlib import Path
  10
  11import numpy as np
  12import pandas as pd
  13from astor import to_source
  14from pandas.core.dtypes.common import is_numeric_dtype
  15from pydantic import Field
  16from sqlglot import exp
  17from sqlglot.diff import ChangeDistiller, Insert, Keep
  18from sqlglot.optimizer.scope import traverse_scope
  19from sqlglot.schema import MappingSchema
  20from sqlglot.time import format_time
  21
  22from sqlmesh.core import constants as c
  23from sqlmesh.core import dialect as d
  24from sqlmesh.core.engine_adapter import PySparkDataFrame
  25from sqlmesh.core.hooks import HookRegistry, hook
  26from sqlmesh.core.macros import MacroEvaluator, MacroRegistry, macro
  27from sqlmesh.core.model.common import expression_validator, parse_model_name
  28from sqlmesh.core.model.kind import SeedKind
  29from sqlmesh.core.model.meta import HookCall, ModelMeta
  30from sqlmesh.core.model.seed import Seed, create_seed
  31from sqlmesh.core.renderer import ExpressionRenderer, QueryRenderer
  32from sqlmesh.utils.date import TimeLike, make_inclusive, to_datetime
  33from sqlmesh.utils.errors import ConfigError, SQLMeshError, raise_config_error
  34from sqlmesh.utils.jinja import JinjaMacroRegistry
  35from sqlmesh.utils.metaprogramming import (
  36    Executable,
  37    build_env,
  38    prepare_env,
  39    print_exception,
  40    serialize_env,
  41)
  42from sqlmesh.utils.pandas import filter_df_by_timelike
  43
  44if t.TYPE_CHECKING:
  45    from sqlmesh.core.audit import Audit
  46    from sqlmesh.core.context import ExecutionContext
  47    from sqlmesh.core.engine_adapter import EngineAdapter
  48    from sqlmesh.core.engine_adapter._typing import DF, QueryOrDF
  49    from sqlmesh.core.snapshot import Snapshot
  50
  51if sys.version_info >= (3, 9):
  52    from typing import Annotated, Literal
  53else:
  54    from typing_extensions import Annotated, Literal
  55
  56
class _Model(ModelMeta, frozen=True):
    """Model is the core abstraction for user defined datasets.

    A model consists of logic that fetches the data (a SQL query, a Python script or a seed) and metadata
    associated with it. Models can be run on arbitrary cadences and support incremental or full refreshes.
    Models can also be materialized into physical tables or shared across other models as temporary views.

    Example:
        MODEL (
            name           sushi.order_items,
            owner          jen,
            cron           '@daily',
            batch_size     30,
            start          '2020-01-01',
            partitioned_by ds
        );

        @DEF(var, 'my_var');

        SELECT
          1 AS column_a # my first column,
          @var AS my_column #my second column,
        ;

    Args:
        name: The name of the model, which is of the form [catalog].[db].table.
            The catalog and db are optional.
        dialect: The SQL dialect that the model's query is written in. By default,
            this is assumed to be the dialect of the context.
        owner: The owner of the model.
        cron: A cron string specifying how often the model should be refreshed, leveraging the
            [croniter](https://github.com/kiorky/croniter) library.
        description: The optional model description.
        stamp: An optional arbitrary string sequence used to create new model versions without making
            changes to any of the functional components of the definition.
        start: The earliest date that the model will be backfilled for. If this is None,
            then the date is inferred by taking the most recent start date of its ancestors.
            The start date can be a static datetime or a relative datetime like "1 year ago"
        batch_size: The maximum number of intervals that can be run per backfill job. If this is None,
            then backfilling this model will do all of history in one job. If this is set, a model's backfill
            will be chunked such that each individual job will only contain jobs with max `batch_size` intervals.
        storage_format: The storage format used to store the physical table, only applicable in certain engines.
            (eg. 'parquet')
        partitioned_by: The partition columns, only applicable in certain engines. (eg. (ds, hour))
        pre: Pre-hooks to run before the model executes.
        post: Post-hooks to run after the model executes.
        expressions: All of the expressions between the model definition and final query, used for setting certain variables or environments.
        python_env: Dictionary containing all global variables needed to render the model's macros.
    """

    # Trailing-underscore fields are aliased so that the public names can be
    # exposed as computed properties (`expressions`, `python_env`) below.
    expressions_: t.Optional[t.List[exp.Expression]] = Field(default=None, alias="expressions")
    python_env_: t.Optional[t.Dict[str, Executable]] = Field(default=None, alias="python_env")
    jinja_macros: JinjaMacroRegistry = JinjaMacroRegistry()

    # Non-pydantic private attributes used as lazily-populated caches.
    _path: Path = Path()
    _depends_on: t.Optional[t.Set[str]] = None
    _column_descriptions: t.Optional[t.Dict[str, str]] = None

    _expressions_validator = expression_validator

    def render(
        self,
        context: ExecutionContext,
        *,
        start: t.Optional[TimeLike] = None,
        end: t.Optional[TimeLike] = None,
        latest: t.Optional[TimeLike] = None,
        engine_adapter: t.Optional[EngineAdapter] = None,
        **kwargs: t.Any,
    ) -> t.Generator[QueryOrDF, None, None]:
        """Renders the content of this model in a form of either a SELECT query, executing which the data for this model can
        be fetched, or a dataframe object which contains the data itself.

        The type of the returned object (query or dataframe) depends on whether the model was sourced from a SQL query,
        a Python script or a pre-built dataset (seed).

        Args:
            context: The execution context used for fetching data.
            start: The start date/time of the run.
            end: The end date/time of the run.
            latest: The latest date/time to use for the run.
            engine_adapter: Optional engine adapter passed through to query rendering.

        Returns:
            A generator which yields either a query object or one of the supported dataframe objects.
        """
        yield self.render_query(
            start=start,
            end=end,
            latest=latest,
            snapshots=context.snapshots,
            is_dev=context.is_dev,
            engine_adapter=engine_adapter,
            **kwargs,
        )

    def render_definition(self, include_python: bool = True) -> t.List[exp.Expression]:
        """Returns the original list of sql expressions comprising the model definition.

        Args:
            include_python: Whether or not to include Python code in the rendered definition.
        """
        expressions = []
        comment = None
        for field in ModelMeta.__fields__.values():
            field_value = getattr(self, field.name)

            # Only render fields that deviate from their defaults to keep the
            # definition minimal.
            if field_value != field.default:
                if field.name == "description":
                    # The description becomes a comment on the MODEL block
                    # rather than a property.
                    comment = field_value
                elif field.name == "kind":
                    expressions.append(
                        exp.Property(
                            this="kind",
                            value=field_value.to_expression(dialect=self.dialect),
                        )
                    )
                else:
                    expressions.append(
                        exp.Property(
                            this=field.alias or field.name,
                            value=META_FIELD_CONVERTER.get(field.name, exp.to_identifier)(
                                field_value
                            ),
                        )
                    )

        model = d.Model(expressions=expressions)
        model.comments = [comment] if comment else None

        python_expressions = []
        if include_python:
            # Imports and definitions are emitted verbatim; plain values are
            # re-serialized as assignments.
            python_env = d.PythonCode(
                expressions=[
                    v.payload if v.is_import or v.is_definition else f"{k} = {v.payload}"
                    for k, v in self.sorted_python_env
                ]
            )
            if python_env.expressions:
                python_expressions.append(python_env)

        return [
            model,
            *self.expressions,
            *python_expressions,
        ]

    def render_query(
        self,
        *,
        start: t.Optional[TimeLike] = None,
        end: t.Optional[TimeLike] = None,
        latest: t.Optional[TimeLike] = None,
        snapshots: t.Optional[t.Dict[str, Snapshot]] = None,
        expand: t.Iterable[str] = tuple(),
        is_dev: bool = False,
        engine_adapter: t.Optional[EngineAdapter] = None,
        **kwargs: t.Any,
    ) -> exp.Subqueryable:
        """Renders a model's query, expanding macros with provided kwargs, and optionally expanding referenced models.

        The base implementation produces a placeholder SELECT of NULL values cast
        to the model's column types; subclasses that carry a real query override it.

        Args:
            start: The start datetime to render. Defaults to epoch start.
            end: The end datetime to render. Defaults to epoch start.
            latest: The latest datetime to use for non-incremental queries. Defaults to epoch start.
            snapshots: All upstream snapshots (by model name) to use for expansion and mapping of physical locations.
            expand: Expand referenced models as subqueries. This is used to bypass backfills when running queries
                that depend on materialized tables.  Model definitions are inlined and can thus be run end to
                end on the fly.
            is_dev: Indicates whether the rendering happens in the development mode and temporary
                tables / table clones should be used where applicable.
            kwargs: Additional kwargs to pass to the renderer.

        Returns:
            The rendered expression.
        """
        return exp.select(
            *(
                exp.alias_(f"NULL::{column_type}", name)
                for name, column_type in self.columns_to_types.items()
            )
        ).from_(exp.values([tuple([1])], alias="t", columns=["dummy"]))

    def ctas_query(
        self, snapshots: t.Dict[str, Snapshot], is_dev: bool = False
    ) -> exp.Subqueryable:
        """Return a dummy query to do a CTAS.

        If a model's column types are unknown, the only way to create the table is to
        run the fully expanded query. This can be expensive so we add a WHERE FALSE to all
        SELECTS and hopefully the optimizer is smart enough to not do anything.

        Args:
            snapshots: All upstream snapshots (by model name) to use for expansion and mapping of physical locations.
            is_dev: Indicates whether the creation happens in the development mode and temporary
                tables / table clones should be used where applicable.
        Returns:
            The mocked out ctas query.
        """
        query = self.render_query(snapshots=snapshots, is_dev=is_dev)
        # the query is expanded so it's been copied, it's safe to mutate.
        for select in query.find_all(exp.Select):
            select.where("FALSE", copy=False)

        return query

    def run_pre_hooks(
        self,
        context: ExecutionContext,
        start: t.Optional[TimeLike] = None,
        end: t.Optional[TimeLike] = None,
        latest: t.Optional[TimeLike] = None,
        **kwargs: t.Any,
    ) -> None:
        """Runs all pre hooks.

        Args:
            context: The execution context used for running the hook.
            start: The start date/time of the run.
            end: The end date/time of the run.
            latest: The latest date/time to use for the run.
        """
        self._run_hooks(self.pre, context=context, start=start, end=end, latest=latest, **kwargs)

    def run_post_hooks(
        self,
        context: ExecutionContext,
        start: t.Optional[TimeLike] = None,
        end: t.Optional[TimeLike] = None,
        latest: t.Optional[TimeLike] = None,
        **kwargs: t.Any,
    ) -> None:
        """Runs all post hooks.

        Args:
            context: The execution context used for running the hook.
            start: The start date/time of the run.
            end: The end date/time of the run.
            latest: The latest date/time to use for the run.
        """
        self._run_hooks(self.post, context=context, start=start, end=end, latest=latest, **kwargs)

    def referenced_audits(self, audits: t.Dict[str, Audit]) -> t.List[Audit]:
        """Returns audits referenced in this model.

        Args:
            audits: Available audits by name.

        Raises:
            ConfigError: If a referenced audit is neither user-defined nor built-in.
        """
        from sqlmesh.core.audit import BUILT_IN_AUDITS

        referenced_audits = []
        for audit_name, _ in self.audits:
            if audit_name in audits:
                referenced_audits.append(audits[audit_name])
            elif audit_name not in BUILT_IN_AUDITS:
                # Built-in audits are resolved elsewhere; anything else is a
                # configuration error.
                raise_config_error(
                    f"Unknown audit '{audit_name}' referenced in model '{self.name}'",
                    self._path,
                )
        return referenced_audits

    def update_schema(self, schema: MappingSchema) -> None:
        """Updates the schema associated with this model.

        No-op in the base class; overridden by subclasses (e.g. SqlModel) that
        render queries against a schema.

        Args:
            schema: The new schema.
        """

    def text_diff(self, other: Model) -> str:
        """Produce a text diff against another model.

        Args:
            other: The model to diff against.

        Returns:
            A unified text diff showing additions and deletions.
        """
        # Each rendered definition is [meta, *statements, query]; diff the
        # corresponding sections pairwise.
        meta_a, *statements_a, query_a = self.render_definition()
        meta_b, *statements_b, query_b = other.render_definition()
        return "\n".join(
            (
                d.text_diff(meta_a, meta_b, self.dialect),
                *(
                    d.text_diff(sa, sb, self.dialect)
                    for sa, sb in zip_longest(statements_a, statements_b)
                ),
                d.text_diff(query_a, query_b, self.dialect),
            )
        ).strip()

    def set_time_format(self, default_time_format: str = c.DEFAULT_TIME_COLUMN_FORMAT) -> None:
        """Sets the default time format for a model.

        Args:
            default_time_format: A python time format used as the default format when none is provided.
        """
        if not self.time_column:
            return

        if self.time_column.format:
            # Transpile the time column format into the generic dialect
            formatted_time = format_time(
                self.time_column.format,
                d.Dialect.get_or_raise(self.dialect).time_mapping,
            )
            assert formatted_time is not None
            self.time_column.format = formatted_time
        else:
            self.time_column.format = default_time_format

    def convert_to_time_column(self, time: TimeLike) -> exp.Expression:
        """Convert a TimeLike object to the same time format and type as the model's time column."""
        if self.time_column:
            if self.time_column.format:
                time = to_datetime(time).strftime(self.time_column.format)

            # Pick the literal representation based on the column's declared type.
            time_column_type = self.columns_to_types[self.time_column.column]
            if time_column_type.this in exp.DataType.TEXT_TYPES:
                return exp.Literal.string(time)
            elif time_column_type.this in exp.DataType.NUMERIC_TYPES:
                return exp.Literal.number(time)
            elif time_column_type.this in exp.DataType.TEMPORAL_TYPES:
                return exp.cast(exp.Literal.string(time), time_column_type)
        return exp.convert(time)

    @property
    def depends_on(self) -> t.Set[str]:
        """All of the upstream dependencies referenced in the model's query, excluding self references.

        Returns:
            A set of all the upstream table names.
        """
        # An explicitly configured dependency set takes precedence over
        # inference from the rendered query.
        if self.depends_on_ is not None:
            return self.depends_on_

        if self._depends_on is None:
            self._depends_on = _find_tables(self.render_query()) - {self.name}
        return self._depends_on

    @property
    def columns_to_types(self) -> t.Dict[str, exp.DataType]:
        """Returns the mapping of column names to types of this model.

        Raises:
            SQLMeshError: If no column information was configured for this model.
        """
        if self.columns_to_types_ is not None:
            return self.columns_to_types_
        raise SQLMeshError(f"Column information has not been provided for model '{self.name}'")

    @property
    def annotated(self) -> bool:
        """Checks if all column projection types of this model are known."""
        return all(
            column_type.this != exp.DataType.Type.UNKNOWN
            for column_type in self.columns_to_types.values()
        )

    @property
    def sorted_python_env(self) -> t.List[t.Tuple[str, Executable]]:
        """Returns the python env sorted by executable kind and then var name."""
        return sorted(self.python_env.items(), key=lambda x: (x[1].kind, x[0]))

    @property
    def macro_definitions(self) -> t.List[d.MacroDef]:
        """All macro definitions from the list of expressions."""
        return [s for s in self.expressions if isinstance(s, d.MacroDef)]

    @property
    def sql_statements(self) -> t.Iterator[exp.Expression]:
        """All sql statements from the list of expressions, rendered and with empty renders dropped."""
        rendered_statements = (
            self._expression_renderer(s).render()
            for s in self.expressions
            if not isinstance(s, d.MacroDef)
        )
        return (statement for statement in rendered_statements if statement is not None)

    @property
    def view_name(self) -> str:
        """The table part of the model name (without catalog/db qualifiers)."""
        return parse_model_name(self.name)[2]

    @property
    def expressions(self) -> t.List[exp.Expression]:
        """The model's extra expressions, or an empty list when none were set."""
        return self.expressions_ or []

    @property
    def python_env(self) -> t.Dict[str, Executable]:
        """The model's python environment, or an empty dict when none was set."""
        return self.python_env_ or {}

    @property
    def contains_star_query(self) -> bool:
        """Returns True if the model's query contains a star projection."""
        return False

    @property
    def is_sql(self) -> bool:
        """True if this model is backed by a SQL query."""
        return False

    @property
    def is_python(self) -> bool:
        """True if this model is backed by a Python script."""
        return False

    @property
    def is_seed(self) -> bool:
        """True if this model is backed by a pre-built seed dataset."""
        return False

    def validate_definition(self) -> None:
        """Validates the model's definition.

        Models are not allowed to have duplicate column names, non-explicitly casted columns,
        or non inferrable column names.

        Raises:
            ConfigError
        """
        if self.partitioned_by:
            unique_partition_keys = {k.strip().lower() for k in self.partitioned_by}
            if len(self.partitioned_by) != len(unique_partition_keys):
                raise_config_error(
                    "All partition keys must be unique in the model definition",
                    self._path,
                )

            # Every partition key must correspond to a known column
            # (comparison is case-insensitive).
            column_names = {c.lower() for c in self.columns_to_types}
            missing_keys = unique_partition_keys - column_names
            if missing_keys:
                missing_keys_str = ", ".join(f"'{k}'" for k in sorted(missing_keys))
                raise_config_error(
                    f"Partition keys [{missing_keys_str}] are missing in the model definition",
                    self._path,
                )

        if self.kind.is_incremental_by_time_range and not self.time_column:
            raise_config_error(
                "Incremental by time range models must have a time_column field.",
                self._path,
            )

    def is_breaking_change(self, previous: Model) -> t.Optional[bool]:
        """Determines whether this model is a breaking change in relation to the `previous` model.

        Args:
            previous: The previous model to compare against.

        Returns:
            True if this model instance represents a breaking change, False if it's a non-breaking change
            and None if the nature of the change can't be determined.
        """
        return None

    def _run_hooks(
        self,
        hooks: t.List[HookCall],
        *,
        context: ExecutionContext,
        start: t.Optional[TimeLike] = None,
        end: t.Optional[TimeLike] = None,
        latest: t.Optional[TimeLike] = None,
        **kwargs: t.Any,
    ) -> None:
        """Executes the given hook calls, either as SQL expressions or Python functions.

        Args:
            hooks: The hook calls to execute, in order.
            context: The execution context used for running the hooks.
            start: The start date/time of the run.
            end: The end date/time of the run.
            latest: The latest date/time to use for the run.
        """
        env = prepare_env(self.python_env)
        start, end = make_inclusive(start or c.EPOCH, end or c.EPOCH)
        latest = to_datetime(latest or c.EPOCH)

        macro_evaluator = MacroEvaluator()

        # NOTE(review): the loop variable shadows the module-level `hook`
        # import from sqlmesh.core.hooks within this method.
        for hook in hooks:
            if isinstance(hook, exp.Expression):
                # SQL hooks are rendered and executed directly against the engine.
                rendered = self._expression_renderer(hook).render(
                    start=start,
                    end=end,
                    latest=latest,
                    engine_adapter=context.engine_adapter,
                    **kwargs,
                )
                if rendered:
                    context.engine_adapter.execute(rendered)
            else:
                name, hook_kwargs = hook
                # Evaluate SQL expressions before passing them into a Python
                # function as arguments.
                evaluated_hook_kwargs = {
                    key: macro_evaluator.eval_expression(value)
                    if isinstance(value, exp.Expression)
                    else value
                    for key, value in hook_kwargs.items()
                }
                env[name](
                    context=context,
                    start=start,
                    end=end,
                    latest=latest,
                    **{**kwargs, **evaluated_hook_kwargs},
                )

    def _expression_renderer(self, expression: exp.Expression) -> ExpressionRenderer:
        """Builds an ExpressionRenderer for the given expression using this model's configuration."""
        return ExpressionRenderer(
            expression,
            self.dialect,
            self.macro_definitions,
            path=self._path,
            jinja_macro_registry=self.jinja_macros,
            python_env=self.python_env,
        )
 558
 559
class SqlModel(_Model):
    """The model definition which relies on a SQL query to fetch the data.

    Args:
        query: The main query representing the model.
    """

    query: t.Union[exp.Subqueryable, d.Jinja]
    source_type: Literal["sql"] = "sql"

    # Lazily-populated caches for inferred column types and the query renderer.
    _columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None
    __query_renderer: t.Optional[QueryRenderer] = None

    _query_validator = expression_validator

    def render_query(
        self,
        *,
        start: t.Optional[TimeLike] = None,
        end: t.Optional[TimeLike] = None,
        latest: t.Optional[TimeLike] = None,
        snapshots: t.Optional[t.Dict[str, Snapshot]] = None,
        expand: t.Iterable[str] = tuple(),
        is_dev: bool = False,
        engine_adapter: t.Optional[EngineAdapter] = None,
        **kwargs: t.Any,
    ) -> exp.Subqueryable:
        """Renders this model's SQL query via the query renderer.

        See `_Model.render_query` for argument descriptions.
        """
        return self._query_renderer.render(
            start=start,
            end=end,
            latest=latest,
            add_incremental_filter=True,
            snapshots=snapshots,
            expand=expand,
            is_dev=is_dev,
            engine_adapter=engine_adapter,
            **kwargs,
        )

    def render_definition(self, include_python: bool = True) -> t.List[exp.Expression]:
        """Returns the rendered definition with the model's query appended at the end."""
        result = super().render_definition(include_python=include_python)
        result.append(self.query)
        return result

    @property
    def is_sql(self) -> bool:
        """True — this model is backed by a SQL query."""
        return True

    @property
    def contains_star_query(self) -> bool:
        """Returns True if the model's query contains a star projection."""
        return self._query_renderer.contains_star_query

    def update_schema(self, schema: MappingSchema) -> None:
        """Propagates the new schema to the query renderer."""
        self._query_renderer.update_schema(schema)

    @property
    def columns_to_types(self) -> t.Dict[str, exp.DataType]:
        """Column types, preferring an explicit override and falling back to
        types inferred from the rendered query (cached after first use)."""
        if self.columns_to_types_ is not None:
            return self.columns_to_types_

        if self._columns_to_types is None:
            self._columns_to_types = {
                expression.alias_or_name: expression.type
                for expression in self._query_renderer.render().expressions
            }

        return self._columns_to_types

    @property
    def column_descriptions(self) -> t.Dict[str, str]:
        """Column descriptions, preferring an explicit override and falling back
        to comments attached to the query's projections (cached after first use)."""
        if self.column_descriptions_ is not None:
            return self.column_descriptions_

        if self._column_descriptions is None:
            self._column_descriptions = {
                select.alias: "\n".join(comment.strip() for comment in select.comments)
                for select in self.render_query().expressions
                if select.comments
            }
        return self._column_descriptions

    def validate_definition(self) -> None:
        """Validates the rendered query: it must be a SELECT with a non-empty
        projection list whose names are unique and inferrable.

        Raises:
            ConfigError
        """
        query = self._query_renderer.render()

        if not isinstance(query, exp.Subqueryable):
            raise_config_error("Missing SELECT query in the model definition", self._path)

        # For a UNION the outer projection list lives on the left-most query.
        projection_list = (
            query.expressions if not isinstance(query, exp.Union) else query.this.expressions
        )
        if not projection_list:
            raise_config_error("Query missing select statements", self._path)

        name_counts: t.Dict[str, int] = {}
        for expression in projection_list:
            alias = expression.alias_or_name
            if alias == "*":
                continue
            if not alias:
                raise_config_error(
                    f"Outer projection '{expression}' must have inferrable names or explicit aliases.",
                    self._path,
                )
            name_counts[alias] = name_counts.get(alias, 0) + 1

        for name, count in name_counts.items():
            if count > 1:
                raise_config_error(f"Found duplicate outer select name '{name}'", self._path)

        super().validate_definition()

    def is_breaking_change(self, previous: Model) -> t.Optional[bool]:
        """Classifies a change as non-breaking only when the query diff consists
        solely of kept nodes and inserted projections (no UDTFs); anything else
        is indeterminate (None)."""
        if not isinstance(previous, SqlModel):
            return None

        edits = ChangeDistiller(t=0.5).diff(previous.render_query(), self.render_query())
        inserted_expressions = {e.expression for e in edits if isinstance(e, Insert)}

        for edit in edits:
            if isinstance(edit, Insert):
                expr = edit.expression
                # A new UDTF or a non-projection insert whose parent wasn't
                # itself inserted can change row counts — indeterminate.
                if _is_udtf(expr) or (
                    not _is_projection(expr) and expr.parent not in inserted_expressions
                ):
                    return None
            elif not isinstance(edit, Keep):
                return None

        return False

    @property
    def _query_renderer(self) -> QueryRenderer:
        # Lazily construct and cache the renderer (the double underscore is
        # name-mangled to avoid clashes in subclasses).
        if self.__query_renderer is None:
            self.__query_renderer = QueryRenderer(
                self.query,
                self.dialect,
                self.macro_definitions,
                path=self._path,
                jinja_macro_registry=self.jinja_macros,
                python_env=self.python_env,
                time_column=self.time_column,
                time_converter=self.convert_to_time_column,
                only_latest=self.kind.only_latest,
            )
        return self.__query_renderer

    def __repr__(self) -> str:
        return f"Model<name: {self.name}, query: {str(self.query)[0:30]}>"
 709
 710class SeedModel(_Model):
 711    """The model definition which uses a pre-built static dataset to source the data from.
 712
 713    Args:
 714        seed: The content of a pre-built static dataset.
 715    """
 716
 717    kind: SeedKind
 718    seed: Seed
 719    source_type: Literal["seed"] = "seed"
 720
 721    def render(
 722        self,
 723        context: ExecutionContext,
 724        *,
 725        start: t.Optional[TimeLike] = None,
 726        end: t.Optional[TimeLike] = None,
 727        latest: t.Optional[TimeLike] = None,
 728        engine_adapter: t.Optional[EngineAdapter] = None,
 729        **kwargs: t.Any,
 730    ) -> t.Generator[QueryOrDF, None, None]:
 731        yield from self.seed.read(batch_size=self.kind.batch_size)
 732
 733    def text_diff(self, other: Model) -> str:
 734        if not isinstance(other, SeedModel):
 735            return super().text_diff(other)
 736
 737        meta_a = self.render_definition()[0]
 738        meta_b = other.render_definition()[0]
 739        return "\n".join(
 740            (
 741                d.text_diff(meta_a, meta_b, self.dialect),
 742                *unified_diff(
 743                    self.seed.content.split("\n"),
 744                    other.seed.content.split("\n"),
 745                ),
 746            )
 747        ).strip()
 748
 749    @property
 750    def columns_to_types(self) -> t.Dict[str, exp.DataType]:
 751        if self.columns_to_types_ is not None:
 752            return self.columns_to_types_
 753        return self.seed.columns_to_types
 754
 755    @property
 756    def is_seed(self) -> bool:
 757        return True
 758
 759    @property
 760    def seed_path(self) -> Path:
 761        seed_path = Path(self.kind.path)
 762        if not seed_path.is_absolute():
 763            return self._path.parent / seed_path
 764        return seed_path
 765
 766    def is_breaking_change(self, previous: Model) -> t.Optional[bool]:
 767        if not isinstance(previous, SeedModel):
 768            return None
 769
 770        new_df = pd.concat([df for df in self.seed.read()])
 771        old_df = pd.concat([df for df in previous.seed.read()])
 772
 773        new_columns = set(new_df.columns)
 774        old_columns = set(old_df.columns)
 775
 776        if not new_columns.issuperset(old_columns):
 777            return None
 778
 779        for col in old_columns:
 780            if new_df[col].dtype != old_df[col].dtype or new_df[col].shape != old_df[col].shape:
 781                return None
 782            elif is_numeric_dtype(new_df[col]):
 783                if not all(np.isclose(new_df[col], old_df[col])):
 784                    return None
 785            else:
 786                if not new_df[col].equals(old_df[col]):
 787                    return None
 788
 789        return False
 790
 791    def __repr__(self) -> str:
 792        return f"Model<name: {self.name}, seed: {self.kind.path}>"
 793
 794
 795class PythonModel(_Model):
 796    """The model definition which relies on a Python script to fetch the data.
 797
 798    Args:
 799        entrypoint: The name of a Python function which contains the data fetching / transformation logic.
 800    """
 801
 802    entrypoint: str
 803    source_type: Literal["python"] = "python"
 804
 805    def render(
 806        self,
 807        context: ExecutionContext,
 808        *,
 809        start: t.Optional[TimeLike] = None,
 810        end: t.Optional[TimeLike] = None,
 811        latest: t.Optional[TimeLike] = None,
 812        engine_adapter: t.Optional[EngineAdapter] = None,
 813        **kwargs: t.Any,
 814    ) -> t.Generator[DF, None, None]:
 815        env = prepare_env(self.python_env)
 816        start, end = make_inclusive(start or c.EPOCH, end or c.EPOCH)
 817        latest = to_datetime(latest or c.EPOCH)
 818        try:
 819            df_or_iter = env[self.entrypoint](
 820                context=context, start=start, end=end, latest=latest, **kwargs
 821            )
 822
 823            if not isinstance(df_or_iter, types.GeneratorType):
 824                df_or_iter = [df_or_iter]
 825
 826            for df in df_or_iter:
 827                if self.kind.is_incremental_by_time_range:
 828                    assert self.time_column
 829
 830                    if PySparkDataFrame is not None and isinstance(df, PySparkDataFrame):
 831                        import pyspark
 832
 833                        df = df.where(
 834                            pyspark.sql.functions.col(self.time_column.column).between(
 835                                pyspark.sql.functions.lit(
 836                                    self.convert_to_time_column(start).sql("spark")
 837                                ),
 838                                pyspark.sql.functions.lit(
 839                                    self.convert_to_time_column(end).sql("spark")
 840                                ),
 841                            )
 842                        )
 843                    else:
 844                        assert self.time_column.format, "Time column format is required."
 845                        df = filter_df_by_timelike(
 846                            df, self.time_column.column, self.time_column.format, start, end
 847                        )
 848                yield df
 849        except Exception as e:
 850            print_exception(e, self.python_env)
 851            raise SQLMeshError(f"Error executing Python model '{self.name}'")
 852
 853    def render_definition(self, include_python: bool = True) -> t.List[exp.Expression]:
 854        # Ignore the provided value for the include_python flag, since the Pyhon model's
 855        # definition without Python code is meaningless.
 856        return super().render_definition(include_python=True)
 857
 858    @property
 859    def is_python(self) -> bool:
 860        return True
 861
 862    def __repr__(self) -> str:
 863        return f"Model<name: {self.name}, entrypoint: {self.entrypoint}>"
 864
 865
# Discriminated union of all concrete model flavors; pydantic resolves the
# concrete class via the `source_type` field.
Model = Annotated[t.Union[SqlModel, SeedModel, PythonModel], Field(discriminator="source_type")]
 867
 868
 869def load_model(
 870    expressions: t.List[exp.Expression],
 871    *,
 872    defaults: t.Optional[t.Dict[str, t.Any]] = None,
 873    path: Path = Path(),
 874    module_path: Path = Path(),
 875    time_column_format: str = c.DEFAULT_TIME_COLUMN_FORMAT,
 876    macros: t.Optional[MacroRegistry] = None,
 877    hooks: t.Optional[HookRegistry] = None,
 878    python_env: t.Optional[t.Dict[str, Executable]] = None,
 879    dialect: t.Optional[str] = None,
 880    **kwargs: t.Any,
 881) -> Model:
 882    """Load a model from a parsed SQLMesh model file.
 883
 884    Args:
 885        expressions: Model, *Statements, Query.
 886        defaults: Definition default values.
 887        path: An optional path to the model definition file.
 888        module_path: The python module path to serialize macros for.
 889        time_column_format: The default time column format to use if no model time column is configured.
 890        macros: The custom registry of macros. If not provided the default registry will be used.
 891        hooks: The custom registry of hooks. If not provided the default registry will be used.
 892        python_env: The custom Python environment for hooks/macros. If not provided the environment will be constructed
 893            from the macro registry.
 894        dialect: The default dialect if no model dialect is configured.
 895            The format must adhere to Python's strftime codes.
 896        kwargs: Additional kwargs to pass to the loader.
 897    """
 898    if not expressions:
 899        raise_config_error("Incomplete model definition, missing MODEL statement", path)
 900
 901    dialect = dialect or ""
 902    meta = expressions[0]
 903    query = expressions[-1] if len(expressions) > 1 else None
 904    statements = expressions[1:-1]
 905
 906    if not isinstance(meta, d.Model):
 907        raise_config_error(
 908            "MODEL statement is required as the first statement in the definition",
 909            path,
 910        )
 911
 912    meta_fields: t.Dict[str, t.Any] = {
 913        "dialect": dialect,
 914        "description": "\n".join(comment.strip() for comment in meta.comments)
 915        if meta.comments
 916        else None,
 917        **{prop.name.lower(): prop.args.get("value") for prop in meta.expressions},
 918        **kwargs,
 919    }
 920
 921    name = meta_fields.pop("name", "")
 922    if not name:
 923        raise_config_error("Model must have a name", path)
 924
 925    if isinstance(query, d.MacroVar):
 926        if python_env is None:
 927            raise_config_error("The python environment must be provided for Python models", path)
 928            raise
 929
 930        return create_python_model(
 931            name,
 932            query.name,
 933            python_env,
 934            defaults=defaults,
 935            path=path,
 936            time_column_format=time_column_format,
 937            **meta_fields,
 938        )
 939    elif query is not None:
 940        return create_sql_model(
 941            name,
 942            query,
 943            statements=statements,
 944            defaults=defaults,
 945            path=path,
 946            module_path=module_path,
 947            time_column_format=time_column_format,
 948            macros=macros,
 949            hooks=hooks,
 950            python_env=python_env,
 951            **meta_fields,
 952        )
 953    else:
 954        try:
 955            seed_properties = {
 956                p.name.lower(): p.args.get("value") for p in meta_fields.pop("kind").expressions
 957            }
 958            return create_seed_model(
 959                name,
 960                SeedKind(**seed_properties),
 961                defaults=defaults,
 962                path=path,
 963                **meta_fields,
 964            )
 965        except Exception:
 966            raise_config_error(
 967                "The model definition must either have a SELECT query or a valid Seed kind",
 968                path,
 969            )
 970            raise
 971
 972
 973def create_sql_model(
 974    name: str,
 975    query: exp.Expression,
 976    *,
 977    statements: t.Optional[t.List[exp.Expression]] = None,
 978    defaults: t.Optional[t.Dict[str, t.Any]] = None,
 979    path: Path = Path(),
 980    module_path: Path = Path(),
 981    time_column_format: str = c.DEFAULT_TIME_COLUMN_FORMAT,
 982    macros: t.Optional[MacroRegistry] = None,
 983    hooks: t.Optional[HookRegistry] = None,
 984    python_env: t.Optional[t.Dict[str, Executable]] = None,
 985    dialect: t.Optional[str] = None,
 986    **kwargs: t.Any,
 987) -> Model:
 988    """Creates a SQL model.
 989
 990    Args:
 991        name: The name of the model, which is of the form [catalog].[db].table.
 992            The catalog and db are optional.
 993        query: The model's logic in a form of a SELECT query.
 994        statements: The list of all SQL statements that are not a query or a model definition.
 995        defaults: Definition default values.
 996        path: An optional path to the model definition file.
 997        module_path: The python module path to serialize macros for.
 998        time_column_format: The default time column format to use if no model time column is configured.
 999        macros: The custom registry of macros. If not provided the default registry will be used.
1000        hooks: The custom registry of hooks. If not provided the default registry will be used.
1001        python_env: The custom Python environment for hooks/macros. If not provided the environment will be constructed
1002            from the macro registry.
1003        dialect: The default dialect if no model dialect is configured.
1004            The format must adhere to Python's strftime codes.
1005    """
1006    if not isinstance(query, (exp.Subqueryable, d.Jinja)):
1007        raise_config_error(
1008            "A query is required and must be a SELECT or UNION statement.",
1009            path,
1010        )
1011
1012    if not python_env:
1013        python_env = _python_env(
1014            query,
1015            _extract_hooks(kwargs),
1016            module_path,
1017            macros or macro.get_registry(),
1018            hooks or hook.get_registry(),
1019        )
1020
1021    return _create_model(
1022        SqlModel,
1023        name,
1024        defaults=defaults,
1025        path=path,
1026        time_column_format=time_column_format,
1027        python_env=python_env,
1028        dialect=dialect,
1029        expressions=statements or [],
1030        query=query,
1031        **kwargs,
1032    )
1033
1034
def create_seed_model(
    name: str,
    seed_kind: SeedKind,
    *,
    defaults: t.Optional[t.Dict[str, t.Any]] = None,
    path: Path = Path(),
    **kwargs: t.Any,
) -> Model:
    """Creates a Seed model.

    Args:
        name: The name of the model, which is of the form [catalog].[db].table.
            The catalog and db are optional.
        seed_kind: The information about the location of a seed and other related configuration.
        defaults: Definition default values.
        path: An optional path to the model definition file.
        kwargs: Additional fields for the model's meta definition.
    """
    seed_path = Path(seed_kind.path)
    if not seed_path.is_absolute():
        # Resolve the seed file relative to the model definition file (or the
        # directory itself when `path` is a directory).
        seed_path = path / seed_path if path.is_dir() else path.parent / seed_path
    seed = create_seed(seed_path)
    return _create_model(
        SeedModel,
        name,
        defaults=defaults,
        path=path,
        seed=seed,
        kind=seed_kind,
        **kwargs,
    )
1065
1066
def create_python_model(
    name: str,
    entrypoint: str,
    python_env: t.Dict[str, Executable],
    *,
    defaults: t.Optional[t.Dict[str, t.Any]] = None,
    path: Path = Path(),
    time_column_format: str = c.DEFAULT_TIME_COLUMN_FORMAT,
    depends_on: t.Optional[t.Set[str]] = None,
    **kwargs: t.Any,
) -> Model:
    """Creates a Python model.

    Args:
        name: The name of the model, which is of the form [catalog].[db].table.
            The catalog and db are optional.
        entrypoint: The name of a Python function which contains the data fetching / transformation logic.
        python_env: The Python environment of all objects referenced by the model implementation.
        defaults: Definition default values.
        path: An optional path to the model definition file.
        time_column_format: The default time column format to use if no model time column is configured.
        depends_on: The custom set of model's upstream dependencies.
    """
    # Find dependencies for python models by parsing code if they are not explicitly
    # defined; an explicitly provided set is preserved as-is.
    # (Bug fix: the previous version discarded an explicitly provided `depends_on`
    # by falling back to None instead of keeping the given value.)
    depends_on = (
        _parse_depends_on(entrypoint, python_env)
        if depends_on is None and python_env is not None
        else depends_on
    )
    return _create_model(
        PythonModel,
        name,
        defaults=defaults,
        path=path,
        time_column_format=time_column_format,
        depends_on=depends_on,
        entrypoint=entrypoint,
        python_env=python_env,
        **kwargs,
    )
1107
1108
def _create_model(
    klass: t.Type[_Model],
    name: str,
    *,
    defaults: t.Optional[t.Dict[str, t.Any]] = None,
    path: Path = Path(),
    time_column_format: str = c.DEFAULT_TIME_COLUMN_FORMAT,
    depends_on: t.Optional[t.Set[str]] = None,
    dialect: t.Optional[str] = None,
    expressions: t.Optional[t.List[exp.Expression]] = None,
    **kwargs: t.Any,
) -> Model:
    """Instantiates, configures and validates a model of the given class.

    Field values are layered so that explicit kwargs win over `dialect` /
    `depends_on`, which in turn win over `defaults`. The model is then bound
    to its definition path and time format and its definition is validated.

    Raises:
        ConfigError: When the provided fields are invalid or instantiation fails.
    """
    _validate_model_fields(klass, {"name", *kwargs}, path)

    # Merge defaults first so explicitly provided values take precedence.
    field_values: t.Dict[str, t.Any] = dict(defaults or {})
    field_values["dialect"] = dialect or ""
    field_values["depends_on"] = depends_on
    field_values.update(kwargs)

    try:
        model = klass(name=name, expressions=expressions or [], **field_values)
    except Exception as ex:
        raise_config_error(str(ex), location=path)
        raise

    model._path = path
    model.set_time_format(time_column_format)
    model.validate_definition()

    return t.cast(Model, model)
1145
1146
1147def _validate_model_fields(klass: t.Type[_Model], provided_fields: t.Set[str], path: Path) -> None:
1148    missing_required_fields = klass.missing_required_fields(provided_fields)
1149    if missing_required_fields:
1150        raise_config_error(
1151            f"Missing required fields {missing_required_fields} in the model definition",
1152            path,
1153        )
1154
1155    extra_fields = klass.extra_fields(provided_fields)
1156    if extra_fields:
1157        raise_config_error(f"Invalid extra fields {extra_fields} in the model definition", path)
1158
1159
def _find_tables(query: exp.Expression) -> t.Set[str]:
    """Find all tables referenced in a query.

    Args:
        query: The expression to find tables for.

    Returns:
        A Set of all the table names.
    """
    tables = set()
    for scope in traverse_scope(query):
        for table in scope.tables:
            # Only concrete identifiers count; CTE references are excluded.
            if not isinstance(table.this, exp.Identifier):
                continue
            name = exp.table_name(table)
            if name not in scope.cte_sources:
                tables.add(name)
    return tables
1175
1176
def _python_env(
    query: exp.Expression,
    hook_calls: t.List[HookCall],
    module_path: Path,
    macros: MacroRegistry,
    hooks: HookRegistry,
) -> t.Dict[str, Executable]:
    """Builds the serialized Python environment of macros / hooks referenced by a model.

    Args:
        query: The model's query, scanned for macro references.
        hook_calls: The model's pre/post hook calls.
        module_path: The python module path to serialize macros for.
        macros: The macro registry to resolve references against.
        hooks: The hook registry to resolve references against.

    Returns:
        A mapping of names to serialized executables.
    """
    python_env: t.Dict[str, Executable] = {}

    used_macros = {}

    def _capture_expression_macros(expression: exp.Expression) -> None:
        # Jinja expressions carry referenced names directly, while SQL
        # expressions are scanned for macro function calls.
        if isinstance(expression, d.Jinja):
            for var in expression.expressions:
                if var in macros:
                    used_macros[var] = macros[var]
        else:
            for macro_func in expression.find_all(d.MacroFunc):
                if macro_func.__class__ is d.MacroFunc:
                    name = macro_func.this.name.lower()
                    used_macros[name] = macros[name]

    _capture_expression_macros(query)

    # Loop variables are named so they don't shadow the module-level
    # `hook` and `macro` imports.
    for hook_call in hook_calls:
        if isinstance(hook_call, exp.Expression):
            _capture_expression_macros(hook_call)
        else:
            hook_name = hook_call[0]
            build_env(
                hooks[hook_name].func,
                env=python_env,
                name=hook_name,
                path=module_path,
            )

    for macro_name, used_macro in used_macros.items():
        # Built-in sqlmesh macros don't need to be serialized.
        if not used_macro.func.__module__.startswith("sqlmesh."):
            build_env(
                used_macro.func,
                env=python_env,
                name=macro_name,
                path=module_path,
            )

    return serialize_env(python_env, path=module_path)
1223
1224
def _parse_depends_on(model_func: str, python_env: t.Dict[str, Executable]) -> t.Set[str]:
    """Parses the source of a model function and finds upstream dependencies based on calls to context.

    Args:
        model_func: The name of the entrypoint function within `python_env`.
        python_env: The serialized Python environment containing the function.

    Returns:
        The set of model names passed to `context.table(...)` calls.

    Raises:
        ConfigError: If a `context.table` argument can't be resolved at parse time.
    """
    env = prepare_env(python_env)
    depends_on = set()
    executable = python_env[model_func]

    for node in ast.walk(ast.parse(executable.payload)):
        if not isinstance(node, ast.Call):
            continue

        func = node.func

        if (
            isinstance(func, ast.Attribute)
            and isinstance(func.value, ast.Name)
            and func.value.id == "context"
            and func.attr == "table"
        ):
            # The table may be passed positionally or via the `model_name` keyword.
            if node.args:
                table: t.Optional[ast.expr] = node.args[0]
            else:
                table = next(
                    (keyword.value for keyword in node.keywords if keyword.arg == "model_name"),
                    None,
                )

            # Pre-assign so the error message below never references an unbound
            # name when `to_source` itself fails (e.g. when `table` is None).
            expression = ""
            try:
                expression = to_source(table)
                # NOTE: eval is applied to code captured in the model's own
                # python_env, not to arbitrary external input.
                depends_on.add(eval(expression, env))
            except Exception as ex:
                raise ConfigError(
                    f"Error resolving dependencies for '{executable.path}'. References to context must be resolvable at parse time.\n\n{expression}"
                ) from ex

    return depends_on
1260
1261
def _extract_hooks(kwargs: t.Dict[str, t.Any]) -> t.List[HookCall]:
    """Collects the normalized pre- and post-hook calls from model kwargs."""
    pre = ModelMeta._value_or_tuple_with_args_validator(kwargs.get("pre")) or []
    post = ModelMeta._value_or_tuple_with_args_validator(kwargs.get("post")) or []
    return [*pre, *post]
1266
1267
def _list_of_calls_to_exp(value: t.List[t.Tuple[str, t.Dict[str, t.Any]]]) -> exp.Expression:
    """Converts a list of (name, kwargs) call tuples into a sqlglot Tuple expression."""
    calls = []
    for call_name, call_kwargs in value:
        args = [
            exp.EQ(this=exp.convert(key), expression=exp.convert(arg))
            for key, arg in call_kwargs.items()
        ]
        calls.append(exp.Anonymous(this=call_name, expressions=args))
    return exp.Tuple(expressions=calls)
1281
1282
def _is_projection(expr: exp.Expression) -> bool:
    """Returns True if the expression is one of its parent SELECT's projections."""
    parent = expr.parent
    if not isinstance(parent, exp.Select):
        return False
    return expr in parent.expressions
1286
1287
def _is_udtf(expr: exp.Expression) -> bool:
    """Returns True if the expression is a table-generating (row-multiplying) function."""
    if isinstance(expr, (exp.Explode, exp.Posexplode, exp.Unnest)):
        return True
    return isinstance(expr, exp.Anonymous) and expr.this.upper() in (
        "EXPLODE_OUTER",
        "POSEXPLODE_OUTER",
        "UNNEST",
    )
1293
1294
# Maps model meta field names to callables that convert the stored Python value
# back into a sqlglot expression when rendering a MODEL definition.
# NOTE(review): trailing-underscore keys (e.g. `depends_on_`) presumably mirror
# the corresponding ModelMeta field names/aliases — confirm against ModelMeta.
META_FIELD_CONVERTER: t.Dict[str, t.Callable] = {
    "name": lambda value: exp.to_table(value),
    "start": lambda value: exp.Literal.string(value),
    "cron": lambda value: exp.Literal.string(value),
    "batch_size": lambda value: exp.Literal.number(value),
    "partitioned_by_": lambda value: (
        exp.to_identifier(value[0]) if len(value) == 1 else exp.Tuple(expressions=value)
    ),
    "depends_on_": lambda value: exp.Tuple(expressions=value),
    "pre": _list_of_calls_to_exp,
    "post": _list_of_calls_to_exp,
    "audits": _list_of_calls_to_exp,
    "columns_to_types_": lambda value: exp.Schema(
        expressions=[exp.ColumnDef(this=exp.to_column(c), kind=t) for c, t in value.items()]
    ),
}
class SqlModel(_Model):
    """The model definition which relies on a SQL query to fetch the data.

    Args:
        query: The main query representing the model.
    """

    query: t.Union[exp.Subqueryable, d.Jinja]
    source_type: Literal["sql"] = "sql"

    # Lazily computed cache of column names to types inferred from the rendered query.
    _columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None
    # Lazily constructed query renderer (see the `_query_renderer` property).
    __query_renderer: t.Optional[QueryRenderer] = None

    _query_validator = expression_validator

    def render_query(
        self,
        *,
        start: t.Optional[TimeLike] = None,
        end: t.Optional[TimeLike] = None,
        latest: t.Optional[TimeLike] = None,
        snapshots: t.Optional[t.Dict[str, Snapshot]] = None,
        expand: t.Iterable[str] = tuple(),
        is_dev: bool = False,
        engine_adapter: t.Optional[EngineAdapter] = None,
        **kwargs: t.Any,
    ) -> exp.Subqueryable:
        """Renders the model's query, expanding macros and optionally expanding referenced models.

        Args:
            start: The start datetime to render.
            end: The end datetime to render.
            latest: The latest datetime to use for non-incremental queries.
            snapshots: All upstream snapshots (by model name) to use for expansion and mapping
                of physical locations.
            expand: Expand referenced models as subqueries.
            is_dev: Indicates whether the rendering happens in development mode.
            engine_adapter: The engine adapter to use during rendering, if any.
            kwargs: Additional kwargs to pass to the renderer.

        Returns:
            The rendered query expression.
        """
        return self._query_renderer.render(
            start=start,
            end=end,
            latest=latest,
            add_incremental_filter=True,
            snapshots=snapshots,
            expand=expand,
            is_dev=is_dev,
            engine_adapter=engine_adapter,
            **kwargs,
        )

    def render_definition(self, include_python: bool = True) -> t.List[exp.Expression]:
        """Returns the original expressions comprising the model definition, with the query appended."""
        result = super().render_definition(include_python=include_python)
        result.append(self.query)
        return result

    @property
    def is_sql(self) -> bool:
        # Marker distinguishing SQL models from the other model flavors.
        return True

    @property
    def contains_star_query(self) -> bool:
        # True if the model's query contains a star (SELECT *) projection.
        return self._query_renderer.contains_star_query

    def update_schema(self, schema: MappingSchema) -> None:
        """Updates the schema used by the renderer for this model's query."""
        self._query_renderer.update_schema(schema)

    @property
    def columns_to_types(self) -> t.Dict[str, exp.DataType]:
        """Returns the mapping of column names to types of this model.

        Falls back to types inferred from the rendered query when not explicitly
        configured; the inferred mapping is cached after the first computation.
        """
        if self.columns_to_types_ is not None:
            return self.columns_to_types_

        if self._columns_to_types is None:
            self._columns_to_types = {
                expression.alias_or_name: expression.type
                for expression in self._query_renderer.render().expressions
            }

        return self._columns_to_types

    @property
    def column_descriptions(self) -> t.Dict[str, str]:
        """A dictionary of column names to annotation comments, derived from the rendered query."""
        if self.column_descriptions_ is not None:
            return self.column_descriptions_

        if self._column_descriptions is None:
            # NOTE(review): `_column_descriptions` is not declared in this class —
            # presumably it defaults to None on the base class; confirm.
            self._column_descriptions = {
                select.alias: "\n".join(comment.strip() for comment in select.comments)
                for select in self.render_query().expressions
                if select.comments
            }
        return self._column_descriptions

    def validate_definition(self) -> None:
        """Validates the model's definition.

        Outer projections must have inferrable names or explicit aliases, and
        no two projections may share the same name.

        Raises:
            ConfigError: If the query or its projections are invalid.
        """
        query = self._query_renderer.render()

        if not isinstance(query, exp.Subqueryable):
            raise_config_error("Missing SELECT query in the model definition", self._path)

        # For UNION queries, validate the projections of the first SELECT.
        projection_list = (
            query.expressions if not isinstance(query, exp.Union) else query.this.expressions
        )
        if not projection_list:
            raise_config_error("Query missing select statements", self._path)

        name_counts: t.Dict[str, int] = {}
        for expression in projection_list:
            alias = expression.alias_or_name
            if alias == "*":
                continue
            if not alias:
                raise_config_error(
                    f"Outer projection '{expression}' must have inferrable names or explicit aliases.",
                    self._path,
                )
            name_counts[alias] = name_counts.get(alias, 0) + 1

        for name, count in name_counts.items():
            if count > 1:
                raise_config_error(f"Found duplicate outer select name '{name}'", self._path)

        super().validate_definition()

    def is_breaking_change(self, previous: Model) -> t.Optional[bool]:
        """Determines whether this model is a breaking change relative to `previous`.

        Returns:
            None when the nature of the change can't be determined; False when
            only additive (non-breaking) edits were detected.
        """
        if not isinstance(previous, SqlModel):
            return None

        edits = ChangeDistiller(t=0.5).diff(previous.render_query(), self.render_query())
        inserted_expressions = {e.expression for e in edits if isinstance(e, Insert)}

        for edit in edits:
            if isinstance(edit, Insert):
                expr = edit.expression
                # New UDTF calls, or insertions outside the projection list whose
                # parent wasn't itself inserted, are treated as undeterminable.
                if _is_udtf(expr) or (
                    not _is_projection(expr) and expr.parent not in inserted_expressions
                ):
                    return None
            elif not isinstance(edit, Keep):
                return None

        return False

    @property
    def _query_renderer(self) -> QueryRenderer:
        # The renderer is created on first access and then reused.
        if self.__query_renderer is None:
            self.__query_renderer = QueryRenderer(
                self.query,
                self.dialect,
                self.macro_definitions,
                path=self._path,
                jinja_macro_registry=self.jinja_macros,
                python_env=self.python_env,
                time_column=self.time_column,
                time_converter=self.convert_to_time_column,
                only_latest=self.kind.only_latest,
            )
        return self.__query_renderer

    def __repr__(self) -> str:
        return f"Model<name: {self.name}, query: {str(self.query)[0:30]}>"

The model definition which relies on a SQL query to fetch the data.

Arguments:
  • query: The main query representing the model.
def render_query(self, *, start: TimeLike | None = None, end: TimeLike | None = None, latest: TimeLike | None = None, snapshots: dict[str, Snapshot] | None = None, expand: Iterable[str] = (), is_dev: bool = False, engine_adapter: EngineAdapter | None = None, **kwargs: Any) -> Subqueryable:
576    def render_query(
577        self,
578        *,
579        start: t.Optional[TimeLike] = None,
580        end: t.Optional[TimeLike] = None,
581        latest: t.Optional[TimeLike] = None,
582        snapshots: t.Optional[t.Dict[str, Snapshot]] = None,
583        expand: t.Iterable[str] = tuple(),
584        is_dev: bool = False,
585        engine_adapter: t.Optional[EngineAdapter] = None,
586        **kwargs: t.Any,
587    ) -> exp.Subqueryable:
588        return self._query_renderer.render(
589            start=start,
590            end=end,
591            latest=latest,
592            add_incremental_filter=True,
593            snapshots=snapshots,
594            expand=expand,
595            is_dev=is_dev,
596            engine_adapter=engine_adapter,
597            **kwargs,
598        )

Renders a model's query, expanding macros with provided kwargs, and optionally expanding referenced models.

Arguments:
  • start: The start datetime to render. Defaults to epoch start.
  • end: The end datetime to render. Defaults to epoch start.
  • latest: The latest datetime to use for non-incremental queries. Defaults to epoch start.
  • snapshots: All upstream snapshots (by model name) to use for expansion and mapping of physical locations.
  • expand: Expand referenced models as subqueries. This is used to bypass backfills when running queries that depend on materialized tables. Model definitions are inlined and can thus be run end to end on the fly.
  • audit_name: The name of audit if the query to render is for an audit.
  • is_dev: Indicates whether the rendering happens in the development mode and temporary tables / table clones should be used where applicable.
  • kwargs: Additional kwargs to pass to the renderer.
Returns:

The rendered expression.

def render_definition( self, include_python: bool = True) -> List[sqlglot.expressions.Expression]:
600    def render_definition(self, include_python: bool = True) -> t.List[exp.Expression]:
601        result = super().render_definition(include_python=include_python)
602        result.append(self.query)
603        return result

Returns the original list of sql expressions comprising the model definition.

Arguments:
  • include_python: Whether or not to include Python code in the rendered definition.
contains_star_query: bool

Returns True if the model's query contains a star projection.

def update_schema(self, schema: sqlglot.schema.MappingSchema) -> None:
613    def update_schema(self, schema: MappingSchema) -> None:
614        self._query_renderer.update_schema(schema)

Updates the schema associated with this model.

Arguments:
  • schema: The new schema.
columns_to_types: Dict[str, sqlglot.expressions.DataType]

Returns the mapping of column names to types of this model.

column_descriptions: Dict[str, str]

A dictionary of column names to annotation comments.

def validate_definition(self) -> None:
642    def validate_definition(self) -> None:
643        query = self._query_renderer.render()
644
645        if not isinstance(query, exp.Subqueryable):
646            raise_config_error("Missing SELECT query in the model definition", self._path)
647
648        projection_list = (
649            query.expressions if not isinstance(query, exp.Union) else query.this.expressions
650        )
651        if not projection_list:
652            raise_config_error("Query missing select statements", self._path)
653
654        name_counts: t.Dict[str, int] = {}
655        for expression in projection_list:
656            alias = expression.alias_or_name
657            if alias == "*":
658                continue
659            if not alias:
660                raise_config_error(
661                    f"Outer projection '{expression}' must have inferrable names or explicit aliases.",
662                    self._path,
663                )
664            name_counts[alias] = name_counts.get(alias, 0) + 1
665
666        for name, count in name_counts.items():
667            if count > 1:
668                raise_config_error(f"Found duplicate outer select name '{name}'", self._path)
669
670        super().validate_definition()

Validates the model's definition.

Models are not allowed to have duplicate column names, non-explicitly cast columns, or non-inferrable column names.

Raises:
  • ConfigError
def is_breaking_change(self, previous: Model) -> t.Optional[bool]:
    """Determines whether this model is a breaking change relative to ``previous``.

    Args:
        previous: The previous model to compare against.

    Returns:
        False when the diff consists only of additive, non-breaking insertions,
        None when the nature of the change can't be determined.
    """
    if not isinstance(previous, SqlModel):
        return None

    diff_edits = ChangeDistiller(t=0.5).diff(previous.render_query(), self.render_query())
    inserted = {e.expression for e in diff_edits if isinstance(e, Insert)}

    for diff_edit in diff_edits:
        if isinstance(diff_edit, Keep):
            continue
        if not isinstance(diff_edit, Insert):
            # Removals, moves and updates can't be proven non-breaking.
            return None
        node = diff_edit.expression
        # New UDTFs can change row counts; a non-projection insert whose parent
        # wasn't itself inserted modifies an existing expression.
        if _is_udtf(node) or (not _is_projection(node) and node.parent not in inserted):
            return None

    return False

Determines whether this model is a breaking change in relation to the previous model.

Arguments:
  • previous: The previous model to compare against.
Returns:

True if this model instance represents a breaking change, False if it's a non-breaking change and None if the nature of the change can't be determined.

class SeedModel(_Model):
    """The model definition which uses a pre-built static dataset to source the data from.

    Args:
        seed: The content of a pre-built static dataset.
    """

    kind: SeedKind
    seed: Seed
    source_type: Literal["seed"] = "seed"

    def render(
        self,
        context: ExecutionContext,
        *,
        start: t.Optional[TimeLike] = None,
        end: t.Optional[TimeLike] = None,
        latest: t.Optional[TimeLike] = None,
        engine_adapter: t.Optional[EngineAdapter] = None,
        **kwargs: t.Any,
    ) -> t.Generator[QueryOrDF, None, None]:
        # Batches come straight from the static dataset; dates are ignored.
        yield from self.seed.read(batch_size=self.kind.batch_size)

    def text_diff(self, other: Model) -> str:
        """Produce a text diff of both the model metadata and the seed content."""
        if not isinstance(other, SeedModel):
            return super().text_diff(other)

        this_meta = self.render_definition()[0]
        other_meta = other.render_definition()[0]
        meta_diff = d.text_diff(this_meta, other_meta, self.dialect)
        content_diff = unified_diff(
            self.seed.content.split("\n"),
            other.seed.content.split("\n"),
        )
        return "\n".join((meta_diff, *content_diff)).strip()

    @property
    def columns_to_types(self) -> t.Dict[str, exp.DataType]:
        # Explicitly configured columns take precedence over the seed's inferred schema.
        if self.columns_to_types_ is None:
            return self.seed.columns_to_types
        return self.columns_to_types_

    @property
    def is_seed(self) -> bool:
        return True

    @property
    def seed_path(self) -> Path:
        # Relative seed paths are resolved against the model definition's directory.
        configured = Path(self.kind.path)
        if configured.is_absolute():
            return configured
        return self._path.parent / configured

    def is_breaking_change(self, previous: Model) -> t.Optional[bool]:
        """Non-breaking iff the new seed is a column-superset of the old one
        and every shared column is unchanged (numeric columns compared with
        a float tolerance)."""
        if not isinstance(previous, SeedModel):
            return None

        new_df = pd.concat(list(self.seed.read()))
        old_df = pd.concat(list(previous.seed.read()))

        if not set(new_df.columns).issuperset(old_df.columns):
            return None

        for column in set(old_df.columns):
            new_col = new_df[column]
            old_col = old_df[column]
            if new_col.dtype != old_col.dtype or new_col.shape != old_col.shape:
                return None
            if is_numeric_dtype(new_col):
                if not np.isclose(new_col, old_col).all():
                    return None
            elif not new_col.equals(old_col):
                return None

        return False

    def __repr__(self) -> str:
        return f"Model<name: {self.name}, seed: {self.kind.path}>"

The model definition which uses a pre-built static dataset to source the data from.

Arguments:
  • seed: The content of a pre-built static dataset.
def render( self, context: sqlmesh.core.context.ExecutionContext, *, start: Union[datetime.date, datetime.datetime, str, int, float, NoneType] = None, end: Union[datetime.date, datetime.datetime, str, int, float, NoneType] = None, latest: Union[datetime.date, datetime.datetime, str, int, float, NoneType] = None, engine_adapter: Optional[sqlmesh.core.engine_adapter.base.EngineAdapter] = None, **kwargs: Any) -> Generator[<MagicMock id='6015216672'>, NoneType, NoneType]:
722    def render(
723        self,
724        context: ExecutionContext,
725        *,
726        start: t.Optional[TimeLike] = None,
727        end: t.Optional[TimeLike] = None,
728        latest: t.Optional[TimeLike] = None,
729        engine_adapter: t.Optional[EngineAdapter] = None,
730        **kwargs: t.Any,
731    ) -> t.Generator[QueryOrDF, None, None]:
732        yield from self.seed.read(batch_size=self.kind.batch_size)

Renders the content of this model in a form of either a SELECT query, executing which the data for this model can be fetched, or a dataframe object which contains the data itself.

The type of the returned object (query or dataframe) depends on whether the model was sourced from a SQL query, a Python script or a pre-built dataset (seed).

Arguments:
  • context: The execution context used for fetching data.
  • start: The start date/time of the run.
  • end: The end date/time of the run.
  • latest: The latest date/time to use for the run.
Returns:

A generator which yields either a query object or one of the supported dataframe objects.

def text_diff( self, other: Annotated[Union[sqlmesh.core.model.definition.SqlModel, sqlmesh.core.model.definition.SeedModel, sqlmesh.core.model.definition.PythonModel], FieldInfo(default=PydanticUndefined, discriminator='source_type', extra={})]) -> str:
734    def text_diff(self, other: Model) -> str:
735        if not isinstance(other, SeedModel):
736            return super().text_diff(other)
737
738        meta_a = self.render_definition()[0]
739        meta_b = other.render_definition()[0]
740        return "\n".join(
741            (
742                d.text_diff(meta_a, meta_b, self.dialect),
743                *unified_diff(
744                    self.seed.content.split("\n"),
745                    other.seed.content.split("\n"),
746                ),
747            )
748        ).strip()

Produce a text diff against another model.

Arguments:
  • other: The model to diff against.
Returns:

A unified text diff showing additions and deletions.

columns_to_types: Dict[str, sqlglot.expressions.DataType]

Returns the mapping of column names to types of this model.

def is_breaking_change( self, previous: Annotated[Union[sqlmesh.core.model.definition.SqlModel, sqlmesh.core.model.definition.SeedModel, sqlmesh.core.model.definition.PythonModel], FieldInfo(default=PydanticUndefined, discriminator='source_type', extra={})]) -> Optional[bool]:
767    def is_breaking_change(self, previous: Model) -> t.Optional[bool]:
768        if not isinstance(previous, SeedModel):
769            return None
770
771        new_df = pd.concat([df for df in self.seed.read()])
772        old_df = pd.concat([df for df in previous.seed.read()])
773
774        new_columns = set(new_df.columns)
775        old_columns = set(old_df.columns)
776
777        if not new_columns.issuperset(old_columns):
778            return None
779
780        for col in old_columns:
781            if new_df[col].dtype != old_df[col].dtype or new_df[col].shape != old_df[col].shape:
782                return None
783            elif is_numeric_dtype(new_df[col]):
784                if not all(np.isclose(new_df[col], old_df[col])):
785                    return None
786            else:
787                if not new_df[col].equals(old_df[col]):
788                    return None
789
790        return False

Determines whether this model is a breaking change in relation to the previous model.

Arguments:
  • previous: The previous model to compare against.
Returns:

True if this model instance represents a breaking change, False if it's a non-breaking change and None if the nature of the change can't be determined.

class PythonModel(_Model):
    """The model definition which relies on a Python script to fetch the data.

    Args:
        entrypoint: The name of a Python function which contains the data fetching / transformation logic.
    """

    entrypoint: str
    source_type: Literal["python"] = "python"

    def render(
        self,
        context: ExecutionContext,
        *,
        start: t.Optional[TimeLike] = None,
        end: t.Optional[TimeLike] = None,
        latest: t.Optional[TimeLike] = None,
        engine_adapter: t.Optional[EngineAdapter] = None,
        **kwargs: t.Any,
    ) -> t.Generator[DF, None, None]:
        """Executes the model's entrypoint and yields the resulting dataframe(s).

        For incremental-by-time-range models each yielded dataframe is filtered
        down to the inclusive [start, end] interval on the model's time column.

        Raises:
            SQLMeshError: If the entrypoint raises; the original exception is
                printed against the model's Python environment and chained.
        """
        env = prepare_env(self.python_env)
        start, end = make_inclusive(start or c.EPOCH, end or c.EPOCH)
        latest = to_datetime(latest or c.EPOCH)
        try:
            df_or_iter = env[self.entrypoint](
                context=context, start=start, end=end, latest=latest, **kwargs
            )

            # The entrypoint may return a single dataframe or a generator of them.
            if not isinstance(df_or_iter, types.GeneratorType):
                df_or_iter = [df_or_iter]

            for df in df_or_iter:
                if self.kind.is_incremental_by_time_range:
                    assert self.time_column

                    if PySparkDataFrame is not None and isinstance(df, PySparkDataFrame):
                        import pyspark

                        df = df.where(
                            pyspark.sql.functions.col(self.time_column.column).between(
                                pyspark.sql.functions.lit(
                                    self.convert_to_time_column(start).sql("spark")
                                ),
                                pyspark.sql.functions.lit(
                                    self.convert_to_time_column(end).sql("spark")
                                ),
                            )
                        )
                    else:
                        assert self.time_column.format, "Time column format is required."
                        df = filter_df_by_timelike(
                            df, self.time_column.column, self.time_column.format, start, end
                        )
                yield df
        except Exception as e:
            print_exception(e, self.python_env)
            # Chain the original exception so the root cause isn't lost.
            raise SQLMeshError(f"Error executing Python model '{self.name}'") from e

    def render_definition(self, include_python: bool = True) -> t.List[exp.Expression]:
        # Ignore the provided value for the include_python flag, since the Python model's
        # definition without Python code is meaningless.
        return super().render_definition(include_python=True)

    @property
    def is_python(self) -> bool:
        return True

    def __repr__(self) -> str:
        return f"Model<name: {self.name}, entrypoint: {self.entrypoint}>"

The model definition which relies on a Python script to fetch the data.

Arguments:
  • entrypoint: The name of a Python function which contains the data fetching / transformation logic.
def render( self, context: sqlmesh.core.context.ExecutionContext, *, start: Union[datetime.date, datetime.datetime, str, int, float, NoneType] = None, end: Union[datetime.date, datetime.datetime, str, int, float, NoneType] = None, latest: Union[datetime.date, datetime.datetime, str, int, float, NoneType] = None, engine_adapter: Optional[sqlmesh.core.engine_adapter.base.EngineAdapter] = None, **kwargs: Any) -> Generator[<MagicMock id='6016325024'>, NoneType, NoneType]:
806    def render(
807        self,
808        context: ExecutionContext,
809        *,
810        start: t.Optional[TimeLike] = None,
811        end: t.Optional[TimeLike] = None,
812        latest: t.Optional[TimeLike] = None,
813        engine_adapter: t.Optional[EngineAdapter] = None,
814        **kwargs: t.Any,
815    ) -> t.Generator[DF, None, None]:
816        env = prepare_env(self.python_env)
817        start, end = make_inclusive(start or c.EPOCH, end or c.EPOCH)
818        latest = to_datetime(latest or c.EPOCH)
819        try:
820            df_or_iter = env[self.entrypoint](
821                context=context, start=start, end=end, latest=latest, **kwargs
822            )
823
824            if not isinstance(df_or_iter, types.GeneratorType):
825                df_or_iter = [df_or_iter]
826
827            for df in df_or_iter:
828                if self.kind.is_incremental_by_time_range:
829                    assert self.time_column
830
831                    if PySparkDataFrame is not None and isinstance(df, PySparkDataFrame):
832                        import pyspark
833
834                        df = df.where(
835                            pyspark.sql.functions.col(self.time_column.column).between(
836                                pyspark.sql.functions.lit(
837                                    self.convert_to_time_column(start).sql("spark")
838                                ),
839                                pyspark.sql.functions.lit(
840                                    self.convert_to_time_column(end).sql("spark")
841                                ),
842                            )
843                        )
844                    else:
845                        assert self.time_column.format, "Time column format is required."
846                        df = filter_df_by_timelike(
847                            df, self.time_column.column, self.time_column.format, start, end
848                        )
849                yield df
850        except Exception as e:
851            print_exception(e, self.python_env)
852            raise SQLMeshError(f"Error executing Python model '{self.name}'")

Renders the content of this model in a form of either a SELECT query, executing which the data for this model can be fetched, or a dataframe object which contains the data itself.

The type of the returned object (query or dataframe) depends on whether the model was sourced from a SQL query, a Python script or a pre-built dataset (seed).

Arguments:
  • context: The execution context used for fetching data.
  • start: The start date/time of the run.
  • end: The end date/time of the run.
  • latest: The latest date/time to use for the run.
Returns:

A generator which yields either a query object or one of the supported dataframe objects.

def render_definition( self, include_python: bool = True) -> List[sqlglot.expressions.Expression]:
854    def render_definition(self, include_python: bool = True) -> t.List[exp.Expression]:
855        # Ignore the provided value for the include_python flag, since the Pyhon model's
856        # definition without Python code is meaningless.
857        return super().render_definition(include_python=True)

Returns the original list of sql expressions comprising the model definition.

Arguments:
  • include_python: Whether or not to include Python code in the rendered definition.
def load_model( expressions: List[sqlglot.expressions.Expression], *, defaults: Optional[Dict[str, Any]] = None, path: pathlib.Path = PosixPath('.'), module_path: pathlib.Path = PosixPath('.'), time_column_format: str = '%Y-%m-%d', macros: Optional[sqlmesh.utils.UniqueKeyDict[str, Union[sqlmesh.utils.metaprogramming.Executable, sqlmesh.core.macros.macro]]] = None, hooks: Optional[sqlmesh.utils.UniqueKeyDict[str, sqlmesh.core.hooks.hook]] = None, python_env: Optional[Dict[str, sqlmesh.utils.metaprogramming.Executable]] = None, dialect: Optional[str] = None, **kwargs: Any) -> Annotated[Union[sqlmesh.core.model.definition.SqlModel, sqlmesh.core.model.definition.SeedModel, sqlmesh.core.model.definition.PythonModel], FieldInfo(default=PydanticUndefined, discriminator='source_type', extra={})]:
def load_model(
    expressions: t.List[exp.Expression],
    *,
    defaults: t.Optional[t.Dict[str, t.Any]] = None,
    path: Path = Path(),
    module_path: Path = Path(),
    time_column_format: str = c.DEFAULT_TIME_COLUMN_FORMAT,
    macros: t.Optional[MacroRegistry] = None,
    hooks: t.Optional[HookRegistry] = None,
    python_env: t.Optional[t.Dict[str, Executable]] = None,
    dialect: t.Optional[str] = None,
    **kwargs: t.Any,
) -> Model:
    """Load a model from a parsed SQLMesh model file.

    Args:
        expressions: Model, *Statements, Query.
        defaults: Definition default values.
        path: An optional path to the model definition file.
        module_path: The python module path to serialize macros for.
        time_column_format: The default time column format to use if no model time column is configured.
        macros: The custom registry of macros. If not provided the default registry will be used.
        hooks: The custom registry of hooks. If not provided the default registry will be used.
        python_env: The custom Python environment for hooks/macros. If not provided the environment
            will be constructed from the macro registry.
        dialect: The default dialect if no model dialect is configured.
            The format must adhere to Python's strftime codes.
        kwargs: Additional kwargs to pass to the loader.
    """
    if not expressions:
        raise_config_error("Incomplete model definition, missing MODEL statement", path)

    dialect = dialect or ""
    meta = expressions[0]
    if not isinstance(meta, d.Model):
        raise_config_error(
            "MODEL statement is required as the first statement in the definition",
            path,
        )

    # Everything between the MODEL statement and the trailing query is a statement.
    query = expressions[-1] if len(expressions) > 1 else None
    statements = expressions[1:-1]

    description = (
        "\n".join(comment.strip() for comment in meta.comments) if meta.comments else None
    )
    meta_fields: t.Dict[str, t.Any] = {
        "dialect": dialect,
        "description": description,
        **{prop.name.lower(): prop.args.get("value") for prop in meta.expressions},
        **kwargs,
    }

    name = meta_fields.pop("name", "")
    if not name:
        raise_config_error("Model must have a name", path)

    if isinstance(query, d.MacroVar):
        # A macro-variable "query" names the entrypoint of a Python model.
        if python_env is None:
            raise_config_error("The python environment must be provided for Python models", path)
            raise  # unreachable; narrows python_env to non-None for type checkers

        return create_python_model(
            name,
            query.name,
            python_env,
            defaults=defaults,
            path=path,
            time_column_format=time_column_format,
            **meta_fields,
        )

    if query is not None:
        return create_sql_model(
            name,
            query,
            statements=statements,
            defaults=defaults,
            path=path,
            module_path=module_path,
            time_column_format=time_column_format,
            macros=macros,
            hooks=hooks,
            python_env=python_env,
            **meta_fields,
        )

    # No query at all: the definition must describe a seed.
    try:
        seed_properties = {
            p.name.lower(): p.args.get("value") for p in meta_fields.pop("kind").expressions
        }
        return create_seed_model(
            name,
            SeedKind(**seed_properties),
            defaults=defaults,
            path=path,
            **meta_fields,
        )
    except Exception:
        raise_config_error(
            "The model definition must either have a SELECT query or a valid Seed kind",
            path,
        )
        raise

Load a model from a parsed SQLMesh model file.

Arguments:
  • expressions: Model, *Statements, Query.
  • defaults: Definition default values.
  • path: An optional path to the model definition file.
  • module_path: The python module path to serialize macros for.
  • time_column_format: The default time column format to use if no model time column is configured.
  • macros: The custom registry of macros. If not provided the default registry will be used.
  • hooks: The custom registry of hooks. If not provided the default registry will be used.
  • python_env: The custom Python environment for hooks/macros. If not provided the environment will be constructed from the macro registry.
  • dialect: The default dialect if no model dialect is configured. The format must adhere to Python's strftime codes.
  • kwargs: Additional kwargs to pass to the loader.
def create_sql_model( name: str, query: sqlglot.expressions.Expression, *, statements: Optional[List[sqlglot.expressions.Expression]] = None, defaults: Optional[Dict[str, Any]] = None, path: pathlib.Path = PosixPath('.'), module_path: pathlib.Path = PosixPath('.'), time_column_format: str = '%Y-%m-%d', macros: Optional[sqlmesh.utils.UniqueKeyDict[str, Union[sqlmesh.utils.metaprogramming.Executable, sqlmesh.core.macros.macro]]] = None, hooks: Optional[sqlmesh.utils.UniqueKeyDict[str, sqlmesh.core.hooks.hook]] = None, python_env: Optional[Dict[str, sqlmesh.utils.metaprogramming.Executable]] = None, dialect: Optional[str] = None, **kwargs: Any) -> Annotated[Union[sqlmesh.core.model.definition.SqlModel, sqlmesh.core.model.definition.SeedModel, sqlmesh.core.model.definition.PythonModel], FieldInfo(default=PydanticUndefined, discriminator='source_type', extra={})]:
 974def create_sql_model(
 975    name: str,
 976    query: exp.Expression,
 977    *,
 978    statements: t.Optional[t.List[exp.Expression]] = None,
 979    defaults: t.Optional[t.Dict[str, t.Any]] = None,
 980    path: Path = Path(),
 981    module_path: Path = Path(),
 982    time_column_format: str = c.DEFAULT_TIME_COLUMN_FORMAT,
 983    macros: t.Optional[MacroRegistry] = None,
 984    hooks: t.Optional[HookRegistry] = None,
 985    python_env: t.Optional[t.Dict[str, Executable]] = None,
 986    dialect: t.Optional[str] = None,
 987    **kwargs: t.Any,
 988) -> Model:
 989    """Creates a SQL model.
 990
 991    Args:
 992        name: The name of the model, which is of the form [catalog].[db].table.
 993            The catalog and db are optional.
 994        query: The model's logic in a form of a SELECT query.
 995        statements: The list of all SQL statements that are not a query or a model definition.
 996        defaults: Definition default values.
 997        path: An optional path to the model definition file.
 998        module_path: The python module path to serialize macros for.
 999        time_column_format: The default time column format to use if no model time column is configured.
1000        macros: The custom registry of macros. If not provided the default registry will be used.
1001        hooks: The custom registry of hooks. If not provided the default registry will be used.
1002        python_env: The custom Python environment for hooks/macros. If not provided the environment will be constructed
1003            from the macro registry.
1004        dialect: The default dialect if no model dialect is configured.
1005            The format must adhere to Python's strftime codes.
1006    """
1007    if not isinstance(query, (exp.Subqueryable, d.Jinja)):
1008        raise_config_error(
1009            "A query is required and must be a SELECT or UNION statement.",
1010            path,
1011        )
1012
1013    if not python_env:
1014        python_env = _python_env(
1015            query,
1016            _extract_hooks(kwargs),
1017            module_path,
1018            macros or macro.get_registry(),
1019            hooks or hook.get_registry(),
1020        )
1021
1022    return _create_model(
1023        SqlModel,
1024        name,
1025        defaults=defaults,
1026        path=path,
1027        time_column_format=time_column_format,
1028        python_env=python_env,
1029        dialect=dialect,
1030        expressions=statements or [],
1031        query=query,
1032        **kwargs,
1033    )

Creates a SQL model.

Arguments:
  • name: The name of the model, which is of the form [catalog].[db].table. The catalog and db are optional.
  • query: The model's logic in a form of a SELECT query.
  • statements: The list of all SQL statements that are not a query or a model definition.
  • defaults: Definition default values.
  • path: An optional path to the model definition file.
  • module_path: The python module path to serialize macros for.
  • time_column_format: The default time column format to use if no model time column is configured.
  • macros: The custom registry of macros. If not provided the default registry will be used.
  • hooks: The custom registry of hooks. If not provided the default registry will be used.
  • python_env: The custom Python environment for hooks/macros. If not provided the environment will be constructed from the macro registry.
  • dialect: The default dialect if no model dialect is configured. The format must adhere to Python's strftime codes.
def create_seed_model( name: str, seed_kind: sqlmesh.core.model.kind.SeedKind, *, defaults: Optional[Dict[str, Any]] = None, path: pathlib.Path = PosixPath('.'), **kwargs: Any) -> Annotated[Union[sqlmesh.core.model.definition.SqlModel, sqlmesh.core.model.definition.SeedModel, sqlmesh.core.model.definition.PythonModel], FieldInfo(default=PydanticUndefined, discriminator='source_type', extra={})]:
def create_seed_model(
    name: str,
    seed_kind: SeedKind,
    *,
    defaults: t.Optional[t.Dict[str, t.Any]] = None,
    path: Path = Path(),
    **kwargs: t.Any,
) -> Model:
    """Creates a Seed model.

    Args:
        name: The name of the model, which is of the form [catalog].[db].table.
            The catalog and db are optional.
        seed_kind: The information about the location of a seed and other related configuration.
        defaults: Definition default values.
        path: An optional path to the model definition file.
    """
    seed_path = Path(seed_kind.path)
    if not seed_path.is_absolute():
        # Resolve relative seed paths against the model's directory.
        base = path if path.is_dir() else path.parent
        seed_path = base / seed_path
    return _create_model(
        SeedModel,
        name,
        defaults=defaults,
        path=path,
        seed=create_seed(seed_path),
        kind=seed_kind,
        **kwargs,
    )

Creates a Seed model.

Arguments:
  • name: The name of the model, which is of the form [catalog].[db].table. The catalog and db are optional.
  • seed_kind: The information about the location of a seed and other related configuration.
  • defaults: Definition default values.
  • path: An optional path to the model definition file.
def create_python_model( name: str, entrypoint: str, python_env: Dict[str, sqlmesh.utils.metaprogramming.Executable], *, defaults: Optional[Dict[str, Any]] = None, path: pathlib.Path = PosixPath('.'), time_column_format: str = '%Y-%m-%d', depends_on: Optional[Set[str]] = None, **kwargs: Any) -> Annotated[Union[sqlmesh.core.model.definition.SqlModel, sqlmesh.core.model.definition.SeedModel, sqlmesh.core.model.definition.PythonModel], FieldInfo(default=PydanticUndefined, discriminator='source_type', extra={})]:
def create_python_model(
    name: str,
    entrypoint: str,
    python_env: t.Dict[str, Executable],
    *,
    defaults: t.Optional[t.Dict[str, t.Any]] = None,
    path: Path = Path(),
    time_column_format: str = c.DEFAULT_TIME_COLUMN_FORMAT,
    depends_on: t.Optional[t.Set[str]] = None,
    **kwargs: t.Any,
) -> Model:
    """Creates a Python model.

    Args:
        name: The name of the model, which is of the form [catalog].[db].table.
            The catalog and db are optional.
        entrypoint: The name of a Python function which contains the data fetching / transformation logic.
        python_env: The Python environment of all objects referenced by the model implementation.
        defaults: Definition default values.
        path: An optional path to the model definition file.
        time_column_format: The default time column format to use if no model time column is configured.
        depends_on: The custom set of model's upstream dependencies.
    """
    # Find dependencies for python models by parsing code if they are not explicitly defined.
    # BUG FIX: the original conditional expression ended in `else None`, which silently
    # discarded an explicitly provided `depends_on` argument; it must be preserved.
    if depends_on is None and python_env is not None:
        depends_on = _parse_depends_on(entrypoint, python_env)
    return _create_model(
        PythonModel,
        name,
        defaults=defaults,
        path=path,
        time_column_format=time_column_format,
        depends_on=depends_on,
        entrypoint=entrypoint,
        python_env=python_env,
        **kwargs,
    )

Creates a Python model.

Arguments:
  • name: The name of the model, which is of the form [catalog].[db].table. The catalog and db are optional.
  • entrypoint: The name of a Python function which contains the data fetching / transformation logic.
  • python_env: The Python environment of all objects referenced by the model implementation.
  • defaults: Definition default values.
  • path: An optional path to the model definition file.
  • time_column_format: The default time column format to use if no model time column is configured.
  • depends_on: The custom set of model's upstream dependencies.