
sqlmesh.core.engine_adapter.base_spark

from __future__ import annotations

import typing as t

import pandas as pd
from sqlglot import exp, parse_one

from sqlmesh.core.dialect import pandas_to_sql
from sqlmesh.core.engine_adapter.base import EngineAdapter
from sqlmesh.core.engine_adapter.shared import (
    DataObject,
    DataObjectType,
    TransactionType,
)
from sqlmesh.utils import nullsafe_join
from sqlmesh.utils.errors import SQLMeshError

if t.TYPE_CHECKING:
    from sqlmesh.core._typing import TableName
    from sqlmesh.core.engine_adapter._typing import QueryOrDF
    from sqlmesh.core.model.meta import IntervalUnit


class BaseSparkEngineAdapter(EngineAdapter):
    ESCAPE_JSON = True

    def replace_query(
        self,
        table_name: TableName,
        query_or_df: QueryOrDF,
        columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
    ) -> None:
        # Note: Some storage formats (like Delta and Iceberg) support REPLACE TABLE but since we don't
        # currently check for storage formats we will just do an insert/overwrite.
        return self._insert_overwrite_by_condition(
            table_name, query_or_df, columns_to_types=columns_to_types
        )

    def _insert_overwrite_by_condition(
        self,
        table_name: TableName,
        query_or_df: QueryOrDF,
        where: t.Optional[exp.Condition] = None,
        columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
    ) -> None:
        table = exp.to_table(table_name)
        if isinstance(query_or_df, pd.DataFrame):
            if columns_to_types is None:
                raise SQLMeshError("columns_to_types must be provided when using Pandas DataFrames")
            query_or_df = next(
                pandas_to_sql(
                    query_or_df,
                    alias=table.alias_or_name,
                    columns_to_types=columns_to_types,
                )
            )
        self.execute(
            exp.Insert(
                this=self._insert_into_expression(table_name, columns_to_types),
                expression=query_or_df,
                overwrite=True,
            )
        )

    def create_state_table(
        self,
        table_name: str,
        columns_to_types: t.Dict[str, exp.DataType],
        primary_key: t.Optional[t.Tuple[str, ...]] = None,
    ) -> None:
        self.create_table(
            table_name,
            columns_to_types,
            partitioned_by=primary_key,
        )

    def alter_table(
        self,
        table_name: TableName,
        added_columns: t.Dict[str, str],
        dropped_columns: t.Sequence[str],
    ) -> None:
        alter_table = exp.AlterTable(this=exp.to_table(table_name))

        if dropped_columns:
            drop_columns = exp.Drop(
                this=exp.Schema(
                    expressions=[exp.to_identifier(column_name) for column_name in dropped_columns]
                ),
                kind="COLUMNS",
            )
            alter_table.set("actions", [drop_columns])
            self.execute(alter_table)

        if added_columns:
            add_columns = exp.Schema(
                expressions=[
                    exp.ColumnDef(
                        this=exp.to_identifier(column_name),
                        kind=parse_one(column_type, into=exp.DataType),  # type: ignore
                    )
                    for column_name, column_type in added_columns.items()
                ],
            )
            alter_table.set("actions", [add_columns])
            self.execute(alter_table)

    def _create_table_properties(
        self,
        storage_format: t.Optional[str] = None,
        partitioned_by: t.Optional[t.List[str]] = None,
        partition_interval_unit: t.Optional[IntervalUnit] = None,
    ) -> t.Optional[exp.Properties]:
        format_property = None
        partition_columns_property = None
        if storage_format:
            format_property = exp.TableFormatProperty(this=exp.Var(this=storage_format))
        if partitioned_by:
            partition_columns_property = exp.PartitionedByProperty(
                this=exp.Schema(
                    expressions=[exp.to_identifier(column) for column in partitioned_by]
                ),
            )
        return exp.Properties(
            expressions=[
                table_property
                for table_property in [format_property, partition_columns_property]
                if table_property
            ]
        )

    def supports_transactions(self, transaction_type: TransactionType) -> bool:
        return False

    def _get_data_objects(
        self, schema_name: str, catalog_name: t.Optional[str] = None
    ) -> t.List[DataObject]:
        """
        Returns all the data objects that exist in the given schema and optionally catalog.
        """
        target = nullsafe_join(".", catalog_name, schema_name)
        query = f"SHOW TABLE EXTENDED IN {target} LIKE '*'"
        df = self.fetchdf(query)
        return [
            DataObject(
                catalog=catalog_name,
                schema=schema_name,
                name=row.tableName,  # type: ignore
                type=DataObjectType.from_str(
                    "VIEW" if "Type: VIEW" in row.information else "TABLE"  # type: ignore
                ),
            )
            for row in df.itertuples()
        ]
class BaseSparkEngineAdapter(sqlmesh.core.engine_adapter.base.EngineAdapter):

Base class wrapping a Database API-compliant connection.

The EngineAdapter is an easily subclassable interface that interacts with the underlying engine and data store.

Arguments:
  • connection_factory: a callable which produces a new Database API-compliant connection on every call.
  • dialect: The dialect with which this adapter is associated.
  • multithreaded: Indicates whether this adapter will be used by more than one thread.
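
A minimal construction sketch, assuming the constructor takes the arguments listed above in this order; the exact signature may differ between SQLMesh versions, and the pyhive connection shown here is only one illustrative DB-API driver:

import typing as t

from pyhive import hive  # assumption: any Database API-compliant Spark/Hive driver works


def connection_factory() -> t.Any:
    # Produces a new Database API-compliant connection on every call,
    # as the connection_factory argument requires.
    return hive.connect(host="localhost", port=10000)


adapter = BaseSparkEngineAdapter(
    connection_factory,
    dialect="spark",
    multithreaded=False,
)
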
def replace_query(self, table_name: TableName, query_or_df: QueryOrDF, columns_to_types: Optional[Dict[str, sqlglot.expressions.DataType]] = None) -> None:

Replaces an existing table with a query.

For partition-based engines (hive, spark), insert overwrite is used. For other systems, create or replace is used.

Arguments:
  • table_name: The name of the table (e.g. prod.table)
  • query_or_df: The SQL query to run or a dataframe.
  • columns_to_types: Only used if a dataframe is provided. A mapping between the column name and its data type. Expected to be ordered to match the order of values in the dataframe.
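
A usage sketch, assuming `adapter` is an instance of this class and that the target table already exists; on Spark this call results in an insert overwrite rather than a create or replace:

import pandas as pd
from sqlglot import exp

df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})

# columns_to_types is required because a DataFrame, not a query, is passed;
# the mapping's order must match the DataFrame's column order.
adapter.replace_query(
    "prod.table",
    df,
    columns_to_types={
        "id": exp.DataType.build("int"),
        "name": exp.DataType.build("string"),
    },
)
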
def create_state_table(self, table_name: str, columns_to_types: Dict[str, sqlglot.expressions.DataType], primary_key: Optional[Tuple[str, ...]] = None) -> None:

Create a table to store SQLMesh internal state.

Arguments:
  • table_name: The name of the table to create. Can be fully qualified or just table name.
  • columns_to_types: A mapping between the column name and its data type.
  • primary_key: Determines the table primary key.
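
In this adapter the primary key columns are simply reused as the table's partition columns (see the source above). A sketch with a hypothetical table name and schema:

from sqlglot import exp

adapter.create_state_table(
    "sqlmesh._snapshots",  # hypothetical state table name
    columns_to_types={
        "name": exp.DataType.build("string"),
        "payload": exp.DataType.build("string"),
    },
    primary_key=("name",),  # passed through as partitioned_by
)
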
def alter_table(self, table_name: TableName, added_columns: Dict[str, str], dropped_columns: Sequence[str]) -> None:
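
Based on the source above, dropped columns and added columns are applied as two separate ALTER TABLE statements: a DROP COLUMNS action followed by an ADD COLUMNS action. A sketch with hypothetical table and column names:

# Issues roughly:
#   ALTER TABLE prod.table DROP COLUMNS (legacy_id)
#   ALTER TABLE prod.table ADD COLUMNS (created_at TIMESTAMP)
adapter.alter_table(
    "prod.table",
    added_columns={"created_at": "timestamp"},
    dropped_columns=["legacy_id"],
)
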
def supports_transactions(self, transaction_type: sqlmesh.core.engine_adapter.shared.TransactionType) -> bool:

Whether or not the engine adapter supports transactions for the given transaction type.
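
The base Spark adapter reports no transaction support regardless of the transaction type. A minimal illustration, assuming `adapter` is an instance of this class and that TransactionType exposes DDL and DML members:

from sqlmesh.core.engine_adapter.shared import TransactionType

# Always False for Spark-based adapters.
assert adapter.supports_transactions(TransactionType.DDL) is False
assert adapter.supports_transactions(TransactionType.DML) is False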