sqlmesh.core.engine_adapter.base_spark
from __future__ import annotations

import typing as t

import pandas as pd
from sqlglot import exp, parse_one

from sqlmesh.core.dialect import pandas_to_sql
from sqlmesh.core.engine_adapter.base import EngineAdapter
from sqlmesh.core.engine_adapter.shared import (
    DataObject,
    DataObjectType,
    TransactionType,
)
from sqlmesh.utils import nullsafe_join
from sqlmesh.utils.errors import SQLMeshError

if t.TYPE_CHECKING:
    from sqlmesh.core._typing import TableName
    from sqlmesh.core.engine_adapter._typing import QueryOrDF
    from sqlmesh.core.model.meta import IntervalUnit


class BaseSparkEngineAdapter(EngineAdapter):
    ESCAPE_JSON = True

    def replace_query(
        self,
        table_name: TableName,
        query_or_df: QueryOrDF,
        columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
    ) -> None:
        # Note: Some storage formats (like Delta and Iceberg) support REPLACE TABLE but since we don't
        # currently check for storage formats we will just do an insert/overwrite.
        return self._insert_overwrite_by_condition(
            table_name, query_or_df, columns_to_types=columns_to_types
        )

    def _insert_overwrite_by_condition(
        self,
        table_name: TableName,
        query_or_df: QueryOrDF,
        where: t.Optional[exp.Condition] = None,
        columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
    ) -> None:
        table = exp.to_table(table_name)
        if isinstance(query_or_df, pd.DataFrame):
            if columns_to_types is None:
                raise SQLMeshError("columns_to_types must be provided when using Pandas DataFrames")
            query_or_df = next(
                pandas_to_sql(
                    query_or_df,
                    alias=table.alias_or_name,
                    columns_to_types=columns_to_types,
                )
            )
        self.execute(
            exp.Insert(
                this=self._insert_into_expression(table_name, columns_to_types),
                expression=query_or_df,
                overwrite=True,
            )
        )

    def create_state_table(
        self,
        table_name: str,
        columns_to_types: t.Dict[str, exp.DataType],
        primary_key: t.Optional[t.Tuple[str, ...]] = None,
    ) -> None:
        self.create_table(
            table_name,
            columns_to_types,
            partitioned_by=primary_key,
        )

    def alter_table(
        self,
        table_name: TableName,
        added_columns: t.Dict[str, str],
        dropped_columns: t.Sequence[str],
    ) -> None:
        alter_table = exp.AlterTable(this=exp.to_table(table_name))

        if dropped_columns:
            drop_columns = exp.Drop(
                this=exp.Schema(
                    expressions=[exp.to_identifier(column_name) for column_name in dropped_columns]
                ),
                kind="COLUMNS",
            )
            alter_table.set("actions", [drop_columns])
            self.execute(alter_table)

        if added_columns:
            add_columns = exp.Schema(
                expressions=[
                    exp.ColumnDef(
                        this=exp.to_identifier(column_name),
                        kind=parse_one(column_type, into=exp.DataType),  # type: ignore
                    )
                    for column_name, column_type in added_columns.items()
                ],
            )
            alter_table.set("actions", [add_columns])
            self.execute(alter_table)

    def _create_table_properties(
        self,
        storage_format: t.Optional[str] = None,
        partitioned_by: t.Optional[t.List[str]] = None,
        partition_interval_unit: t.Optional[IntervalUnit] = None,
    ) -> t.Optional[exp.Properties]:
        format_property = None
        partition_columns_property = None
        if storage_format:
            format_property = exp.TableFormatProperty(this=exp.Var(this=storage_format))
        if partitioned_by:
            partition_columns_property = exp.PartitionedByProperty(
                this=exp.Schema(
                    expressions=[exp.to_identifier(column) for column in partitioned_by]
                ),
            )
        return exp.Properties(
            expressions=[
                table_property
                for table_property in [format_property, partition_columns_property]
                if table_property
            ]
        )

    def supports_transactions(self, transaction_type: TransactionType) -> bool:
        return False

    def _get_data_objects(
        self, schema_name: str, catalog_name: t.Optional[str] = None
    ) -> t.List[DataObject]:
        """
        Returns all the data objects that exist in the given schema and optionally catalog.
        """
        target = nullsafe_join(".", catalog_name, schema_name)
        query = f"SHOW TABLE EXTENDED IN {target} LIKE '*'"
        df = self.fetchdf(query)
        return [
            DataObject(
                catalog=catalog_name,
                schema=schema_name,
                name=row.tableName,  # type: ignore
                type=DataObjectType.from_str(
                    "VIEW" if "Type: VIEW" in row.information else "TABLE"  # type: ignore
                ),
            )
            for row in df.itertuples()
        ]
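For orientation, below is a rough sketch of the SQL shapes these private helpers produce. It builds a sqlglot INSERT expression the same way _insert_overwrite_by_condition does (the real method resolves the target via self._insert_into_expression and first converts DataFrames with pandas_to_sql); the table and query names are made up, and the exact rendering depends on the sqlglot version.

from sqlglot import exp, parse_one

# An overwrite insert expressed as a sqlglot expression, mirroring what
# _insert_overwrite_by_condition hands to self.execute().
insert = exp.Insert(
    this=exp.to_table("db.example_table"),  # hypothetical target table
    expression=parse_one("SELECT id, ds FROM db.example_source"),
    overwrite=True,
)
print(insert.sql(dialect="spark"))
# Roughly: INSERT OVERWRITE TABLE db.example_table SELECT id, ds FROM db.example_source

# _get_data_objects, by contrast, lists tables and views with a metadata query of the form
# SHOW TABLE EXTENDED IN <schema> LIKE '*' and checks each row's `information` column
# for "Type: VIEW" to distinguish views from tables.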
class BaseSparkEngineAdapter(EngineAdapter):
Base class wrapping a Database API-compliant connection.
The EngineAdapter is an easily subclassable interface that interacts with the underlying engine and data store.
Arguments:
- connection_factory: A callable that produces a new Database API-compliant connection on every call.
- dialect: The dialect with which this adapter is associated.
- multithreaded: Indicates whether this adapter will be used by more than one thread.
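A minimal construction sketch, assuming the constructor arguments listed above. In practice a concrete subclass of this adapter is typically used, and the pyhive connection here is just one hypothetical way to obtain a Database API-compliant connection:

from pyhive import hive  # hypothetical choice of DB-API driver

from sqlmesh.core.engine_adapter.base_spark import BaseSparkEngineAdapter

def connection_factory():
    # Must return a new Database API-compliant connection on every call.
    return hive.connect(host="localhost", port=10000)

adapter = BaseSparkEngineAdapter(connection_factory, dialect="spark")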
def replace_query(self, table_name: TableName, query_or_df: QueryOrDF, columns_to_types: Optional[Dict[str, sqlglot.expressions.DataType]] = None) -> None:
def replace_query(
    self,
    table_name: TableName,
    query_or_df: QueryOrDF,
    columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
) -> None:
    # Note: Some storage formats (like Delta and Iceberg) support REPLACE TABLE but since we don't
    # currently check for storage formats we will just do an insert/overwrite.
    return self._insert_overwrite_by_condition(
        table_name, query_or_df, columns_to_types=columns_to_types
    )
Replaces an existing table with a query.
For partition-based engines (Hive, Spark), an insert overwrite is used. For other systems, a create or replace is used.
Arguments:
- table_name: The name of the table (e.g. prod.table).
- query_or_df: The SQL query to run or a dataframe.
- columns_to_types: Only used if a dataframe is provided. A mapping between the column name and its data type. Expected to be ordered to match the order of values in the dataframe.
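A hedged usage sketch for the DataFrame path (the adapter instance, table, and column names are illustrative):

import pandas as pd
from sqlglot import exp

df = pd.DataFrame({"id": [1, 2], "ds": ["2023-01-01", "2023-01-02"]})

# columns_to_types is mandatory for DataFrames and must follow the DataFrame's column order.
adapter.replace_query(
    "prod.example_table",
    df,
    columns_to_types={
        "id": exp.DataType.build("int"),
        "ds": exp.DataType.build("text"),
    },
)
# On this adapter the replacement is executed as an insert overwrite rather than
# a create or replace, as noted above.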
def create_state_table(self, table_name: str, columns_to_types: Dict[str, sqlglot.expressions.DataType], primary_key: Optional[Tuple[str, ...]] = None) -> None:
def create_state_table(
    self,
    table_name: str,
    columns_to_types: t.Dict[str, exp.DataType],
    primary_key: t.Optional[t.Tuple[str, ...]] = None,
) -> None:
    self.create_table(
        table_name,
        columns_to_types,
        partitioned_by=primary_key,
    )
Create a table to store SQLMesh internal state.
Arguments:
- table_name: The name of the table to create. Can be fully qualified or just the table name.
- columns_to_types: A mapping between the column name and its data type.
- primary_key: Determines the table primary key.
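For instance (illustrative names, adapter construction omitted), a state table whose primary key is reused as the partition columns on Spark:

from sqlglot import exp

adapter.create_state_table(
    "sqlmesh_state.example_snapshots",  # hypothetical state table name
    {
        "name": exp.DataType.build("text"),
        "payload": exp.DataType.build("text"),
    },
    primary_key=("name",),
)
# This adapter simply forwards primary_key as partitioned_by to create_table,
# since Spark tables do not enforce primary key constraints.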
def alter_table(self, table_name: TableName, added_columns: Dict[str, str], dropped_columns: Sequence[str]) -> None:
def alter_table(
    self,
    table_name: TableName,
    added_columns: t.Dict[str, str],
    dropped_columns: t.Sequence[str],
) -> None:
    alter_table = exp.AlterTable(this=exp.to_table(table_name))

    if dropped_columns:
        drop_columns = exp.Drop(
            this=exp.Schema(
                expressions=[exp.to_identifier(column_name) for column_name in dropped_columns]
            ),
            kind="COLUMNS",
        )
        alter_table.set("actions", [drop_columns])
        self.execute(alter_table)

    if added_columns:
        add_columns = exp.Schema(
            expressions=[
                exp.ColumnDef(
                    this=exp.to_identifier(column_name),
                    kind=parse_one(column_type, into=exp.DataType),  # type: ignore
                )
                for column_name, column_type in added_columns.items()
            ],
        )
        alter_table.set("actions", [add_columns])
        self.execute(alter_table)
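Note that dropped and added columns are applied as two separate ALTER TABLE statements. A rough sketch of a call and the SQL it executes (illustrative names; the exact rendering depends on the sqlglot version):

adapter.alter_table(
    "prod.example_table",
    added_columns={"new_col": "STRING"},
    dropped_columns=["old_col"],
)
# Executes approximately:
#   ALTER TABLE prod.example_table DROP COLUMNS (old_col)
#   ALTER TABLE prod.example_table ADD COLUMNS (new_col STRING)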
def supports_transactions(self, transaction_type: sqlmesh.core.engine_adapter.shared.TransactionType) -> bool:
Whether or not the engine adapter supports transactions for the given transaction type.
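On this adapter it always returns False, so transactional wrapping is effectively skipped for both DDL and DML. A small hedged check, assuming TransactionType exposes DDL and DML members:

from sqlmesh.core.engine_adapter.shared import TransactionType

assert adapter.supports_transactions(TransactionType.DDL) is False
assert adapter.supports_transactions(TransactionType.DML) is False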
Inherited Members
- sqlmesh.core.engine_adapter.base.EngineAdapter
- EngineAdapter
- recycle
- close
- create_index
- create_table
- create_table_like
- drop_table
- create_view
- create_schema
- drop_schema
- drop_view
- columns
- table_exists
- delete_from
- insert_append
- insert_overwrite_by_time_partition
- update_table
- merge
- rename_table
- fetchone
- fetchall
- fetchdf
- fetch_pyspark_df
- transaction
- execute