sqlmesh.core.engine_adapter.spark
from __future__ import annotations

import typing as t

import pandas as pd
from sqlglot import exp

from sqlmesh.core.engine_adapter._typing import PySparkDataFrame, PySparkSession
from sqlmesh.core.engine_adapter.base_spark import BaseSparkEngineAdapter
from sqlmesh.core.engine_adapter.shared import DataObject, DataObjectType
from sqlmesh.utils import nullsafe_join

if t.TYPE_CHECKING:
    from sqlmesh.core._typing import TableName
    from sqlmesh.core.engine_adapter._typing import DF, QueryOrDF


class SparkEngineAdapter(BaseSparkEngineAdapter):
    DIALECT = "spark"

    @property
    def spark(self) -> PySparkSession:
        return self._connection_pool.get().spark

    def _ensure_pyspark_df(self, df: DF) -> PySparkDataFrame:
        if not isinstance(df, PySparkDataFrame):
            return self.spark.createDataFrame(df)
        return df

    def fetchdf(self, query: t.Union[exp.Expression, str]) -> pd.DataFrame:
        return self.fetch_pyspark_df(query).toPandas()

    def fetch_pyspark_df(self, query: t.Union[exp.Expression, str]) -> PySparkDataFrame:
        return t.cast(PySparkDataFrame, self._fetch_native_df(query))

    def _insert_overwrite_by_condition(
        self,
        table_name: TableName,
        query_or_df: QueryOrDF,
        where: t.Optional[exp.Condition] = None,
        columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
    ) -> None:
        if isinstance(query_or_df, (pd.DataFrame, PySparkDataFrame)):
            self._insert_pyspark_df(
                table_name, self._ensure_pyspark_df(query_or_df), overwrite=True
            )
        else:
            super()._insert_overwrite_by_condition(table_name, query_or_df, where, columns_to_types)

    def insert_append(
        self,
        table_name: TableName,
        query_or_df: QueryOrDF,
        columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
        contains_json: bool = False,
    ) -> None:
        if isinstance(query_or_df, PySparkDataFrame):
            self._insert_append_pyspark_df(table_name, query_or_df)
        else:
            super().insert_append(table_name, query_or_df, columns_to_types, contains_json)

    def _insert_append_pandas_df(
        self,
        table_name: TableName,
        df: pd.DataFrame,
        columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
    ) -> None:
        self._insert_pyspark_df(table_name, self._ensure_pyspark_df(df), overwrite=False)

    def _insert_append_pyspark_df(
        self,
        table_name: TableName,
        df: PySparkDataFrame,
    ) -> None:
        self._insert_pyspark_df(table_name, df, overwrite=False)

    def _insert_pyspark_df(
        self,
        table_name: TableName,
        df: PySparkDataFrame,
        overwrite: bool = False,
    ) -> None:
        if isinstance(table_name, exp.Table):
            table_name = table_name.sql(dialect=self.dialect)

        df.select(*self.spark.table(table_name).columns).write.insertInto(  # type: ignore
            table_name, overwrite=overwrite
        )

    def _get_data_objects(
        self, schema_name: str, catalog_name: t.Optional[str] = None
    ) -> t.List[DataObject]:
        target = nullsafe_join(".", catalog_name, schema_name)
        df = self.fetch_pyspark_df(f"SHOW TABLE EXTENDED IN {target} LIKE '*'")
        return [
            DataObject(
                catalog=catalog_name,
                schema=schema_name,
                name=row["tableName"],
                type=DataObjectType.VIEW
                if "Type: VIEW" in row["information"]
                else DataObjectType.TABLE,
            )
            for row in df.collect()
        ]
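The DataFrame write path above is worth spelling out: both pandas and PySpark inputs are funneled through _ensure_pyspark_df, and _insert_pyspark_df reorders the DataFrame's columns to match the target table before calling insertInto. Below is a minimal sketch of that behaviour; the SparkSession, table name, and DataFrames are illustrative stand-ins, not part of sqlmesh.

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# What _ensure_pyspark_df does for a pandas input: convert it to a PySpark DataFrame.
pandas_df = pd.DataFrame({"id": [1, 2], "amount": [10.0, 20.0]})
sdf = spark.createDataFrame(pandas_df)

# What _insert_pyspark_df does: select the target table's columns first so the column
# order matches the table, then write with insertInto. overwrite=True is used by
# _insert_overwrite_by_condition; the append paths pass overwrite=False.
target = "analytics.sales"  # hypothetical existing table
sdf.select(*spark.table(target).columns).write.insertInto(target, overwrite=False)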
class SparkEngineAdapter(BaseSparkEngineAdapter):
Base class wrapping a Database API-compliant connection.
The EngineAdapter is an easily subclassable interface that interacts with the underlying engine and data store.
Arguments:
- connection_factory: A callable that produces a new Database API-compliant connection on every call.
- dialect: The dialect with which this adapter is associated.
- multithreaded: Indicates whether this adapter will be used by more than one thread.
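A hedged construction sketch, based only on the arguments documented above and on the spark property in the source: each pooled connection is assumed to expose a .spark SparkSession attribute. In practice sqlmesh builds this adapter from a Spark connection configuration, so the stub connection below is hypothetical.

from pyspark.sql import SparkSession

from sqlmesh.core.engine_adapter.spark import SparkEngineAdapter


class SparkConnectionStub:
    """Hypothetical stand-in for the connection sqlmesh's Spark config would produce."""

    def __init__(self) -> None:
        # The adapter's `spark` property reads `.spark` off the pooled connection.
        self.spark = SparkSession.builder.master("local[*]").getOrCreate()


adapter = SparkEngineAdapter(
    connection_factory=SparkConnectionStub,
    dialect="spark",
    multithreaded=False,
)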
def fetchdf(self, query: Union[sqlglot.expressions.Expression, str]) -> pandas.core.frame.DataFrame:
    def fetchdf(self, query: t.Union[exp.Expression, str]) -> pd.DataFrame:
        return self.fetch_pyspark_df(query).toPandas()
Fetches a Pandas DataFrame from the cursor
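A small usage sketch, assuming adapter is an already-constructed SparkEngineAdapter and the table name is illustrative:

pdf = adapter.fetchdf("SELECT id, amount FROM analytics.sales WHERE amount > 0")
print(pdf.dtypes)  # an ordinary pandas DataFrame, fully materialized on the driver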
def fetch_pyspark_df(self, query: Union[sqlglot.expressions.Expression, str]) -> pyspark.sql.dataframe.DataFrame:
    def fetch_pyspark_df(self, query: t.Union[exp.Expression, str]) -> PySparkDataFrame:
        return t.cast(PySparkDataFrame, self._fetch_native_df(query))
Fetches a PySpark DataFrame from the cursor
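Unlike fetchdf, the result here is a PySpark DataFrame, so further transformations can be chained in Spark before anything is collected. A brief sketch, with the same hypothetical adapter and table as above:

sdf = adapter.fetch_pyspark_df("SELECT id, amount FROM analytics.sales")
print(sdf.filter("amount > 100").count())  # filtering and counting run in Spark, not on the driver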
def insert_append(self, table_name: TableName, query_or_df: QueryOrDF, columns_to_types: Optional[Dict[str, sqlglot.expressions.DataType]] = None, contains_json: bool = False) -> None:
    def insert_append(
        self,
        table_name: TableName,
        query_or_df: QueryOrDF,
        columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
        contains_json: bool = False,
    ) -> None:
        if isinstance(query_or_df, PySparkDataFrame):
            self._insert_append_pyspark_df(table_name, query_or_df)
        else:
            super().insert_append(table_name, query_or_df, columns_to_types, contains_json)
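A sketch of the two code paths above, again with a hypothetical adapter and illustrative table names: a PySpark DataFrame takes the direct insertInto route, while a pandas DataFrame (or a SQL query) falls through to the base implementation, which for pandas is converted via _ensure_pyspark_df.

import pandas as pd
from sqlglot import exp

# Path 1: a PySpark DataFrame is appended directly through the Spark write path.
spark_sales_df = adapter.fetch_pyspark_df("SELECT * FROM analytics.sales_staging")
adapter.insert_append("analytics.sales", spark_sales_df)

# Path 2: a pandas DataFrame (or a query) is handled by the base implementation;
# columns_to_types is passed along in case the base path needs the target schema.
adapter.insert_append(
    "analytics.sales",
    pd.DataFrame({"id": [3], "amount": [30.0]}),
    columns_to_types={
        "id": exp.DataType.build("int"),
        "amount": exp.DataType.build("double"),
    },
)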
Inherited Members
- sqlmesh.core.engine_adapter.base.EngineAdapter
- EngineAdapter
- recycle
- close
- create_index
- create_table
- create_table_like
- drop_table
- create_view
- create_schema
- drop_schema
- drop_view
- columns
- table_exists
- delete_from
- insert_overwrite_by_time_partition
- update_table
- merge
- rename_table
- fetchone
- fetchall
- transaction
- execute