
sqlmesh.core.engine_adapter.spark

from __future__ import annotations

import typing as t

import pandas as pd
from sqlglot import exp

from sqlmesh.core.engine_adapter._typing import PySparkDataFrame, PySparkSession
from sqlmesh.core.engine_adapter.base_spark import BaseSparkEngineAdapter
from sqlmesh.core.engine_adapter.shared import DataObject, DataObjectType
from sqlmesh.utils import nullsafe_join

if t.TYPE_CHECKING:
    from sqlmesh.core._typing import TableName
    from sqlmesh.core.engine_adapter._typing import DF, QueryOrDF


class SparkEngineAdapter(BaseSparkEngineAdapter):
    DIALECT = "spark"

    @property
    def spark(self) -> PySparkSession:
        return self._connection_pool.get().spark

    def _ensure_pyspark_df(self, df: DF) -> PySparkDataFrame:
        if not isinstance(df, PySparkDataFrame):
            return self.spark.createDataFrame(df)
        return df

    def fetchdf(self, query: t.Union[exp.Expression, str]) -> pd.DataFrame:
        return self.fetch_pyspark_df(query).toPandas()

    def fetch_pyspark_df(self, query: t.Union[exp.Expression, str]) -> PySparkDataFrame:
        return t.cast(PySparkDataFrame, self._fetch_native_df(query))

    def _insert_overwrite_by_condition(
        self,
        table_name: TableName,
        query_or_df: QueryOrDF,
        where: t.Optional[exp.Condition] = None,
        columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
    ) -> None:
        if isinstance(query_or_df, (pd.DataFrame, PySparkDataFrame)):
            self._insert_pyspark_df(
                table_name, self._ensure_pyspark_df(query_or_df), overwrite=True
            )
        else:
            super()._insert_overwrite_by_condition(table_name, query_or_df, where, columns_to_types)

    def insert_append(
        self,
        table_name: TableName,
        query_or_df: QueryOrDF,
        columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
        contains_json: bool = False,
    ) -> None:
        if isinstance(query_or_df, PySparkDataFrame):
            self._insert_append_pyspark_df(table_name, query_or_df)
        else:
            super().insert_append(table_name, query_or_df, columns_to_types, contains_json)

    def _insert_append_pandas_df(
        self,
        table_name: TableName,
        df: pd.DataFrame,
        columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
    ) -> None:
        self._insert_pyspark_df(table_name, self._ensure_pyspark_df(df), overwrite=False)

    def _insert_append_pyspark_df(
        self,
        table_name: TableName,
        df: PySparkDataFrame,
    ) -> None:
        self._insert_pyspark_df(table_name, df, overwrite=False)

    def _insert_pyspark_df(
        self,
        table_name: TableName,
        df: PySparkDataFrame,
        overwrite: bool = False,
    ) -> None:
        if isinstance(table_name, exp.Table):
            table_name = table_name.sql(dialect=self.dialect)

        df.select(*self.spark.table(table_name).columns).write.insertInto(  # type: ignore
            table_name, overwrite=overwrite
        )

    def _get_data_objects(
        self, schema_name: str, catalog_name: t.Optional[str] = None
    ) -> t.List[DataObject]:
        target = nullsafe_join(".", catalog_name, schema_name)
        df = self.fetch_pyspark_df(f"SHOW TABLE EXTENDED IN {target} LIKE '*'")
        return [
            DataObject(
                catalog=catalog_name,
                schema=schema_name,
                name=row["tableName"],
                type=DataObjectType.VIEW
                if "Type: VIEW" in row["information"]
                else DataObjectType.TABLE,
            )
            for row in df.collect()
        ]
class SparkEngineAdapter(sqlmesh.core.engine_adapter.base_spark.BaseSparkEngineAdapter):

Base class wrapping a Database API-compliant connection.

The EngineAdapter is an easily subclassable interface that interacts with the underlying engine and data store.

Arguments:
  • connection_factory: A callable which produces a new Database API-compliant connection on every call.
  • dialect: The dialect with which this adapter is associated.
  • multithreaded: Indicates whether this adapter will be used by more than one thread.
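
A minimal construction sketch, assuming the adapter is instantiated with the arguments listed above (connection_factory, dialect, multithreaded) and that each connection produced by the factory exposes a spark attribute, which is what the spark property on this adapter reads. The LocalSparkConnection wrapper and its local-mode session are hypothetical stand-ins, not part of sqlmesh:

from pyspark.sql import SparkSession

from sqlmesh.core.engine_adapter.spark import SparkEngineAdapter


class LocalSparkConnection:
    # Hypothetical connection wrapper: the adapter's `spark` property only
    # needs a `spark` attribute on each pooled connection. A real deployment
    # would return a full DB-API connection (cursor(), commit(), close()) so
    # the SQL paths inherited from the base adapter also work.
    def __init__(self) -> None:
        self.spark = SparkSession.builder.master("local[*]").getOrCreate()


# connection_factory is invoked whenever the pool needs a new connection.
adapter = SparkEngineAdapter(lambda: LocalSparkConnection(), "spark")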
def fetchdf(self, query: Union[sqlglot.expressions.Expression, str]) -> pandas.core.frame.DataFrame:
    def fetchdf(self, query: t.Union[exp.Expression, str]) -> pd.DataFrame:
        return self.fetch_pyspark_df(query).toPandas()

Fetches a Pandas DataFrame from the cursor
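
Continuing the construction sketch above, a brief usage example; the table name is hypothetical:

# Runs the query through Spark and materializes the result as pandas.
pandas_df = adapter.fetchdf("SELECT id, ds FROM analytics.events LIMIT 10")
print(pandas_df.dtypes)

Because the result is converted via toPandas(), it is collected onto the driver; for large results, fetch_pyspark_df below keeps the data in Spark.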

def fetch_pyspark_df(self, query: Union[sqlglot.expressions.Expression, str]) -> pyspark.sql.dataframe.DataFrame:
    def fetch_pyspark_df(self, query: t.Union[exp.Expression, str]) -> PySparkDataFrame:
        return t.cast(PySparkDataFrame, self._fetch_native_df(query))

Fetches a PySpark DataFrame from the cursor
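
As above, a hypothetical query against the same adapter; the returned PySpark DataFrame stays lazy until an action such as show() or count() is invoked:

spark_df = adapter.fetch_pyspark_df("SELECT id, ds FROM analytics.events")
spark_df.show(5)  # triggers execution in Spark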

def insert_append(self, table_name: TableName, query_or_df: QueryOrDF, columns_to_types: Optional[Dict[str, sqlglot.expressions.DataType]] = None, contains_json: bool = False) -> None:
    def insert_append(
        self,
        table_name: TableName,
        query_or_df: QueryOrDF,
        columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
        contains_json: bool = False,
    ) -> None:
        if isinstance(query_or_df, PySparkDataFrame):
            self._insert_append_pyspark_df(table_name, query_or_df)
        else:
            super().insert_append(table_name, query_or_df, columns_to_types, contains_json)
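
Per the source above, a PySpark DataFrame takes the insertInto path (the DataFrame's columns are aligned to the target table's), while other inputs are delegated to the base insert_append. A small sketch, assuming the target table analytics.events already exists with columns id and ds:

# Build a PySpark DataFrame from the adapter's own Spark session.
rows = adapter.spark.createDataFrame(
    [(1, "2023-01-01"), (2, "2023-01-02")], ["id", "ds"]
)
adapter.insert_append("analytics.events", rows)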