
sqlmesh.core.engine_adapter.bigquery

  1from __future__ import annotations
  2
  3import typing as t
  4import uuid
  5
  6import pandas as pd
  7from sqlglot import exp
  8from sqlglot.transforms import remove_precision_parameterized_types
  9
 10from sqlmesh.core.engine_adapter._typing import DF_TYPES, Query
 11from sqlmesh.core.engine_adapter.base import EngineAdapter
 12from sqlmesh.core.engine_adapter.shared import (
 13    DataObject,
 14    DataObjectType,
 15    TransactionType,
 16)
 17from sqlmesh.core.model.meta import IntervalUnit
 18from sqlmesh.utils.date import to_datetime
 19from sqlmesh.utils.errors import SQLMeshError
 20
 21if t.TYPE_CHECKING:
 22    from google.cloud.bigquery.client import Client as BigQueryClient
 23    from google.cloud.bigquery.client import Connection as BigQueryConnection
 24    from google.cloud.bigquery.job.base import _AsyncJob as BigQueryQueryResult
 25    from google.cloud.bigquery.table import Table as BigQueryTable
 26
 27    from sqlmesh.core._typing import TableName
 28    from sqlmesh.core.engine_adapter._typing import DF, QueryOrDF
 29
 30
 31class BigQueryEngineAdapter(EngineAdapter):
 32    DIALECT = "bigquery"
 33    DEFAULT_BATCH_SIZE = 1000
 34    ESCAPE_JSON = True
 35
 36    @property
 37    def client(self) -> BigQueryClient:
 38        return self.cursor.connection._client
 39
 40    @property
 41    def connection(self) -> BigQueryConnection:
 42        return self.cursor.connection
 43
 44    def create_schema(self, schema_name: str, ignore_if_exists: bool = True) -> None:
 45        """Create a schema from a name or qualified table name."""
 46        from google.cloud.bigquery.dbapi.exceptions import DatabaseError
 47
 48        try:
 49            super().create_schema(schema_name, ignore_if_exists=ignore_if_exists)
 50        except DatabaseError as e:
 51            for arg in e.args:
 52                if ignore_if_exists and "Already Exists: " in arg.message:
 53                    return
 54            raise e
 55
 56    def columns(self, table_name: TableName) -> t.Dict[str, str]:
 57        """Fetches column names and types for the target table."""
 58        table = self._get_table(table_name)
 59        return {field.name: field.field_type for field in table.schema}
 60
 61    def __load_pandas_to_temp_table(
 62        self,
 63        table: TableName,
 64        df: pd.DataFrame,
 65        columns_to_types: t.Dict[str, exp.DataType],
 66    ) -> t.Tuple[BigQueryQueryResult, str]:
 67        """
 68        Loads a pandas dataframe into a temporary table in BigQuery. Returns the result of the load and the name of the
 69        temporary table. The temporary table will be deleted after 3 hours.
 70        """
 71        from google.cloud import bigquery
 72
 73        table = exp.to_table(table)
 74        precisionless_col_to_types = {
 75            col_name: remove_precision_parameterized_types(col_type)
 76            for col_name, col_type in columns_to_types.items()
 77        }
 78        temp_table_name = f"{self.client.project}.{table.db}.__temp_{table.name}_{uuid.uuid4().hex}"
 79        schema = [
 80            bigquery.SchemaField(col_name, col_type.sql(dialect=self.dialect))
 81            for col_name, col_type in precisionless_col_to_types.items()
 82        ]
 83        bq_table = bigquery.Table(table_ref=temp_table_name, schema=schema)
 84        bq_table.expires = to_datetime("in 3 hours")
 85        self.client.create_table(bq_table)
 86        result = self.client.load_table_from_dataframe(df, bq_table).result()
 87        if result.errors:
 88            raise SQLMeshError(result.errors)
 89        return result, temp_table_name
 90
 91    def _insert_overwrite_by_condition(
 92        self,
 93        table_name: TableName,
 94        query_or_df: QueryOrDF,
 95        where: t.Optional[exp.Condition] = None,
 96        columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
 97    ) -> None:
 98        """
 99        BigQuery does not directly support `INSERT OVERWRITE`, but it does support a `MERGE` with a `False`
100        condition and a delete clause that together mimic an `INSERT OVERWRITE`. Based on the documentation, this
101        should have the same runtime performance as `INSERT OVERWRITE`.
102
103        If a Pandas DataFrame is provided, it will be loaded into a temporary table and then merged with the
104        target table. This temporary table is deleted after the merge is complete or after its expiration time has
105        passed.
106        """
107        table = exp.to_table(table_name)
108        query: t.Union[Query, exp.Select]
109        is_pandas = isinstance(query_or_df, pd.DataFrame)
110        temp_table_name: t.Optional[str] = None
111        if isinstance(query_or_df, DF_TYPES):
112            if not is_pandas:
113                raise SQLMeshError("BigQuery only supports pandas DataFrames")
114            if columns_to_types is None:
115                raise SQLMeshError("columns_to_types must be provided when using Pandas DataFrames")
116            if table.db is None:
117                raise SQLMeshError("table_name must be qualified when using Pandas DataFrames")
118            query_or_df = t.cast(pd.DataFrame, query_or_df)
119            result, temp_table_name = self.__load_pandas_to_temp_table(
120                table, query_or_df, columns_to_types
121            )
122            if result.errors:
123                raise SQLMeshError(result.errors)
124            query = exp.select(*columns_to_types).from_(exp.to_table(temp_table_name))
125        else:
126            query = t.cast(Query, query_or_df)
127        columns = [
128            exp.to_column(col)
129            for col in (columns_to_types or [col.alias_or_name for col in query.expressions])
130        ]
131        when_not_matched_by_source = exp.When(
132            matched=False,
133            source=True,
134            condition=where,
135            then=exp.Delete(),
136        )
137        when_not_matched_by_target = exp.When(
138            matched=False,
139            source=False,
140            then=exp.Insert(
141                this=exp.Tuple(expressions=columns),
142                expression=exp.Tuple(expressions=columns),
143            ),
144        )
145        self._merge(
146            target_table=table,
147            source_table=query,
148            on=exp.false(),
149            match_expressions=[when_not_matched_by_source, when_not_matched_by_target],
150        )
151        if is_pandas:
152            assert temp_table_name is not None
153            self.drop_table(temp_table_name)
154
155    def table_exists(self, table_name: TableName) -> bool:
156        from google.cloud.exceptions import NotFound
157
158        try:
159            self._get_table(table_name)
160            return True
161        except NotFound:
162            return False
163
164    def _get_table(self, table_name: TableName) -> BigQueryTable:
165        """
166        Returns a BigQueryTable object for the given table name.
167
168        Raises: `google.cloud.exceptions.NotFound` if the table does not exist.
169        """
170        if isinstance(table_name, exp.Table):
171            table_name = table_name.sql(dialect=self.dialect)
172
173        return self.client.get_table(table_name)
174
175    def _fetch_native_df(self, query: t.Union[exp.Expression, str]) -> DF:
176        self.execute(query)
177        return self.cursor._query_job.to_dataframe()
178
179    def _create_table_properties(
180        self,
181        storage_format: t.Optional[str] = None,
182        partitioned_by: t.Optional[t.List[str]] = None,
183        partition_interval_unit: t.Optional[IntervalUnit] = None,
184    ) -> t.Optional[exp.Properties]:
185        if not partitioned_by:
186            return None
187        if partition_interval_unit is None:
188            raise SQLMeshError("partition_interval_unit is required when partitioning a table")
189        if partition_interval_unit == IntervalUnit.MINUTE:
190            raise SQLMeshError("BigQuery does not support partitioning by minute")
191        if len(partitioned_by) > 1:
192            raise SQLMeshError("BigQuery only supports partitioning by a single column")
193        partition_col = exp.to_column(partitioned_by[0])
194        this: t.Union[exp.Func, exp.Column]
195        if partition_interval_unit == IntervalUnit.HOUR:
196            this = exp.func(
197                "TIMESTAMP_TRUNC",
198                partition_col,
199                exp.var(IntervalUnit.HOUR.value.upper()),
200                dialect=self.dialect,
201            )
202        else:
203            this = partition_col
204
205        partition_columns_property = exp.PartitionedByProperty(this=this)
206        return exp.Properties(expressions=[partition_columns_property])
207
208    def create_state_table(
209        self,
210        table_name: str,
211        columns_to_types: t.Dict[str, exp.DataType],
212        primary_key: t.Optional[t.Tuple[str, ...]] = None,
213    ) -> None:
214        self.create_table(
215            table_name,
216            columns_to_types,
217        )
218
219    def supports_transactions(self, transaction_type: TransactionType) -> bool:
220        return False
221
222    def _get_data_objects(
223        self, schema_name: str, catalog_name: t.Optional[str] = None
224    ) -> t.List[DataObject]:
225        """
226        Returns all the data objects that exist in the given schema and optionally catalog.
227        """
228        from google.cloud.bigquery import DatasetReference
229
230        dataset_ref = DatasetReference(
231            project=catalog_name or self.client.project, dataset_id=schema_name
232        )
233        all_tables = self.client.list_tables(dataset_ref)
234        return [
235            DataObject(
236                catalog=table.project,
237                schema=table.dataset_id,
238                name=table.table_id,
239                type=DataObjectType.from_str(table.table_type),
240            )
241            for table in all_tables
242        ]
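
The docstring of `_insert_overwrite_by_condition` above describes the MERGE-based emulation of `INSERT OVERWRITE`: an always-false `ON` condition means every source row falls into the `WHEN NOT MATCHED` branch and is inserted, while the `WHEN NOT MATCHED BY SOURCE` branch deletes the target rows selected by `where`. The sketch below, with hypothetical table, column, and date values, shows the general shape of the statement; the exact SQL emitted by `self._merge` (aliases, quoting, formatting) may differ.

# Illustrative only -- a rough sketch of the statement built for a query source.
merge_sketch = """
MERGE `my-project.analytics.events` AS target
USING (SELECT event_id, event_ts FROM `my-project.analytics.events_staging`) AS source
ON FALSE
WHEN NOT MATCHED BY SOURCE AND event_ts BETWEEN '2023-01-01' AND '2023-01-02' THEN DELETE
WHEN NOT MATCHED THEN INSERT (event_id, event_ts) VALUES (event_id, event_ts)
"""
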
class BigQueryEngineAdapter(sqlmesh.core.engine_adapter.base.EngineAdapter):

Base class wrapping a Database API-compliant connection.

The EngineAdapter is an easily subclassable interface that interacts with the underlying engine and data store.

Arguments:
  • connection_factory: a callable which produces a new Database API-compliant connection on every call (see the construction sketch below).
  • dialect: The dialect with which this adapter is associated.
  • multithreaded: Indicates whether this adapter will be used by more than one thread.
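
A minimal construction sketch, assuming the DB-API helpers shipped with the `google-cloud-bigquery` package; the project ID is hypothetical, and the exact constructor arguments accepted by the adapter may differ from what is shown here.

from google.cloud import bigquery
from google.cloud.bigquery import dbapi

from sqlmesh.core.engine_adapter.bigquery import BigQueryEngineAdapter


def connection_factory():
    # Each call returns a fresh DB-API connection backed by a BigQuery client.
    return dbapi.connect(bigquery.Client(project="my-project"))  # hypothetical project ID


adapter = BigQueryEngineAdapter(connection_factory, dialect="bigquery")
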
def create_schema(self, schema_name: str, ignore_if_exists: bool = True) -> None:
45    def create_schema(self, schema_name: str, ignore_if_exists: bool = True) -> None:
46        """Create a schema from a name or qualified table name."""
47        from google.cloud.bigquery.dbapi.exceptions import DatabaseError
48
49        try:
50            super().create_schema(schema_name, ignore_if_exists=ignore_if_exists)
51        except DatabaseError as e:
52            for arg in e.args:
53                if ignore_if_exists and "Already Exists: " in arg.message:
54                    return
55            raise e

Create a schema from a name or qualified table name.
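
For example, with `adapter` constructed as in the sketch above and a hypothetical dataset name:

adapter.create_schema("analytics")                           # succeeds even if the dataset already exists
adapter.create_schema("analytics", ignore_if_exists=False)   # the "Already Exists" error propagates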

def columns(self, table_name: TableName) -> Dict[str, str]:
57    def columns(self, table_name: TableName) -> t.Dict[str, str]:
58        """Fetches column names and types for the target table."""
59        table = self._get_table(table_name)
60        return {field.name: field.field_type for field in table.schema}

Fetches column names and types for the target table.
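
For example, with a hypothetical fully-qualified table name; the values are BigQuery field types, as returned by `field.field_type`:

adapter.columns("my-project.analytics.events")
# e.g. {"event_id": "STRING", "event_ts": "TIMESTAMP"}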

def table_exists(self, table_name: TableName) -> bool:
156    def table_exists(self, table_name: TableName) -> bool:
157        from google.cloud.exceptions import NotFound
158
159        try:
160            self._get_table(table_name)
161            return True
162        except NotFound:
163            return False
def create_state_table(self, table_name: str, columns_to_types: Dict[str, sqlglot.expressions.DataType], primary_key: Optional[Tuple[str, ...]] = None) -> None:
209    def create_state_table(
210        self,
211        table_name: str,
212        columns_to_types: t.Dict[str, exp.DataType],
213        primary_key: t.Optional[t.Tuple[str, ...]] = None,
214    ) -> None:
215        self.create_table(
216            table_name,
217            columns_to_types,
218        )

Create a table to store SQLMesh internal state; a usage sketch follows the argument list below.

Arguments:
  • table_name: The name of the table to create. Can be fully qualified or just table name.
  • columns_to_types: A mapping between the column name and its data type.
  • primary_key: Determines the table primary key.
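
A usage sketch with hypothetical table and column names; as the source above shows, this adapter does not pass `primary_key` through to `create_table`.

from sqlglot import exp

adapter.create_state_table(
    "sqlmesh_state.snapshots",  # hypothetical state table name
    {
        "name": exp.DataType.build("text"),
        "payload": exp.DataType.build("text"),
    },
)
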
def supports_transactions(self, transaction_type: sqlmesh.core.engine_adapter.shared.TransactionType) -> bool:
220    def supports_transactions(self, transaction_type: TransactionType) -> bool:
221        return False

Whether or not the engine adapter supports transactions for the given transaction type.
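
For example, assuming `TransactionType` exposes `DDL` and `DML` members in `sqlmesh.core.engine_adapter.shared`:

from sqlmesh.core.engine_adapter.shared import TransactionType

adapter.supports_transactions(TransactionType.DDL)  # False
adapter.supports_transactions(TransactionType.DML)  # False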