sqlmesh.core.engine_adapter.bigquery
from __future__ import annotations

import typing as t
import uuid

import pandas as pd
from sqlglot import exp
from sqlglot.transforms import remove_precision_parameterized_types

from sqlmesh.core.engine_adapter._typing import DF_TYPES, Query
from sqlmesh.core.engine_adapter.base import EngineAdapter
from sqlmesh.core.engine_adapter.shared import (
    DataObject,
    DataObjectType,
    TransactionType,
)
from sqlmesh.core.model.meta import IntervalUnit
from sqlmesh.utils.date import to_datetime
from sqlmesh.utils.errors import SQLMeshError

if t.TYPE_CHECKING:
    from google.cloud.bigquery.client import Client as BigQueryClient
    from google.cloud.bigquery.client import Connection as BigQueryConnection
    from google.cloud.bigquery.job.base import _AsyncJob as BigQueryQueryResult
    from google.cloud.bigquery.table import Table as BigQueryTable

    from sqlmesh.core._typing import TableName
    from sqlmesh.core.engine_adapter._typing import DF, QueryOrDF


class BigQueryEngineAdapter(EngineAdapter):
    DIALECT = "bigquery"
    DEFAULT_BATCH_SIZE = 1000
    ESCAPE_JSON = True

    @property
    def client(self) -> BigQueryClient:
        return self.cursor.connection._client

    @property
    def connection(self) -> BigQueryConnection:
        return self.cursor.connection

    def create_schema(self, schema_name: str, ignore_if_exists: bool = True) -> None:
        """Create a schema from a name or qualified table name."""
        from google.cloud.bigquery.dbapi.exceptions import DatabaseError

        try:
            super().create_schema(schema_name, ignore_if_exists=ignore_if_exists)
        except DatabaseError as e:
            for arg in e.args:
                if ignore_if_exists and "Already Exists: " in arg.message:
                    return
            raise e

    def columns(self, table_name: TableName) -> t.Dict[str, str]:
        """Fetches column names and types for the target table."""
        table = self._get_table(table_name)
        return {field.name: field.field_type for field in table.schema}

    def __load_pandas_to_temp_table(
        self,
        table: TableName,
        df: pd.DataFrame,
        columns_to_types: t.Dict[str, exp.DataType],
    ) -> t.Tuple[BigQueryQueryResult, str]:
        """
        Loads a pandas dataframe into a temporary table in BigQuery. Returns the result of the load and the name of the
        temporary table. The temporary table will be deleted after 3 hours.
        """
        from google.cloud import bigquery

        table = exp.to_table(table)
        precisionless_col_to_types = {
            col_name: remove_precision_parameterized_types(col_type)
            for col_name, col_type in columns_to_types.items()
        }
        temp_table_name = f"{self.client.project}.{table.db}.__temp_{table.name}_{uuid.uuid4().hex}"
        schema = [
            bigquery.SchemaField(col_name, col_type.sql(dialect=self.dialect))
            for col_name, col_type in precisionless_col_to_types.items()
        ]
        bq_table = bigquery.Table(table_ref=temp_table_name, schema=schema)
        bq_table.expires = to_datetime("in 3 hours")
        self.client.create_table(bq_table)
        result = self.client.load_table_from_dataframe(df, bq_table).result()
        if result.errors:
            raise SQLMeshError(result.errors)
        return result, temp_table_name

    def _insert_overwrite_by_condition(
        self,
        table_name: TableName,
        query_or_df: QueryOrDF,
        where: t.Optional[exp.Condition] = None,
        columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
    ) -> None:
        """
        BigQuery does not directly support `INSERT OVERWRITE`, but it does support `MERGE` with a `False`
        condition and a delete that mimics an `INSERT OVERWRITE`. Based on documentation this should have the
        same runtime performance as `INSERT OVERWRITE`.

        If a Pandas DataFrame is provided, it will be loaded into a temporary table and then merged with the
        target table. This temporary table is deleted after the merge is complete or after its expiration time has
        passed.
        """
        table = exp.to_table(table_name)
        query: t.Union[Query, exp.Select]
        is_pandas = isinstance(query_or_df, pd.DataFrame)
        temp_table_name: t.Optional[str] = None
        if isinstance(query_or_df, DF_TYPES):
            if not is_pandas:
                raise SQLMeshError("BigQuery only supports pandas DataFrames")
            if columns_to_types is None:
                raise SQLMeshError("columns_to_types must be provided when using Pandas DataFrames")
            if table.db is None:
                raise SQLMeshError("table_name must be qualified when using Pandas DataFrames")
            query_or_df = t.cast(pd.DataFrame, query_or_df)
            result, temp_table_name = self.__load_pandas_to_temp_table(
                table, query_or_df, columns_to_types
            )
            if result.errors:
                raise SQLMeshError(result.errors)
            query = exp.select(*columns_to_types).from_(exp.to_table(temp_table_name))
        else:
            query = t.cast(Query, query_or_df)
        columns = [
            exp.to_column(col)
            for col in (columns_to_types or [col.alias_or_name for col in query.expressions])
        ]
        when_not_matched_by_source = exp.When(
            matched=False,
            source=True,
            condition=where,
            then=exp.Delete(),
        )
        when_not_matched_by_target = exp.When(
            matched=False,
            source=False,
            then=exp.Insert(
                this=exp.Tuple(expressions=columns),
                expression=exp.Tuple(expressions=columns),
            ),
        )
        self._merge(
            target_table=table,
            source_table=query,
            on=exp.false(),
            match_expressions=[when_not_matched_by_source, when_not_matched_by_target],
        )
        if is_pandas:
            assert temp_table_name is not None
            self.drop_table(temp_table_name)

    def table_exists(self, table_name: TableName) -> bool:
        from google.cloud.exceptions import NotFound

        try:
            self._get_table(table_name)
            return True
        except NotFound:
            return False

    def _get_table(self, table_name: TableName) -> BigQueryTable:
        """
        Returns a BigQueryTable object for the given table name.

        Raises: `google.cloud.exceptions.NotFound` if the table does not exist.
        """
        if isinstance(table_name, exp.Table):
            table_name = table_name.sql(dialect=self.dialect)

        return self.client.get_table(table_name)

    def _fetch_native_df(self, query: t.Union[exp.Expression, str]) -> DF:
        self.execute(query)
        return self.cursor._query_job.to_dataframe()

    def _create_table_properties(
        self,
        storage_format: t.Optional[str] = None,
        partitioned_by: t.Optional[t.List[str]] = None,
        partition_interval_unit: t.Optional[IntervalUnit] = None,
    ) -> t.Optional[exp.Properties]:
        if not partitioned_by:
            return None
        if partition_interval_unit is None:
            raise SQLMeshError("partition_interval_unit is required when partitioning a table")
        if partition_interval_unit == IntervalUnit.MINUTE:
            raise SQLMeshError("BigQuery does not support partitioning by minute")
        if len(partitioned_by) > 1:
            raise SQLMeshError("BigQuery only supports partitioning by a single column")
        partition_col = exp.to_column(partitioned_by[0])
        this: t.Union[exp.Func, exp.Column]
        if partition_interval_unit == IntervalUnit.HOUR:
            this = exp.func(
                "TIMESTAMP_TRUNC",
                partition_col,
                exp.var(IntervalUnit.HOUR.value.upper()),
                dialect=self.dialect,
            )
        else:
            this = partition_col

        partition_columns_property = exp.PartitionedByProperty(this=this)
        return exp.Properties(expressions=[partition_columns_property])

    def create_state_table(
        self,
        table_name: str,
        columns_to_types: t.Dict[str, exp.DataType],
        primary_key: t.Optional[t.Tuple[str, ...]] = None,
    ) -> None:
        self.create_table(
            table_name,
            columns_to_types,
        )

    def supports_transactions(self, transaction_type: TransactionType) -> bool:
        return False

    def _get_data_objects(
        self, schema_name: str, catalog_name: t.Optional[str] = None
    ) -> t.List[DataObject]:
        """
        Returns all the data objects that exist in the given schema and optionally catalog.
        """
        from google.cloud.bigquery import DatasetReference

        dataset_ref = DatasetReference(
            project=catalog_name or self.client.project, dataset_id=schema_name
        )
        all_tables = self.client.list_tables(dataset_ref)
        return [
            DataObject(
                catalog=table.project,
                schema=table.dataset_id,
                name=table.table_id,
                type=DataObjectType.from_str(table.table_type),
            )
            for table in all_tables
        ]
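The `_insert_overwrite_by_condition` docstring above describes emulating `INSERT OVERWRITE` with a `MERGE` whose `ON` condition is always false: rows matching the `where` condition are deleted and the incoming rows are inserted in a single statement. The following is a minimal sketch of the statement shape this produces; the table, column, and temporary-table names are hypothetical, and the round trip through sqlglot is only there to confirm the statement parses in the BigQuery dialect.

# Hypothetical illustration of the MERGE that emulates INSERT OVERWRITE.
from sqlglot import parse_one

merge_sql = """
MERGE INTO my_project.my_dataset.target AS target
USING (SELECT id, ds, value FROM my_project.my_dataset.__temp_target_abc123) AS source
ON FALSE
WHEN NOT MATCHED BY SOURCE AND ds BETWEEN '2023-01-01' AND '2023-01-31' THEN DELETE
WHEN NOT MATCHED THEN INSERT (id, ds, value) VALUES (id, ds, value)
"""

# Print the normalized BigQuery SQL produced by sqlglot.
print(parse_one(merge_sql, read="bigquery").sql(dialect="bigquery", pretty=True))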
class BigQueryEngineAdapter(EngineAdapter):
Base class wrapping a Database API-compliant connection.
The EngineAdapter is an easily subclassable interface that interacts with the underlying engine and data store. A construction sketch follows the argument list below.
Arguments:
- connection_factory: A callable which produces a new Database API-compliant connection on every call.
- dialect: The dialect with which this adapter is associated.
- multithreaded: Indicates whether this adapter will be used by more than one thread.
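A minimal construction sketch, assuming only the constructor arguments documented above; the project name is hypothetical and the DB-API connection comes from the google-cloud-bigquery client library.

from google.cloud import bigquery
from google.cloud.bigquery import dbapi

from sqlmesh.core.engine_adapter.bigquery import BigQueryEngineAdapter

def connection_factory() -> dbapi.Connection:
    # Each call returns a fresh DB-API connection wrapping a BigQuery client.
    return dbapi.connect(bigquery.Client(project="my-project"))

adapter = BigQueryEngineAdapter(connection_factory, dialect="bigquery")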
def
create_schema(self, schema_name: str, ignore_if_exists: bool = True) -> None:
    def create_schema(self, schema_name: str, ignore_if_exists: bool = True) -> None:
        """Create a schema from a name or qualified table name."""
        from google.cloud.bigquery.dbapi.exceptions import DatabaseError

        try:
            super().create_schema(schema_name, ignore_if_exists=ignore_if_exists)
        except DatabaseError as e:
            for arg in e.args:
                if ignore_if_exists and "Already Exists: " in arg.message:
                    return
            raise e
Create a schema from a name or qualified table name.
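A brief usage sketch, reusing the hypothetical `adapter` from the construction example above; the dataset name is illustrative.

# The second call is a no-op: the "Already Exists" DatabaseError is swallowed
# because ignore_if_exists defaults to True.
adapter.create_schema("my_dataset")
adapter.create_schema("my_dataset")

# With ignore_if_exists=False the underlying DatabaseError would propagate instead.
adapter.create_schema("my_dataset", ignore_if_exists=False)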
def
columns(self, table_name: TableName) -> Dict[str, str]:
    def columns(self, table_name: TableName) -> t.Dict[str, str]:
        """Fetches column names and types for the target table."""
        table = self._get_table(table_name)
        return {field.name: field.field_type for field in table.schema}
Fetches column names and types for the target table.
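The result maps each column name to its BigQuery field type string, taken from the table's schema. A hypothetical example:

cols = adapter.columns("my-project.my_dataset.my_table")
# Expected shape: {"id": "INTEGER", "ds": "DATE", "value": "FLOAT"}
print(cols)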
def
create_state_table(self, table_name: str, columns_to_types: Dict[str, sqlglot.expressions.DataType], primary_key: Optional[Tuple[str, ...]] = None) -> None:
    def create_state_table(
        self,
        table_name: str,
        columns_to_types: t.Dict[str, exp.DataType],
        primary_key: t.Optional[t.Tuple[str, ...]] = None,
    ) -> None:
        self.create_table(
            table_name,
            columns_to_types,
        )
Create a table to store SQLMesh internal state. A usage sketch follows the argument list below.
Arguments:
- table_name: The name of the table to create. Can be fully qualified or just table name.
- columns_to_types: A mapping between the column name and its data type.
- primary_key: Determines the table primary key.
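As the source above shows, the BigQuery implementation simply delegates to create_table and does not apply primary_key. A usage sketch with hypothetical names:

from sqlglot import exp

adapter.create_state_table(
    "my_dataset._snapshots",
    {
        "name": exp.DataType.build("text"),
        "snapshot": exp.DataType.build("text"),
    },
    primary_key=("name",),  # accepted for interface compatibility, not enforced on BigQuery
)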
def
supports_transactions(self, transaction_type: sqlmesh.core.engine_adapter.shared.TransactionType) -> bool:
Whether or not the engine adapter supports transactions for the given transaction type.
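Because this always returns False for BigQuery, SQLMesh executes both DDL and DML against it outside of transactions. A small sketch, assuming the DDL and DML members of TransactionType:

from sqlmesh.core.engine_adapter.shared import TransactionType

# Always False for BigQuery, regardless of the transaction type requested.
assert adapter.supports_transactions(TransactionType.DDL) is False
assert adapter.supports_transactions(TransactionType.DML) is False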
Inherited Members
- sqlmesh.core.engine_adapter.base.EngineAdapter
- EngineAdapter
- recycle
- close
- replace_query
- create_index
- create_table
- create_table_like
- drop_table
- alter_table
- create_view
- drop_schema
- drop_view
- delete_from
- insert_append
- insert_overwrite_by_time_partition
- update_table
- merge
- rename_table
- fetchone
- fetchall
- fetchdf
- fetch_pyspark_df
- transaction
- execute