sqlmesh.core.engine_adapter.redshift
```python
from __future__ import annotations

import typing as t
import uuid

import pandas as pd
from sqlglot import exp

from sqlmesh.core.dialect import pandas_to_sql
from sqlmesh.core.engine_adapter._typing import DF_TYPES, Query
from sqlmesh.core.engine_adapter.base import EngineAdapter
from sqlmesh.core.engine_adapter.shared import DataObject

if t.TYPE_CHECKING:
    from sqlmesh.core._typing import TableName
    from sqlmesh.core.engine_adapter._typing import QueryOrDF


class RedshiftEngineAdapter(EngineAdapter):
    DIALECT = "redshift"
    DEFAULT_BATCH_SIZE = 1000

    @property
    def cursor(self) -> t.Any:
        connection = self._connection_pool.get()
        # The SQLMesh implementation relies on autocommit being set to True.
        connection.autocommit = True
        cursor = self._connection_pool.get_cursor()
        # Redshift by default uses a `format` paramstyle that has issues when we try to write
        # our snapshot data to the snapshot table. There doesn't seem to be a way to disable
        # parameter overriding, so we just set it to `qmark` since that doesn't cause issues.
        cursor.paramstyle = "qmark"
        return cursor

    def create_view(
        self,
        view_name: TableName,
        query_or_df: QueryOrDF,
        columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
        replace: bool = True,
        **create_kwargs: t.Any,
    ) -> None:
        """
        Redshift doesn't support `VALUES` expressions outside of an `INSERT` statement, and sqlglot
        currently cannot performantly convert a `VALUES` expression into a series of `UNION ALL`
        statements. Therefore we don't support DataFrame-backed views for Redshift until sqlglot
        supports large union statements performantly.

        Also, Redshift views are "binding" by default to their underlying tables, which means you
        can't drop an underlying table without first dropping the view. This is a problem for us
        since we want to be able to swap tables out from under views, so we create the view as
        non-binding.
        """
        if isinstance(query_or_df, DF_TYPES):
            raise NotImplementedError(
                "DataFrames are not supported for Redshift views because Redshift doesn't "
                "support using `VALUES` in a `CREATE VIEW` statement."
            )
        return super().create_view(
            view_name, query_or_df, columns_to_types, replace, no_schema_binding=True
        )

    def _fetch_native_df(self, query: t.Union[exp.Expression, str]) -> pd.DataFrame:
        """Fetches a Pandas DataFrame from the cursor."""
        self.execute(query)
        return self.cursor.fetch_dataframe()

    def _create_table_from_query(
        self,
        table_name: TableName,
        query: Query,
        exists: bool = True,
        **kwargs: t.Any,
    ) -> t.Optional[exp.Create]:
        """
        Redshift doesn't support `CREATE TABLE IF NOT EXISTS ... AS ...` but does support
        `CREATE TABLE AS ...`, so when no existence check is requested we can use the base
        implementation. Otherwise we check for the table manually: if it exists this call is a
        no-op, and if it doesn't we re-run with `exists=False` since we just confirmed the
        table is absent.
        """
        if not exists:
            return super()._create_table_from_query(table_name, query, exists, **kwargs)
        if self.table_exists(table_name):
            return None
        return self._create_table_from_query(table_name, query, exists=False, **kwargs)

    @classmethod
    def _pandas_to_sql(
        cls,
        df: pd.DataFrame,
        columns_to_types: t.Dict[str, exp.DataType],
        batch_size: int = 0,
        alias: str = "t",
    ) -> t.Generator[exp.Select, None, None]:
        """
        Extracts the `VALUES` expression from each generated SELECT statement and removes its alias.
        """
        for expression in pandas_to_sql(df, columns_to_types, batch_size, alias):
            values_expression = t.cast(exp.Select, expression.find(exp.Values))
            values_expression.parent = None
            values_expression.set("alias", None)
            yield values_expression

    def replace_query(
        self,
        table_name: TableName,
        query_or_df: QueryOrDF,
        columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
    ) -> None:
        """
        Redshift doesn't support `CREATE OR REPLACE TABLE ...` with a `VALUES` expression, so we
        need to handle DataFrame replacements specially.

        If the table doesn't exist, we just create it and load it with insert statements.
        If it does exist, we do the `CREATE TABLE ...`, `INSERT INTO ...`, `RENAME TABLE ...`,
        `RENAME TABLE ...`, `DROP TABLE ...` dance.
        """
        if not isinstance(query_or_df, pd.DataFrame):
            return super().replace_query(table_name, query_or_df, columns_to_types)
        if not columns_to_types:
            raise ValueError("columns_to_types must be provided for dataframes")
        target_table = exp.to_table(table_name)
        target_exists = self.table_exists(target_table)
        if target_exists:
            with self.transaction():
                temp_table_name = f"{target_table.alias_or_name}_temp_{self._short_hash()}"
                temp_table = target_table.copy()
                temp_table.set("this", exp.to_identifier(temp_table_name))
                old_table_name = f"{target_table.alias_or_name}_old_{self._short_hash()}"
                old_table = target_table.copy()
                old_table.set("this", exp.to_identifier(old_table_name))
                self.create_table(temp_table, columns_to_types, exists=False)
                for expression in self._pandas_to_sql(
                    query_or_df, columns_to_types, self.DEFAULT_BATCH_SIZE
                ):
                    self._insert_append_query(temp_table, expression, columns_to_types)
                self.rename_table(target_table, old_table)
                self.rename_table(temp_table, target_table)
                self.drop_table(old_table)
        else:
            self.create_table(target_table, columns_to_types, exists=False)
            for expression in self._pandas_to_sql(
                query_or_df, columns_to_types, self.DEFAULT_BATCH_SIZE
            ):
                self._insert_append_query(target_table, expression, columns_to_types)

    def _short_hash(self) -> str:
        return uuid.uuid4().hex[:8]

    def table_exists(self, table_name: TableName) -> bool:
        """
        Redshift doesn't support DESCRIBE, so this mirrors what the Redshift cursor does to check
        whether a table exists. We don't call the cursor's check directly because we still want
        all execution to go through our `execute` method.

        Reference: https://github.com/aws/amazon-redshift-python-driver/blob/master/redshift_connector/cursor.py#L528-L553
        """
        table = exp.to_table(table_name)

        # Redshift doesn't support catalogs
        if table.args.get("catalog"):
            return False

        q: str = (
            f"SELECT 1 FROM information_schema.tables WHERE table_name = '{table.alias_or_name}'"
        )
        database_name = table.args.get("db")
        if database_name:
            q += f" AND table_schema = '{database_name}'"

        self.execute(q)

        result = self.cursor.fetchone()

        return result[0] == 1 if result is not None else False

    def _get_data_objects(
        self, schema_name: str, catalog_name: t.Optional[str] = None
    ) -> t.List[DataObject]:
        """
        Returns all the data objects that exist in the given schema and optionally catalog.
        """
        catalog_name = f"'{catalog_name}'" if catalog_name else "NULL"
        query = f"""
            SELECT
                {catalog_name} AS catalog_name,
                tablename AS name,
                schemaname AS schema_name,
                'TABLE' AS type
            FROM pg_tables
            WHERE schemaname ILIKE '{schema_name}'
            UNION ALL
            SELECT
                {catalog_name} AS catalog_name,
                viewname AS name,
                schemaname AS schema_name,
                'VIEW' AS type
            FROM pg_views
            WHERE schemaname ILIKE '{schema_name}'
        """
        df = self.fetchdf(query)
        return [
            DataObject(
                catalog=row.catalog_name, schema=row.schema_name, name=row.name, type=row.type  # type: ignore
            )
            for row in df.itertuples()
        ]
```
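The `_pandas_to_sql` override above is what makes DataFrame writes work on Redshift: the shared `pandas_to_sql` helper yields `SELECT ... FROM (VALUES ...) AS t(...)` expressions, and the override keeps only the bare `VALUES` node so it can sit directly inside an `INSERT`. A minimal sketch of that extraction (illustrative data; assumes the helper's defaults match the override's signature):

```python
import pandas as pd
from sqlglot import exp

from sqlmesh.core.dialect import pandas_to_sql

df = pd.DataFrame({"id": [1, 2]})
columns_to_types = {"id": exp.DataType.build("int")}

for select in pandas_to_sql(df, columns_to_types, batch_size=0, alias="t"):
    # Detach the VALUES node from its parent SELECT and drop the alias so the
    # result can be rendered as INSERT INTO ... VALUES (...).
    values = select.find(exp.Values)
    values.parent = None
    values.set("alias", None)
    print(values.sql(dialect="redshift"))  # e.g. VALUES (CAST(1 AS INT)), (2)
```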
```python
class RedshiftEngineAdapter(EngineAdapter):
```
Base class wrapping a Database API-compliant connection.

The EngineAdapter is an easily subclassable interface that interacts with the underlying engine and data store.
Arguments:
- connection_factory: A callable that produces a new Database API-compliant connection on every call.
- dialect: The dialect with which this adapter is associated.
- multithreaded: Indicates whether this adapter will be used by more than one thread.
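For orientation, a minimal construction sketch, assuming the `redshift_connector` driver (whose cursor supplies the `fetch_dataframe` call used by `_fetch_native_df`) and the constructor arguments listed above; all connection parameters are placeholders:

```python
import redshift_connector

from sqlmesh.core.engine_adapter.redshift import RedshiftEngineAdapter

# Placeholder credentials; any zero-argument callable returning a DBAPI
# connection works as the factory.
adapter = RedshiftEngineAdapter(
    connection_factory=lambda: redshift_connector.connect(
        host="example-cluster.abc123.us-east-1.redshift.amazonaws.com",
        database="dev",
        user="example_user",
        password="example_password",
    ),
    dialect="redshift",
)
```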
```python
def create_view(
    self,
    view_name: TableName,
    query_or_df: QueryOrDF,
    columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
    replace: bool = True,
    **create_kwargs: t.Any,
) -> None:
```
Redshift doesn't support `VALUES` expressions outside of an `INSERT` statement, and sqlglot currently cannot performantly convert a `VALUES` expression into a series of `UNION ALL` statements. Therefore we don't support DataFrame-backed views for Redshift until sqlglot supports large union statements performantly.

Also, Redshift views are "binding" by default to their underlying tables, which means you can't drop an underlying table without first dropping the view. This is a problem for us since we want to be able to swap tables out from under views, so we create the view as non-binding.
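As a rough illustration of the non-binding behavior, this is approximately the statement shape the base `create_view` renders once `no_schema_binding=True` is passed through. The names are invented, and it assumes sqlglot's `exp.Create` accepts a `no_schema_binding` arg (which the Redshift generator turns into the trailing clause):

```python
from sqlglot import exp

# Invented view/table names for illustration only.
create = exp.Create(
    this=exp.to_table("db.example_view"),
    kind="VIEW",
    replace=True,
    expression=exp.select("a", "b").from_("db.example_table"),
    no_schema_binding=True,
)
print(create.sql(dialect="redshift"))
# CREATE OR REPLACE VIEW db.example_view AS SELECT a, b FROM db.example_table WITH NO SCHEMA BINDING
```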
```python
def replace_query(
    self,
    table_name: TableName,
    query_or_df: QueryOrDF,
    columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
) -> None:
```
Redshift doesn't support `CREATE OR REPLACE TABLE ...` with a `VALUES` expression, so we need to handle DataFrame replacements specially.

If the table doesn't exist, we just create it and load it with insert statements. If it does exist, we do the `CREATE TABLE ...`, `INSERT INTO ...`, `RENAME TABLE ...`, `RENAME TABLE ...`, `DROP TABLE ...` dance.
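For concreteness, a hypothetical usage sketch (invented table and column names, reusing the `adapter` from the construction sketch above). `columns_to_types` is mandatory on the DataFrame path because the table may have to be created from scratch:

```python
import pandas as pd
from sqlglot import exp

df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})

# If analytics.users already exists, this runs inside a transaction: create a
# temp table, batch-insert the rows, rename target -> *_old_<hash>, rename
# temp -> target, then drop the old table.
adapter.replace_query(
    "analytics.users",
    df,
    columns_to_types={
        "id": exp.DataType.build("int"),
        "name": exp.DataType.build("varchar"),
    },
)
```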
```python
def table_exists(self, table_name: TableName) -> bool:
```
Redshift doesn't support DESCRIBE, so this mirrors what the Redshift cursor does to check whether a table exists. We don't call the cursor's check directly because we still want all execution to go through our `execute` method.

Reference: https://github.com/aws/amazon-redshift-python-driver/blob/master/redshift_connector/cursor.py#L528-L553
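To make the generated predicates concrete, a small walk-through of how a (hypothetical) qualified name is decomposed with sqlglot before the query is built:

```python
from sqlglot import exp

table = exp.to_table("analytics.users")
assert table.alias_or_name == "users"        # -> WHERE table_name = 'users'
assert table.args["db"].name == "analytics"  # -> AND table_schema = 'analytics'
assert table.args.get("catalog") is None     # a catalog part short-circuits to False
```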
Inherited Members
- sqlmesh.core.engine_adapter.base.EngineAdapter
- EngineAdapter
- recycle
- close
- create_index
- create_table
- create_state_table
- create_table_like
- drop_table
- alter_table
- create_schema
- drop_schema
- drop_view
- columns
- delete_from
- insert_append
- insert_overwrite_by_time_partition
- update_table
- merge
- rename_table
- fetchone
- fetchall
- fetchdf
- fetch_pyspark_df
- transaction
- supports_transactions
- execute