
sqlmesh.core.engine_adapter.redshift

from __future__ import annotations

import typing as t
import uuid

import pandas as pd
from sqlglot import exp

from sqlmesh.core.dialect import pandas_to_sql
from sqlmesh.core.engine_adapter._typing import DF_TYPES, Query
from sqlmesh.core.engine_adapter.base import EngineAdapter
from sqlmesh.core.engine_adapter.shared import DataObject

if t.TYPE_CHECKING:
    from sqlmesh.core._typing import TableName
    from sqlmesh.core.engine_adapter._typing import QueryOrDF


class RedshiftEngineAdapter(EngineAdapter):
    DIALECT = "redshift"
    DEFAULT_BATCH_SIZE = 1000

    @property
    def cursor(self) -> t.Any:
        connection = self._connection_pool.get()
        # The SQLMesh implementation relies on autocommit being set to True
        connection.autocommit = True
        cursor = self._connection_pool.get_cursor()
        # Redshift defaults to a `format` paramstyle that causes issues when we try to write our
        # snapshot data to the snapshot table. There doesn't seem to be a way to disable parameter
        # overriding, so we just set it to `qmark`, which doesn't cause issues.
        cursor.paramstyle = "qmark"
        return cursor

    def create_view(
        self,
        view_name: TableName,
        query_or_df: QueryOrDF,
        columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
        replace: bool = True,
        **create_kwargs: t.Any,
    ) -> None:
        """
        Redshift doesn't support `VALUES` expressions outside of an `INSERT` statement, and sqlglot
        currently cannot performantly convert a `VALUES` expression into a series of `UNION ALL`
        statements. Therefore we don't support DataFrame-backed views for Redshift until sqlglot
        supports large union statements performantly.

        Also, Redshift views are "binding" by default to their underlying tables, which means an
        underlying table can't be dropped without dropping the view first. This is a problem for us
        since we want to be able to swap tables out from under views, so we create the view as
        non-binding.
        """
        if isinstance(query_or_df, DF_TYPES):
            raise NotImplementedError(
                "DataFrames are not supported for Redshift views because Redshift doesn't "
                "support using `VALUES` in a `CREATE VIEW` statement."
            )
        return super().create_view(
            view_name, query_or_df, columns_to_types, replace, no_schema_binding=True
        )

    def _fetch_native_df(self, query: t.Union[exp.Expression, str]) -> pd.DataFrame:
        """Fetches a Pandas DataFrame from the cursor."""
        self.execute(query)
        return self.cursor.fetch_dataframe()

    def _create_table_from_query(
        self,
        table_name: TableName,
        query: Query,
        exists: bool = True,
        **kwargs: t.Any,
    ) -> t.Optional[exp.Create]:
        """
        Redshift doesn't support `CREATE TABLE IF NOT EXISTS AS ...` but does support
        `CREATE TABLE AS ...`, so we check the `exists` flag: if it is False we can use the base
        implementation directly. Otherwise we manually check whether the table exists: if it does,
        this is a no-op anyway, so we return; if it doesn't, we run the query with `exists=False`
        since we just confirmed the table doesn't exist.
        """
        if not exists:
            return super()._create_table_from_query(table_name, query, exists, **kwargs)
        if self.table_exists(table_name):
            return None
        return self._create_table_from_query(table_name, query, exists=False, **kwargs)

    @classmethod
    def _pandas_to_sql(
        cls,
        df: pd.DataFrame,
        columns_to_types: t.Dict[str, exp.DataType],
        batch_size: int = 0,
        alias: str = "t",
    ) -> t.Generator[exp.Values, None, None]:
        """
        Extracts the `VALUES` expression from each generated SELECT statement and removes its alias.
        """
        for expression in pandas_to_sql(df, columns_to_types, batch_size, alias):
            values_expression = t.cast(exp.Values, expression.find(exp.Values))
            values_expression.parent = None
            values_expression.set("alias", None)
            yield values_expression

    def replace_query(
        self,
        table_name: TableName,
        query_or_df: QueryOrDF,
        columns_to_types: t.Optional[t.Dict[str, exp.DataType]] = None,
    ) -> None:
        """
        Redshift doesn't support `CREATE OR REPLACE TABLE ...` with a `VALUES` expression, so we
        need to handle DataFrame replacements specially.

        If the table doesn't exist, we just create it and load it with insert statements.
        If it does exist, we need to do the `CREATE TABLE ...`, `INSERT INTO ...`,
        `RENAME TABLE ...`, `RENAME TABLE ...`, `DROP TABLE ...` dance.
        """
        if not isinstance(query_or_df, pd.DataFrame):
            return super().replace_query(table_name, query_or_df, columns_to_types)
        if not columns_to_types:
            raise ValueError("columns_to_types must be provided for dataframes")
        target_table = exp.to_table(table_name)
        target_exists = self.table_exists(target_table)
        if target_exists:
            with self.transaction():
                temp_table_name = f"{target_table.alias_or_name}_temp_{self._short_hash()}"
                temp_table = target_table.copy()
                temp_table.set("this", exp.to_identifier(temp_table_name))
                old_table_name = f"{target_table.alias_or_name}_old_{self._short_hash()}"
                old_table = target_table.copy()
                old_table.set("this", exp.to_identifier(old_table_name))
                self.create_table(temp_table, columns_to_types, exists=False)
                for expression in self._pandas_to_sql(
                    query_or_df, columns_to_types, self.DEFAULT_BATCH_SIZE
                ):
                    self._insert_append_query(temp_table, expression, columns_to_types)
                self.rename_table(target_table, old_table)
                self.rename_table(temp_table, target_table)
                self.drop_table(old_table)
        else:
            self.create_table(target_table, columns_to_types, exists=False)
            for expression in self._pandas_to_sql(
                query_or_df, columns_to_types, self.DEFAULT_BATCH_SIZE
            ):
                self._insert_append_query(target_table, expression, columns_to_types)

    def _short_hash(self) -> str:
        return uuid.uuid4().hex[:8]

    def table_exists(self, table_name: TableName) -> bool:
        """
        Redshift doesn't support `DESCRIBE`, so we replicate what the Redshift cursor does to
        check whether a table exists. We don't use the cursor's implementation directly because we
        still want all execution to go through our `execute` method.

        Reference: https://github.com/aws/amazon-redshift-python-driver/blob/master/redshift_connector/cursor.py#L528-L553
        """
        table = exp.to_table(table_name)

        # Redshift doesn't support catalogs
        if table.args.get("catalog"):
            return False

        q: str = (
            f"SELECT 1 FROM information_schema.tables WHERE table_name = '{table.alias_or_name}'"
        )
        database_name = table.args.get("db")
        if database_name:
            q += f" AND table_schema = '{database_name}'"

        self.execute(q)

        result = self.cursor.fetchone()

        return result[0] == 1 if result is not None else False

    def _get_data_objects(
        self, schema_name: str, catalog_name: t.Optional[str] = None
    ) -> t.List[DataObject]:
        """
        Returns all the data objects that exist in the given schema and optionally catalog.
        """
        catalog_name = f"'{catalog_name}'" if catalog_name else "NULL"
        query = f"""
            SELECT
                {catalog_name} AS catalog_name,
                tablename AS name,
                schemaname AS schema_name,
                'TABLE' AS type
            FROM pg_tables
            WHERE schemaname ILIKE '{schema_name}'
            UNION ALL
            SELECT
                {catalog_name} AS catalog_name,
                viewname AS name,
                schemaname AS schema_name,
                'VIEW' AS type
            FROM pg_views
            WHERE schemaname ILIKE '{schema_name}'
        """
        df = self.fetchdf(query)
        return [
            DataObject(
                catalog=row.catalog_name, schema=row.schema_name, name=row.name, type=row.type  # type: ignore
            )
            for row in df.itertuples()
        ]
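The notable trick in _pandas_to_sql above is unwrapping the generic SELECT ... FROM (VALUES ...) AS t(...) form that pandas_to_sql produces into a bare VALUES node, which Redshift accepts inside INSERT INTO. A minimal sketch of that extraction using sqlglot directly (the rows and column names here are illustrative, not part of the adapter):

from sqlglot import exp

rows = [(1, "a"), (2, "b")]

# The generic form: SELECT id, name FROM (VALUES (1, 'a'), (2, 'b')) AS t(id, name)
select = exp.select("id", "name").from_(
    exp.values(rows, alias="t", columns=["id", "name"])
)

# Pull out the bare VALUES node and strip its alias, as the adapter does,
# so it can be spliced into INSERT INTO ... VALUES (...).
values = select.find(exp.Values)
values.parent = None
values.set("alias", None)
print(values.sql(dialect="redshift"))  # VALUES (1, 'a'), (2, 'b')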
class RedshiftEngineAdapter(sqlmesh.core.engine_adapter.base.EngineAdapter):

Base class wrapping a Database API-compliant connection.

The EngineAdapter is an easily subclassable interface that interacts with the underlying engine and data store.

Arguments:
  • connection_factory: A callable which produces a new Database API-compliant connection on every call.
  • dialect: The dialect with which this adapter is associated.
  • multithreaded: Indicates whether this adapter will be used by more than one thread.
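
For orientation, a hypothetical construction sketch, assuming redshift_connector as the underlying DBAPI driver; the connection parameters are placeholders, and the exact constructor arguments may vary between SQLMesh versions:

import redshift_connector

from sqlmesh.core.engine_adapter.redshift import RedshiftEngineAdapter

def connection_factory():
    # Must return a fresh Database API-compliant connection on every call.
    return redshift_connector.connect(
        host="my-cluster.abc123.us-east-1.redshift.amazonaws.com",  # placeholder
        database="dev",
        user="admin",
        password="...",  # placeholder
    )

adapter = RedshiftEngineAdapter(connection_factory, multithreaded=False)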
def create_view( self, view_name: TableName, query_or_df: QueryOrDF, columns_to_types: Optional[Dict[str, sqlglot.expressions.DataType]] = None, replace: bool = True, **create_kwargs: Any) -> None:

Redshift doesn't support VALUES expressions outside of an INSERT statement, and sqlglot currently cannot performantly convert a VALUES expression into a series of UNION ALL statements. Therefore we don't support DataFrame-backed views for Redshift until sqlglot supports large union statements performantly.

Also, Redshift views are "binding" by default to their underlying tables, which means an underlying table can't be dropped without dropping the view first. This is a problem for us since we want to be able to swap tables out from under views, so we create the view as non-binding.
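
Illustrative usage, assuming the adapter constructed in the sketch above; the view and table names are placeholders:

import pandas as pd
from sqlglot import parse_one

# A query works and is created as a non-binding (WITH NO SCHEMA BINDING) view.
adapter.create_view("analytics.my_view", parse_one("SELECT 1 AS col"))

# A DataFrame is rejected, since Redshift can't use VALUES in CREATE VIEW.
try:
    adapter.create_view("analytics.df_view", pd.DataFrame({"col": [1]}))
except NotImplementedError as e:
    print(e)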

def replace_query( self, table_name: TableName, query_or_df: QueryOrDF, columns_to_types: Optional[Dict[str, sqlglot.expressions.DataType]] = None) -> None:

Redshift doesn't support CREATE OR REPLACE TABLE ... with a VALUES expression, so we need to handle DataFrame replacements specially.

If the table doesn't exist, we just create it and load it with insert statements.

If it does exist, we need to do the CREATE TABLE ..., INSERT INTO ..., RENAME TABLE ..., RENAME TABLE ..., DROP TABLE ... dance.
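
A sketch of the DataFrame path, again assuming the adapter from the construction sketch; the table name, columns, and types are placeholders:

import pandas as pd
from sqlglot import exp

df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})
columns_to_types = {
    "id": exp.DataType.build("int"),
    "name": exp.DataType.build("varchar"),
}

# If analytics.users already exists, this runs the create/insert/rename/rename/drop
# dance inside a transaction; otherwise it creates the table and inserts directly.
adapter.replace_query("analytics.users", df, columns_to_types)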

def table_exists(self, table_name: TableName) -> bool:

Redshift doesn't support DESCRIBE, so we replicate what the Redshift cursor does to check whether a table exists. We don't use the cursor's implementation directly because we still want all execution to go through our execute method.

Reference: https://github.com/aws/amazon-redshift-python-driver/blob/master/redshift_connector/cursor.py#L528-L553
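
Illustrative behavior, assuming the adapter from the construction sketch; names are placeholders:

# A schema-qualified name constrains the information_schema lookup by table_schema.
adapter.table_exists("analytics.users")

# A catalog-qualified name short-circuits to False, since Redshift doesn't support catalogs.
adapter.table_exists("my_catalog.analytics.users")  # always False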