Source code for airflow.providers.amazon.aws.hooks.redshift_sql
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations

from functools import cached_property
from typing import TYPE_CHECKING

import redshift_connector
from redshift_connector import Connection as RedshiftConnection
from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL

from airflow.exceptions import AirflowException
from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook
from airflow.providers.amazon.version_compat import AIRFLOW_V_2_10_PLUS
from airflow.providers.common.sql.hooks.sql import DbApiHook

if TYPE_CHECKING:
    from airflow.models.connection import Connection
    from airflow.providers.openlineage.sqlparser import DatabaseInfo
[docs]class RedshiftSQLHook(DbApiHook):
    """
    Execute statements against Amazon Redshift.

    This hook requires the redshift_conn_id connection.

    Note: For AWS IAM authentication, use ``iam`` in the extra connection parameters
    and set it to true. Leave the password field empty. This will use the "aws_default"
    connection to get the temporary token unless you override with aws_conn_id when
    initializing the hook. The cluster-identifier is extracted from the beginning of
    the host field, so it is optional. It can however be overridden in the extra field.
    extras example: ``{"iam": true}``

    :param redshift_conn_id: reference to
        :ref:`Amazon Redshift connection id<howto/connection:redshift>`

    .. note::
        get_sqlalchemy_engine() and get_uri() depend on sqlalchemy-amazon-redshift
    """
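    # Illustrative sketch (not part of this module): a connection configured for
    # IAM authentication as described in the docstring above. The conn_id, host,
    # user, and database names are hypothetical examples.
    #
    #     from airflow.models.connection import Connection
    #
    #     Connection(
    #         conn_id="redshift_default",
    #         conn_type="redshift",
    #         host="my-cluster.abc123xyz.us-east-1.redshift.amazonaws.com",
    #         login="awsuser",
    #         schema="dev",
    #         port=5439,
    #         extra='{"iam": true}',
    #     )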
[docs]    @classmethod
    def get_ui_field_behaviour(cls) -> dict:
        """Get custom field behavior."""
        return {
            "hidden_fields": [],
            "relabeling": {"login": "User", "schema": "Database"},
        }
[docs]    def get_iam_token(self, conn: Connection) -> tuple[str, str, int]:
        """
        Retrieve a temporary password to connect to Redshift.

        Port is required. If none is provided, the default (5439) is used.
        """
        port = conn.port or 5439
        is_serverless = conn.extra_dejson.get("is_serverless", False)
        if is_serverless:
            serverless_work_group = conn.extra_dejson.get("serverless_work_group")
            if not serverless_work_group:
                raise AirflowException(
                    "Please set serverless_work_group in redshift connection to use IAM with"
                    " Redshift Serverless."
                )
            serverless_token_duration_seconds = conn.extra_dejson.get(
                "serverless_token_duration_seconds", 3600
            )
            redshift_serverless_client = AwsBaseHook(
                aws_conn_id=self.aws_conn_id, client_type="redshift-serverless"
            ).conn
            # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/redshift-serverless/client/get_credentials.html#get-credentials
            cluster_creds = redshift_serverless_client.get_credentials(
                dbName=conn.schema,
                workgroupName=serverless_work_group,
                durationSeconds=serverless_token_duration_seconds,
            )
            token = cluster_creds["dbPassword"]
            login = cluster_creds["dbUser"]
        else:
            # Pull the cluster-identifier from the beginning of the Redshift URL
            # ex. my-cluster.ccdre4hpd39h.us-east-1.redshift.amazonaws.com returns my-cluster
            cluster_identifier = conn.extra_dejson.get("cluster_identifier")
            if not cluster_identifier:
                if conn.host:
                    cluster_identifier = conn.host.split(".", 1)[0]
                else:
                    raise AirflowException("Please set cluster_identifier or host in redshift connection.")
            redshift_client = AwsBaseHook(aws_conn_id=self.aws_conn_id, client_type="redshift").conn
            # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/redshift.html#Redshift.Client.get_cluster_credentials
            cluster_creds = redshift_client.get_cluster_credentials(
                DbUser=conn.login,
                DbName=conn.schema,
                ClusterIdentifier=cluster_identifier,
                AutoCreate=False,
            )
            token = cluster_creds["DbPassword"]
            login = cluster_creds["DbUser"]
        return login, token, port
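    # Illustrative sketch (not part of this module): the connection extras that
    # drive the Redshift Serverless branch above. The workgroup name is a
    # hypothetical example; the duration falls back to 3600 seconds if omitted.
    #
    #     extra = {
    #         "iam": True,
    #         "is_serverless": True,
    #         "serverless_work_group": "my-workgroup",
    #         "serverless_token_duration_seconds": 3600,
    #     }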
[docs]    def get_uri(self) -> str:
        """Overridden to use the Redshift dialect as driver name."""
        conn_params = self._get_conn_params()

        if "user" in conn_params:
            conn_params["username"] = conn_params.pop("user")

        # Compatibility: The 'create' factory method was added in SQLAlchemy 1.4
        # to replace calling the default URL constructor directly.
        create_url = getattr(URL, "create", URL)

        return str(create_url(drivername="postgresql", **conn_params))
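    # Illustrative sketch (not part of this module): for a connection with user
    # "awsuser", password "secret", the hypothetical host above, port 5439, and
    # database "dev", get_uri() would yield a URI of roughly this shape:
    #
    #     postgresql://awsuser:secret@my-cluster.abc123xyz.us-east-1.redshift.amazonaws.com:5439/dev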
[docs]    def get_sqlalchemy_engine(self, engine_kwargs=None):
        """Overridden to pass Redshift-specific arguments."""
        conn_kwargs = self.conn.extra_dejson
        if engine_kwargs is None:
            engine_kwargs = {}

        if "connect_args" in engine_kwargs:
            engine_kwargs["connect_args"] = {**conn_kwargs, **engine_kwargs["connect_args"]}
        else:
            engine_kwargs["connect_args"] = conn_kwargs

        return create_engine(self.get_uri(), **engine_kwargs)
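    # Illustrative usage (not part of this module): connection extras are merged
    # into connect_args, with caller-supplied connect_args taking precedence.
    # The conn_id and timeout value are hypothetical examples.
    #
    #     hook = RedshiftSQLHook(redshift_conn_id="redshift_default")
    #     engine = hook.get_sqlalchemy_engine(engine_kwargs={"connect_args": {"timeout": 30}})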
[docs]defget_table_primary_key(self,table:str,schema:str|None="public")->list[str]|None:""" Get the table's primary key. :param table: Name of the target table :param schema: Name of the target schema, public by default :return: Primary key columns list """sql=""" select kcu.column_name from information_schema.table_constraints tco join information_schema.key_column_usage kcu on kcu.constraint_name = tco.constraint_name and kcu.constraint_schema = tco.constraint_schema and kcu.constraint_name = tco.constraint_name where tco.constraint_type = 'PRIMARY KEY' and kcu.table_schema = %s and kcu.table_name = %s """pk_columns=[row[0]forrowinself.get_records(sql,(schema,table))]returnpk_columnsorNone
[docs]    def get_conn(self) -> RedshiftConnection:
        """Get a ``redshift_connector.Connection`` object."""
        conn_params = self._get_conn_params()
        conn_kwargs_dejson = self.conn.extra_dejson
        conn_kwargs: dict = {**conn_params, **conn_kwargs_dejson}
        return redshift_connector.connect(**conn_kwargs)
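    # Illustrative usage (not part of this module): the returned object follows
    # the DB-API 2.0 interface exposed by redshift_connector. The query is a
    # hypothetical example.
    #
    #     conn = hook.get_conn()
    #     cursor = conn.cursor()
    #     cursor.execute("SELECT current_user")
    #     print(cursor.fetchone())
    #     conn.close()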
[docs]    def get_openlineage_database_info(self, connection: Connection) -> DatabaseInfo:
        """Return Redshift-specific information for OpenLineage."""
        from airflow.providers.openlineage.sqlparser import DatabaseInfo

        authority = self._get_openlineage_redshift_authority_part(connection)

        return DatabaseInfo(
            scheme="redshift",
            authority=authority,
            database=connection.schema,
            information_schema_table_name="SVV_REDSHIFT_COLUMNS",
            information_schema_columns=[
                "schema_name",
                "table_name",
                "column_name",
                "ordinal_position",
                "data_type",
                "database_name",
            ],
            is_information_schema_cross_db=True,
            use_flat_cross_db_query=True,
        )
    def _get_openlineage_redshift_authority_part(self, connection: Connection) -> str:
        """Build the OpenLineage authority part (``identifier:port``) for a Redshift connection."""
        from airflow.providers.amazon.aws.hooks.base_aws import AwsBaseHook

        port = connection.port or 5439

        cluster_identifier = None

        if connection.extra_dejson.get("iam", False):
            cluster_identifier = connection.extra_dejson.get("cluster_identifier")
            region_name = AwsBaseHook(aws_conn_id=self.aws_conn_id).region_name
            identifier = f"{cluster_identifier}.{region_name}"
        if not cluster_identifier:
            identifier = self._get_identifier_from_hostname(connection.host)
        return f"{identifier}:{port}"

    def _get_identifier_from_hostname(self, hostname: str) -> str:
        """Extract ``cluster_identifier.region_name`` from a standard Redshift hostname."""
        parts = hostname.split(".")
        if hostname.endswith("amazonaws.com") and len(parts) == 6:
            return f"{parts[0]}.{parts[2]}"
        else:
            self.log.debug(
                """Could not parse identifier from hostname '%s'.
            You are probably using IP to connect to Redshift cluster.
            Expected format: 'cluster_identifier.id.region_name.redshift.amazonaws.com'
            Falling back to whole hostname.""",
                hostname,
            )
            return hostname
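    # Illustrative sketch (not part of this module): the hostname parsing above
    # keeps the cluster identifier and region, e.g.
    #
    #     _get_identifier_from_hostname("my-cluster.abc123xyz.us-east-1.redshift.amazonaws.com")
    #     # -> "my-cluster.us-east-1"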
[docs]    def get_openlineage_default_schema(self) -> str | None:
        """Return current schema. This is usually changed with ``SEARCH_PATH`` parameter."""
        if AIRFLOW_V_2_10_PLUS:
            return self.get_first("SELECT CURRENT_SCHEMA();")[0]
        return super().get_openlineage_default_schema()
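# Illustrative end-to-end usage of the hook (a sketch, not part of this module);
# the conn_id and SQL are hypothetical examples. get_records() is inherited from
# DbApiHook.
#
#     from airflow.providers.amazon.aws.hooks.redshift_sql import RedshiftSQLHook
#
#     hook = RedshiftSQLHook(redshift_conn_id="redshift_default")
#     records = hook.get_records("SELECT table_name FROM svv_tables LIMIT 10;")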