Source code for tests.system.amazon.aws.example_s3_to_sql
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations

from datetime import datetime

from airflow import settings
from airflow.decorators import task
from airflow.models import Connection
from airflow.models.baseoperator import chain
from airflow.models.dag import DAG
from airflow.providers.amazon.aws.hooks.redshift_cluster import RedshiftHook
from airflow.providers.amazon.aws.operators.redshift_cluster import (
    RedshiftCreateClusterOperator,
    RedshiftDeleteClusterOperator,
)
from airflow.providers.amazon.aws.operators.redshift_data import RedshiftDataOperator
from airflow.providers.amazon.aws.operators.s3 import (
    S3CreateBucketOperator,
    S3CreateObjectOperator,
    S3DeleteBucketOperator,
    S3DeleteObjectsOperator,
)
from airflow.providers.amazon.aws.sensors.redshift_cluster import RedshiftClusterSensor
from airflow.providers.amazon.aws.transfers.s3_to_sql import S3ToSqlOperator
from airflow.providers.common.sql.operators.sql import SQLTableCheckOperator
from airflow.utils.trigger_rule import TriggerRule

from providers.tests.system.amazon.aws.utils import ENV_ID_KEY, SystemTestContextBuilder
from tests_common.test_utils.watcher import watcher

# Externally fetched variables:
SECURITY_GROUP_KEY = "SECURITY_GROUP"
CLUSTER_SUBNET_GROUP_KEY = "CLUSTER_SUBNET_GROUP"

sys_test_context_task = (
    SystemTestContextBuilder()
    .add_variable(SECURITY_GROUP_KEY)
    .add_variable(CLUSTER_SUBNET_GROUP_KEY)
    .build()
)

DAG_ID = "example_s3_to_sql"

# Login credentials and fixture data for the throwaway test cluster; the values
# are placeholders used only by this system test.
DB_LOGIN = "adminuser"
DB_PASS = "MyAmazonPassword1"
DB_NAME = "dev"

SQL_TABLE_NAME = "cocktails"
SQL_COLUMN_LIST = ["cocktail_id", "cocktail_name", "base_spirit"]
# Six rows matching the three-column table created below
# (the row_count_check later expects COUNT(*) = 6).
SAMPLE_DATA = r"""1,Caipirinha,Cachaca
2,Bramble,Gin
3,Daiquiri,Rum
4,Penicillin,Scotch whiskey
5,Negroni,Gin
6,Margarita,Tequila
"""


@task
def create_connection(conn_id_name: str, cluster_id: str):
    # Register an Airflow connection pointing at the newly provisioned Redshift cluster.
    cluster_endpoint = RedshiftHook().conn.describe_clusters(ClusterIdentifier=cluster_id)["Clusters"][0]
    conn = Connection(
        conn_id=conn_id_name,
        conn_type="redshift",
        host=cluster_endpoint["Endpoint"]["Address"],
        login=DB_LOGIN,
        password=DB_PASS,
        schema=DB_NAME,
        port=cluster_endpoint["Endpoint"]["Port"],
    )
    session = settings.Session()
    session.add(conn)
    session.commit()


with DAG(
    dag_id=DAG_ID,
    start_date=datetime(2021, 1, 1),
    schedule="@once",
    catchup=False,
    tags=["example"],
) as dag:
    test_context = sys_test_context_task()

    env_id = test_context[ENV_ID_KEY]
    security_group_id = test_context[SECURITY_GROUP_KEY]
    cluster_subnet_group_name = test_context[CLUSTER_SUBNET_GROUP_KEY]
    conn_id_name = f"{env_id}-conn-id"
    redshift_cluster_identifier = f"{env_id}-redshift-cluster"
    sg_name = f"{env_id}-sg"
    s3_bucket_name = f"{env_id}-bucket"
    s3_key = f"{env_id}/files/cocktail_list.csv"

    create_cluster = RedshiftCreateClusterOperator(
        task_id="create_cluster",
        cluster_identifier=redshift_cluster_identifier,
        vpc_security_group_ids=[security_group_id],
        cluster_subnet_group_name=cluster_subnet_group_name,
        cluster_type="single-node",
        node_type="dc2.large",
        master_username=DB_LOGIN,
        master_user_password=DB_PASS,
    )

    wait_cluster_available = RedshiftClusterSensor(
        task_id="wait_cluster_available",
        cluster_identifier=redshift_cluster_identifier,
        target_status="available",
        poke_interval=5,
        timeout=60 * 30,
    )

    set_up_connection = create_connection(conn_id_name, cluster_id=redshift_cluster_identifier)

    create_bucket = S3CreateBucketOperator(
        task_id="create_bucket",
        bucket_name=s3_bucket_name,
    )

    create_object = S3CreateObjectOperator(
        task_id="create_object",
        s3_bucket=s3_bucket_name,
        s3_key=s3_key,
        data=SAMPLE_DATA,
        replace=True,
    )

    create_table = RedshiftDataOperator(
        task_id="create_sample_table",
        cluster_identifier=redshift_cluster_identifier,
        database=DB_NAME,
        db_user=DB_LOGIN,
        sql=f"""
            CREATE TABLE IF NOT EXISTS {SQL_TABLE_NAME} (
            cocktail_id INT NOT NULL,
            cocktail_name VARCHAR NOT NULL,
            base_spirit VARCHAR NOT NULL);
        """,
        wait_for_completion=True,
    )

    # [START howto_transfer_s3_to_sql]
    #
    # This operator requires a parser method. The parser should take a filename as input
    # and return an iterable of rows.
    # This example parser uses the builtin csv library and returns a list of rows.
    #
    def parse_csv_to_list(filepath):
        import csv

        with open(filepath, newline="") as file:
            return list(csv.reader(file))

    transfer_s3_to_sql = S3ToSqlOperator(
        task_id="transfer_s3_to_sql",
        s3_bucket=s3_bucket_name,
        s3_key=s3_key,
        table=SQL_TABLE_NAME,
        column_list=SQL_COLUMN_LIST,
        parser=parse_csv_to_list,
        sql_conn_id=conn_id_name,
    )
    # [END howto_transfer_s3_to_sql]

    # [START howto_transfer_s3_to_sql_generator]
    #
    # As the parser can return any kind of iterator, a generator is also allowed.
    # This example parser returns a generator, which prevents Python from loading
    # the whole file into memory.
    #
    def parse_csv_to_generator(filepath):
        import csv

        with open(filepath, newline="") as file:
            yield from csv.reader(file)

    transfer_s3_to_sql_generator = S3ToSqlOperator(
        task_id="transfer_s3_to_sql_paser_to_generator",
        s3_bucket=s3_bucket_name,
        s3_key=s3_key,
        table=SQL_TABLE_NAME,
        column_list=SQL_COLUMN_LIST,
        parser=parse_csv_to_generator,
        sql_conn_id=conn_id_name,
    )
    # [END howto_transfer_s3_to_sql_generator]

    check_table = SQLTableCheckOperator(
        task_id="check_table",
        conn_id=conn_id_name,
        table=SQL_TABLE_NAME,
        checks={
            "row_count_check": {"check_statement": "COUNT(*) = 6"},
        },
    )

    drop_table = RedshiftDataOperator(
        task_id="drop_table",
        cluster_identifier=redshift_cluster_identifier,
        database=DB_NAME,
        db_user=DB_LOGIN,
        sql=f"DROP TABLE {SQL_TABLE_NAME}",
        wait_for_completion=True,
    )

    delete_s3_objects = S3DeleteObjectsOperator(
        trigger_rule=TriggerRule.ALL_DONE,
        task_id="delete_objects",
        bucket=s3_bucket_name,
        keys=s3_key,
    )

    delete_s3_bucket = S3DeleteBucketOperator(
        trigger_rule=TriggerRule.ALL_DONE,
        task_id="delete_bucket",
        bucket_name=s3_bucket_name,
        force_delete=True,
    )

    delete_cluster = RedshiftDeleteClusterOperator(
        task_id="delete_cluster",
        cluster_identifier=redshift_cluster_identifier,
        trigger_rule=TriggerRule.ALL_DONE,
    )

    chain(
        # TEST SETUP
        test_context,
        create_cluster,
        wait_cluster_available,
        set_up_connection,
        create_bucket,
        create_object,
        create_table,
        # TEST BODY
        transfer_s3_to_sql,
        transfer_s3_to_sql_generator,
        check_table,
        # TEST TEARDOWN
        drop_table,
        delete_s3_objects,
        delete_s3_bucket,
        delete_cluster,
    )

    # The watcher task is needed to correctly mark success/failure
    # when teardown tasks with a trigger rule are part of the DAG.
    list(dag.tasks) >> watcher()

from tests_common.test_utils.system_tests import get_test_run  # noqa: E402

# Needed to run the example DAG with pytest (see: tests/system/README.md#run_via_pytest)
test_run = get_test_run(dag)
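# As noted in the howto comments above, the parser callable handed to
# S3ToSqlOperator only has to accept a local file path and return an iterable
# of rows, so it can be adapted to other file layouts. A minimal, hypothetical
# sketch (not part of the DAG above), assuming the CSV starts with a header
# row that should not be inserted into the table:
def parse_csv_skipping_header(filepath):
    import csv

    with open(filepath, newline="") as file:
        reader = csv.reader(file)
        next(reader, None)  # drop the assumed header row before yielding data rows
        yield from reader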