Source code for tests.system.google.cloud.bigquery.example_bigquery_to_postgres
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Example Airflow DAG for Google BigQuery service.

This DAG relies on the following OS environment variables:

* AIRFLOW__API__GOOGLE_KEY_PATH - Path to service account key file. Note, you can skip this variable if you
  run this DAG in a Composer environment.
"""

from __future__ import annotations

import logging
import os
from datetime import datetime

from pendulum import duration

from airflow.decorators import task
from airflow.models import Connection
from airflow.models.dag import DAG
from airflow.providers.common.sql.operators.sql import SQLExecuteQueryOperator
from airflow.providers.google.cloud.hooks.compute import ComputeEngineHook
from airflow.providers.google.cloud.hooks.compute_ssh import ComputeEngineSSHHook
from airflow.providers.google.cloud.operators.bigquery import (
    BigQueryCreateEmptyDatasetOperator,
    BigQueryCreateTableOperator,
    BigQueryDeleteDatasetOperator,
)
from airflow.providers.google.cloud.operators.compute import (
    ComputeEngineDeleteInstanceOperator,
    ComputeEngineInsertInstanceOperator,
)
from airflow.providers.google.cloud.transfers.bigquery_to_postgres import BigQueryToPostgresOperator
from airflow.providers.ssh.operators.ssh import SSHOperator
from airflow.providers.standard.operators.bash import BashOperator
from airflow.settings import Session
from airflow.utils.trigger_rule import TriggerRule
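# NOTE: The configuration constants used throughout this DAG are defined at this point in the
# full example. The definitions below are a representative sketch: every name matches what the
# tasks reference, but the concrete values are illustrative assumptions, and the shell commands
# are simplified stand-ins for the longer provisioning scripts in the original.
log = logging.getLogger(__name__)

ENV_ID = os.environ.get("SYSTEM_TESTS_ENV_ID", "default")
PROJECT_ID = os.environ.get("SYSTEM_TESTS_GCP_PROJECT", "default")
DAG_ID = "example_bigquery_to_postgres"

REGION = "us-central1"
ZONE = f"{REGION}-a"

BIGQUERY_DATASET_NAME = f"ds_{DAG_ID}_{ENV_ID}".replace("-", "_")
BIGQUERY_TABLE = "test_table"
SCHEMA = [
    {"name": "emp_name", "type": "STRING", "mode": "REQUIRED"},
    {"name": "salary", "type": "FLOAT", "mode": "NULLABLE"},
]
BATCH_SIZE = 500
# Illustrative: load a CSV from GCS into the BigQuery table with the bq CLI.
UPLOAD_DATA_TO_BIGQUERY = (
    f"bq load --project_id={PROJECT_ID} --source_format=CSV "
    f"{BIGQUERY_DATASET_NAME}.{BIGQUERY_TABLE} gs://<bucket>/<object>.csv "
    "emp_name:STRING,salary:FLOAT"
)

GCE_MACHINE_TYPE = "n1-standard-1"
GCE_INSTANCE_NAME = f"instance-{DAG_ID}-{ENV_ID}".replace("_", "-")

DB_NAME = "testdb"
DB_PORT = 5432
DB_USER_NAME = "root"
DB_USER_PASSWORD = "demo_password"
CONNECTION_ID = f"pg_{DAG_ID}_{ENV_ID}".replace("-", "_")
CONNECTION_TYPE = "postgres"

SQL_TABLE = "test_table"
SQL_CREATE_TABLE = f"CREATE TABLE IF NOT EXISTS {SQL_TABLE} (emp_name VARCHAR(64), salary FLOAT)"

FIREWALL_RULE_NAME = f"allow-{DAG_ID}-{ENV_ID}"
# Illustrative gcloud/ssh commands; the originals are longer shell scripts.
CREATE_FIREWALL_RULE_COMMAND = (
    f"gcloud compute firewall-rules create {FIREWALL_RULE_NAME} "
    f"--project={PROJECT_ID} --direction=INGRESS --action=ALLOW "
    f"--rules=tcp:{DB_PORT} --source-ranges=0.0.0.0/0"
)
DELETE_FIREWALL_RULE_COMMAND = (
    f"gcloud compute firewall-rules delete {FIREWALL_RULE_NAME} --project={PROJECT_ID} --quiet"
)
DELETE_PERSISTENT_DISK_COMMAND = (
    f"gcloud compute disks delete {GCE_INSTANCE_NAME} --project={PROJECT_ID} --zone={ZONE} --quiet"
)
SETUP_POSTGRES_COMMAND = (
    "sudo apt update && sudo apt install -y docker.io && "
    f"sudo docker run -d -p {DB_PORT}:{DB_PORT} -e PGPORT={DB_PORT} "
    f"-e POSTGRES_USER={DB_USER_NAME} -e POSTGRES_PASSWORD={DB_USER_PASSWORD} "
    f"-e POSTGRES_DB={DB_NAME} postgres"
)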
GCE_INSTANCE_BODY = {
    "name": GCE_INSTANCE_NAME,
    "machine_type": f"zones/{ZONE}/machineTypes/{GCE_MACHINE_TYPE}",
    "disks": [
        {
            "boot": True,
            "device_name": GCE_INSTANCE_NAME,
            "initialize_params": {
                "disk_size_gb": "10",
                "disk_type": f"zones/{ZONE}/diskTypes/pd-balanced",
                # The source image can become outdated and stop being supported by apt software packages.
                # In that case the image version will need to be updated.
                "source_image": "projects/debian-cloud/global/images/debian-12-bookworm-v20240611",
            },
        }
    ],
    "network_interfaces": [
        {
            "access_configs": [{"name": "External NAT", "network_tier": "PREMIUM"}],
            "stack_type": "IPV4_ONLY",
            "subnetwork": f"regions/{REGION}/subnetworks/default",
        }
    ],
}
with DAG(
    DAG_ID,
    schedule="@once",  # Override to match your needs
    start_date=datetime(2021, 1, 1),
    catchup=False,
    tags=["example", "bigquery", "postgres"],
) as dag:
    # Referenced in the dependency chain below; uses the imported BigQueryCreateEmptyDatasetOperator.
    create_bigquery_dataset = BigQueryCreateEmptyDatasetOperator(
        task_id="create_bigquery_dataset",
        dataset_id=BIGQUERY_DATASET_NAME,
    )

    create_bigquery_table = BigQueryCreateTableOperator(
        task_id="create_bigquery_table",
        dataset_id=BIGQUERY_DATASET_NAME,
        table_id=BIGQUERY_TABLE,
        table_resource={
            "schema": {"fields": SCHEMA},
        },
    )

    insert_bigquery_data = BashOperator(
        task_id="insert_bigquery_data",
        bash_command=UPLOAD_DATA_TO_BIGQUERY,
    )

    create_gce_instance = ComputeEngineInsertInstanceOperator(
        task_id="create_gce_instance",
        project_id=PROJECT_ID,
        zone=ZONE,
        body=GCE_INSTANCE_BODY,
    )

    create_firewall_rule = BashOperator(
        task_id="create_firewall_rule",
        bash_command=CREATE_FIREWALL_RULE_COMMAND,
    )

    setup_postgres = SSHOperator(
        task_id="setup_postgres",
        ssh_hook=ComputeEngineSSHHook(
            user="username",
            instance_name=GCE_INSTANCE_NAME,
            zone=ZONE,
            project_id=PROJECT_ID,
            use_oslogin=False,
            use_iap_tunnel=False,
            cmd_timeout=180,
        ),
        command=SETUP_POSTGRES_COMMAND,
        retries=4,
    )

    @task
    def get_public_ip() -> str:
        hook = ComputeEngineHook()
        address = hook.get_instance_address(
            resource_id=GCE_INSTANCE_NAME, zone=ZONE, project_id=PROJECT_ID
        )
        return address

    get_public_ip_task = get_public_ip()

    @task
    def create_connection(connection_id: str, ip_address: str) -> None:
        connection = Connection(
            conn_id=connection_id,
            description="Example connection",
            conn_type=CONNECTION_TYPE,
            host=ip_address,
            schema=DB_NAME,
            login=DB_USER_NAME,
            password=DB_USER_PASSWORD,
            port=DB_PORT,
        )
        session = Session()
        log.info("Removing connection %s if it exists", connection_id)
        query = session.query(Connection).filter(Connection.conn_id == connection_id)
        query.delete()

        session.add(connection)
        session.commit()
        log.info("Connection %s created", connection_id)

    create_connection_task = create_connection(connection_id=CONNECTION_ID, ip_address=get_public_ip_task)

    create_pg_table = SQLExecuteQueryOperator(
        task_id="create_pg_table",
        conn_id=CONNECTION_ID,
        sql=SQL_CREATE_TABLE,
        retries=4,
        retry_delay=duration(seconds=20),
        retry_exponential_backoff=False,
    )

    # [START howto_operator_bigquery_to_postgres]
    bigquery_to_postgres = BigQueryToPostgresOperator(
        task_id="bigquery_to_postgres",
        postgres_conn_id=CONNECTION_ID,
        dataset_table=f"{BIGQUERY_DATASET_NAME}.{BIGQUERY_TABLE}",
        target_table_name=SQL_TABLE,
        batch_size=BATCH_SIZE,
        replace=False,
    )
    # [END howto_operator_bigquery_to_postgres]

    update_pg_table_data = SQLExecuteQueryOperator(
        task_id="update_pg_table_data",
        conn_id=CONNECTION_ID,
        sql=f"UPDATE {SQL_TABLE} SET salary = salary + 0.5 WHERE salary < 10000.0",
        retries=4,
        retry_delay=duration(seconds=20),
        retry_exponential_backoff=False,
    )

    create_unique_index_in_pg_table = SQLExecuteQueryOperator(
        task_id="create_unique_index_in_pg_table",
        conn_id=CONNECTION_ID,
        sql=f"CREATE UNIQUE INDEX emp_salary ON {SQL_TABLE}(emp_name, salary);",
        retries=4,
        retry_delay=duration(seconds=20),
        retry_exponential_backoff=False,
        show_return_value_in_logs=True,
    )

    # [START howto_operator_bigquery_to_postgres_upsert]
    bigquery_to_postgres_upsert = BigQueryToPostgresOperator(
        task_id="bigquery_to_postgres_upsert",
        postgres_conn_id=CONNECTION_ID,
        dataset_table=f"{BIGQUERY_DATASET_NAME}.{BIGQUERY_TABLE}",
        target_table_name=SQL_TABLE,
        batch_size=BATCH_SIZE,
        replace=True,
        selected_fields=["emp_name", "salary"],
        replace_index=["emp_name", "salary"],
    )
    # [END howto_operator_bigquery_to_postgres_upsert]
    delete_bigquery_dataset = BigQueryDeleteDatasetOperator(
        task_id="delete_bigquery_dataset",
        dataset_id=BIGQUERY_DATASET_NAME,
        delete_contents=True,
        trigger_rule=TriggerRule.ALL_DONE,
    )

    delete_firewall_rule = BashOperator(
        task_id="delete_firewall_rule",
        bash_command=DELETE_FIREWALL_RULE_COMMAND,
        trigger_rule=TriggerRule.ALL_DONE,
    )

    delete_gce_instance = ComputeEngineDeleteInstanceOperator(
        task_id="delete_gce_instance",
        resource_id=GCE_INSTANCE_NAME,
        zone=ZONE,
        project_id=PROJECT_ID,
        trigger_rule=TriggerRule.ALL_DONE,
    )

    delete_persistent_disk = BashOperator(
        task_id="delete_persistent_disk",
        bash_command=DELETE_PERSISTENT_DISK_COMMAND,
        trigger_rule=TriggerRule.ALL_DONE,
    )

    @task(task_id="delete_connection")
    def delete_connection(connection_id: str) -> None:
        session = Session()
        log.info("Removing connection %s", connection_id)
        query = session.query(Connection).filter(Connection.conn_id == connection_id)
        query.delete()
        session.commit()

    delete_connection_task = delete_connection(connection_id=CONNECTION_ID)

    (
        # TEST SETUP
        create_gce_instance
        >> create_bigquery_dataset
        >> create_bigquery_table
        >> insert_bigquery_data
        >> get_public_ip_task
        >> create_connection_task
        >> create_firewall_rule
        >> setup_postgres
        >> create_pg_table
        # TEST BODY
        >> bigquery_to_postgres
        >> update_pg_table_data
        >> create_unique_index_in_pg_table
        >> bigquery_to_postgres_upsert
        # TEST TEARDOWN
        >> [
            delete_bigquery_dataset,
            delete_firewall_rule,
            delete_gce_instance,
            delete_connection_task,
        ]
        >> delete_persistent_disk
    )

    from tests_common.test_utils.watcher import watcher

    # This test needs watcher in order to properly mark success/failure
    # when "tearDown" task with trigger rule is part of the DAG
    list(dag.tasks) >> watcher()


from tests_common.test_utils.system_tests import get_test_run  # noqa: E402

# Needed to run the example DAG with pytest (see: tests/system/README.md#run_via_pytest)
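# The comment above refers to this line, the standard closer for Airflow system tests.
test_run = get_test_run(dag)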