# Source code for airflow.providers.common.sql.example_dags.example_analytics
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations
import datetime
from airflow.providers.common.sql.config import DataSourceConfig
from airflow.providers.common.sql.operators.analytics import AnalyticsOperator
from airflow.sdk import DAG, task
# Data source backed by Parquet files on S3, read via the aws_default connection.
datasource_config_s3 = DataSourceConfig(
    conn_id="aws_default",
    table_name="users_data",
    uri="s3://bucket/path/",
    format="parquet",
)
# Data source backed by Parquet files on the local filesystem; no Airflow
# connection is required, hence the empty conn_id.
datasource_config_local = DataSourceConfig(
    conn_id="",
    table_name="users_data",
    uri="file:///path/to/",
    format="parquet",
)
# Data source backed by an Iceberg catalog; the table is loaded via pyiceberg
# as "<db_name>.<table_name>", e.g. demo.users_data.
datasource_config_iceberg = DataSourceConfig(
    conn_id="iceberg_default",
    table_name="users_data",
    db_name="demo",
    format="iceberg",
)
"""
For example when working iceberg with glue catalog provide the following format for iceberg connection extras:
{
"client.access-key-id": "<>",
"client.secret-access-key": "<>",
'client.region': '<region>',
"type": "glue",
"uri": "https://glue.<region>.amazonaws.com/iceberg",
}
"""
# Please replace uri with appropriate value
with DAG(
    dag_id="example_analytics",
    schedule=datetime.timedelta(hours=4),
    start_date=datetime.datetime(2021, 1, 1),
    catchup=False,
    tags=["analytics", "common-sql"],
) as dag:
    # [START howto_analytics_operator_with_s3]
    analytics_with_s3 = AnalyticsOperator(
        task_id="analytics_with_s3",
        datasource_configs=[datasource_config_s3],
        queries=["SELECT * FROM users_data", "SELECT count(*) FROM users_data"],
    )
    # [END howto_analytics_operator_with_s3]

    # [START howto_analytics_operator_with_local]
    analytics_with_local = AnalyticsOperator(
        task_id="analytics_with_local",
        datasource_configs=[datasource_config_local],
        queries=["SELECT * FROM users_data", "SELECT count(*) FROM users_data"],
    )
    analytics_with_s3 >> analytics_with_local
    # [END howto_analytics_operator_with_local]

    # [START howto_analytics_decorator]
    @task.analytics(datasource_configs=[datasource_config_s3])
    def get_user_summary_queries():
        """Return the queries to run against the S3-backed data source."""
        return ["SELECT * FROM users_data LIMIT 10", "SELECT count(*) FROM users_data"]

    # [END howto_analytics_decorator]

    # [START howto_analytics_iceberg]
    @task.analytics(datasource_configs=[datasource_config_iceberg])
    def get_users_product_queries_from_iceberg_catalog():
        """Return the queries to run against the Iceberg catalog data source."""
        return ["SELECT * FROM users_data LIMIT 10", "SELECT count(*) FROM users_data"]

    # [END howto_analytics_iceberg]

    analytics_with_local >> get_user_summary_queries() >> get_users_product_queries_from_iceberg_catalog()