# Source code for airflow.providers.dbt.cloud.utils.openlineage
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations

import asyncio
import logging
import re
from typing import TYPE_CHECKING

from airflow.providers.common.compat.openlineage.check import require_openlineage_version
from airflow.providers.dbt.cloud.version_compat import AIRFLOW_V_2_10_PLUS, AIRFLOW_V_3_0_PLUS

if TYPE_CHECKING:
    from airflow.models.taskinstance import TaskInstance
    from airflow.providers.dbt.cloud.operators.dbt import DbtCloudRunJobOperator
    from airflow.providers.dbt.cloud.sensors.dbt import DbtCloudJobRunSensor
    from airflow.providers.openlineage.extractors.base import OperatorLineage
# Module-level logger used by every function below.
log = logging.getLogger(__name__)


def _get_logical_date(task_instance):
    """
    Return the date used to identify the run of ``task_instance``.

    On Airflow 3.0+ the value comes from the ``dag_run`` entry of the task
    instance's template context: its ``logical_date`` if truthy, otherwise its
    ``run_after``. On older versions it is the task instance's ``logical_date``
    attribute when that attribute exists, otherwise its ``execution_date``.

    :param task_instance: Task instance to read the date from.
    :return: The selected date value.
    """
    # todo: remove when min airflow version >= 3.0
    if AIRFLOW_V_3_0_PLUS:
        dagrun = task_instance.get_template_context()["dag_run"]
        return dagrun.logical_date or dagrun.run_after

    if hasattr(task_instance, "logical_date"):
        return task_instance.logical_date
    return task_instance.execution_date


def _get_try_number(val):
    """
    Return the try number of ``val``, adjusted for the running Airflow version.

    On Airflow 2.10+ ``val.try_number`` is returned unchanged; on older
    versions one is subtracted from it.

    :param val: Object exposing a ``try_number`` attribute.
    :return: The (possibly adjusted) try number.
    """
    # todo: remove when min airflow version >= 2.10.0
    if AIRFLOW_V_2_10_PLUS:
        return val.try_number
    return val.try_number - 1


@require_openlineage_version(provider_min_version="2.0.0")
def generate_openlineage_events_from_dbt_cloud_run(
    operator: DbtCloudRunJobOperator | DbtCloudJobRunSensor, task_instance: TaskInstance
) -> OperatorLineage:
    """
    Generate OpenLineage events from the DBT Cloud run.

    This function retrieves information about a DBT Cloud run, including the associated job,
    project, and execution details. It processes the run's artifacts, such as the manifest and
    run results, in parallel for many steps.
    Then it generates and emits OpenLineage events based on the executed DBT tasks.

    :param operator: Instance of DBT Cloud operator that executed DBT tasks.
        It already should have run_id and dbt cloud hook.
    :param task_instance: Currently executed task instance

    :return: An empty OperatorLineage object indicating the completion of events generation.
    """
    # Imported lazily so the module can be imported without these packages being loaded.
    from openlineage.common.provider.dbt import DbtCloudArtifactProcessor, ParentRunMetadata

    from airflow.providers.openlineage.conf import namespace
    from airflow.providers.openlineage.extractors import OperatorLineage
    from airflow.providers.openlineage.plugins.adapter import (
        _PRODUCER,
        OpenLineageAdapter,
    )
    from airflow.providers.openlineage.plugins.listener import get_openlineage_listener

    # if no account_id set this will fallback
    log.debug("Retrieving information about DBT job run.")
    job_run = operator.hook.get_job_run(
        run_id=operator.run_id, account_id=operator.account_id, include_related=["run_steps,job"]
    ).json()["data"]
    job = job_run["job"]
    # retrieve account_id from job and use that starting from this line
    account_id = job["account_id"]
    project = operator.hook.get_project(project_id=job["project_id"], account_id=account_id).json()["data"]
    connection = project["connection"]
    execute_steps = job["execute_steps"]
    run_steps = job_run["run_steps"]

    log.debug("Filtering only DBT invocation steps for further processing.")
    # filter only dbt invocation steps
    # The pattern is loop-invariant, so compile it once. The captured group is the
    # command between backticks; commands containing "." or "`" do not match.
    invocation_pattern = re.compile(r"Invoke dbt with `([^`.]*)`")
    steps = []
    for run_step in run_steps:
        name = run_step["name"]
        if not name.startswith("Invoke dbt with `"):
            continue
        m = invocation_pattern.search(name)
        # Keep only steps whose command is one of the job's configured execute steps.
        if m and m.group(1) in execute_steps:
            steps.append(run_step["index"])

    # catalog is available only if docs are generated
    # NOTE(review): unlike the other hook calls, this one does not pass account_id - confirm intended.
    catalog = None
    try:
        log.debug("Retrieving information about catalog artifact from DBT.")
        catalog = operator.hook.get_job_run_artifact(operator.run_id, path="catalog.json").json()["data"]
    except Exception:  # type: ignore
        # Deliberate best effort: a missing catalog must not fail lineage extraction.
        log.info(
            "Openlineage could not find DBT catalog artifact, usually available when docs are generated. "
            "Proceeding with metadata extraction. "
            "If you see error logs above about `HTTP error: Not Found` it's safe to ignore them."
        )

    async def get_artifacts_for_steps(steps, artifacts):
        """Get artifacts for a list of steps concurrently."""
        tasks = [
            operator.hook.get_job_run_artifacts_concurrently(
                run_id=operator.run_id,
                account_id=account_id,
                step=step,
                artifacts=artifacts,
            )
            for step in steps
        ]
        # gather() returns results in the same order as ``tasks``, i.e. step order.
        return await asyncio.gather(*tasks)

    # get artifacts for steps concurrently
    log.debug("Retrieving information about artifacts for all job steps from DBT.")
    step_artifacts = asyncio.run(
        get_artifacts_for_steps(steps=steps, artifacts=["manifest.json", "run_results.json"])
    )

    log.debug("Preparing OpenLineage parent job information to be included in DBT events.")
    # generate same run id of current task instance
    parent_run_id = OpenLineageAdapter.build_task_instance_run_id(
        dag_id=task_instance.dag_id,
        task_id=operator.task_id,
        logical_date=_get_logical_date(task_instance),
        try_number=_get_try_number(task_instance),
        map_index=task_instance.map_index,
    )

    parent_job = ParentRunMetadata(
        run_id=parent_run_id,
        job_name=f"{task_instance.dag_id}.{task_instance.task_id}",
        job_namespace=namespace(),
    )
    client = get_openlineage_listener().adapter.get_or_create_openlineage_client()

    # process each step in loop, sending generated events in the same order as steps
    for counter, artifacts in enumerate(step_artifacts, 1):
        log.debug("Parsing information about artifact no. %s.", counter)

        # process manifest
        manifest = artifacts["manifest.json"]

        # A step without run results has nothing to report; skip it.
        if not artifacts.get("run_results.json", None):
            log.debug("No run results found for artifact no. %s. Skipping.", counter)
            continue

        processor = DbtCloudArtifactProcessor(
            producer=_PRODUCER,
            job_namespace=namespace(),
            skip_errors=False,
            logger=operator.log,
            manifest=manifest,
            run_result=artifacts["run_results.json"],
            profile=connection,
            catalog=catalog,
        )

        # Attach the Airflow task as parent run of the generated DBT events.
        processor.dbt_run_metadata = parent_job

        events = processor.parse().events()
        log.debug("Found %s OpenLineage events for artifact no. %s.", len(events), counter)

        for event in events:
            client.emit(event=event)
        log.debug("Emitted all OpenLineage events for artifact no. %s.", counter)

    log.info("OpenLineage has successfully finished processing information about DBT job run.")
    return OperatorLineage()