# Source code for airflow.providers.google.cloud.hooks.datapipeline

#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""This module contains a Google Data Pipelines Hook."""

from __future__ import annotations

from typing import Sequence

from googleapiclient.discovery import Resource, build

from airflow.providers.google.common.hooks.base_google import (
    GoogleBaseHook,
)

# Location used by the hook's methods when the caller does not pass ``location``.
DEFAULT_DATAPIPELINE_LOCATION = "us-central1"


class DataPipelineHook(GoogleBaseHook):
    """
    Hook for Google Data Pipelines.

    All the methods in the hook where project_id is used must be called with
    keyword arguments rather than positional.
    """

    def __init__(
        self,
        gcp_conn_id: str = "google_cloud_default",
        impersonation_chain: str | Sequence[str] | None = None,
        **kwargs,
    ) -> None:
        """
        Initialise the hook.

        :param gcp_conn_id: Connection ID forwarded to ``GoogleBaseHook``.
        :param impersonation_chain: Impersonation chain forwarded to ``GoogleBaseHook``.
        :param kwargs: Extra keyword arguments. They are accepted but ignored;
            they are not forwarded to ``GoogleBaseHook``.
        """
        # Only the two named arguments reach the base hook; ``kwargs`` is dropped here.
        super().__init__(
            gcp_conn_id=gcp_conn_id,
            impersonation_chain=impersonation_chain,
        )

    def get_conn(self) -> Resource:
        """
        Return a Google Cloud Data Pipelines service object.

        :return: A discovery-based client for the ``datapipelines`` ``v1`` API,
            built on the HTTP object returned by ``self._authorize()``.
        """
        http_authorized = self._authorize()
        return build("datapipelines", "v1", http=http_authorized, cache_discovery=False)

    @GoogleBaseHook.fallback_to_default_project_id
    def create_data_pipeline(
        self,
        body: dict,
        project_id: str,
        location: str = DEFAULT_DATAPIPELINE_LOCATION,
    ) -> dict:
        """
        Create a new Data Pipelines instance from the Data Pipelines API.

        :param body: The request body (contains instance of Pipeline). See:
            https://cloud.google.com/dataflow/docs/reference/data-pipelines/rest/v1/projects.locations.pipelines/create#request-body
        :param project_id: The ID of the GCP project that owns the job.
        :param location: The location to direct the Data Pipelines instance to (for example us-central1).
        :return: The created Data Pipelines instance in JSON representation.
        """
        parent = self.build_parent_name(project_id, location)
        service = self.get_conn()
        request = (
            service.projects()
            .locations()
            .pipelines()
            .create(
                parent=parent,
                body=body,
            )
        )
        return request.execute(num_retries=self.num_retries)

    @GoogleBaseHook.fallback_to_default_project_id
    def run_data_pipeline(
        self,
        data_pipeline_name: str,
        project_id: str,
        location: str = DEFAULT_DATAPIPELINE_LOCATION,
    ) -> dict:
        """
        Run a Data Pipelines Instance using the Data Pipelines API.

        :param data_pipeline_name: The pipeline identifier, i.e. the ``PIPELINE_ID`` segment of
            ``projects/PROJECT_ID/locations/LOCATION_ID/pipelines/PIPELINE_ID``.
        :param project_id: The ID of the GCP project that owns the job.
        :param location: The location to direct the Data Pipelines instance to (for example us-central1).
        :return: The created Job in JSON representation.
        """
        parent = self.build_parent_name(project_id, location)
        service = self.get_conn()
        request = (
            service.projects()
            .locations()
            .pipelines()
            .run(
                # Full resource name: the parent path plus the pipeline segment.
                name=f"{parent}/pipelines/{data_pipeline_name}",
                body={},
            )
        )
        return request.execute(num_retries=self.num_retries)

    @staticmethod
    def build_parent_name(project_id: str, location: str) -> str:
        """
        Build the parent resource path shared by the pipeline API calls.

        :param project_id: The ID of the GCP project.
        :param location: The location of the Data Pipelines instance.
        :return: A string of the form ``projects/<project_id>/locations/<location>``.
        """
        return f"projects/{project_id}/locations/{location}"