Source code for airflow.providers.google.cloud.hooks.dataform

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations

import time
from typing import TYPE_CHECKING, Sequence

from google.api_core.gapic_v1.method import DEFAULT, _MethodDefault
from google.cloud.dataform_v1beta1 import DataformClient
from google.cloud.dataform_v1beta1.types import (
    CompilationResult,
    InstallNpmPackagesResponse,
    Repository,
    WorkflowInvocation,
    Workspace,
    WriteFileResponse,
)

from airflow.exceptions import AirflowException
from airflow.providers.google.common.hooks.base_google import GoogleBaseHook

if TYPE_CHECKING:
    from google.api_core.retry import Retry
    from google.cloud.dataform_v1beta1.services.dataform.pagers import QueryWorkflowInvocationActionsPager


[docs]class DataformHook(GoogleBaseHook): """Hook for Google Cloud DataForm APIs.""" def __init__(self, **kwargs): if kwargs.get("delegate_to") is not None: raise RuntimeError( "The `delegate_to` parameter has been deprecated before and finally removed in this version" " of Google Provider. You MUST convert it to `impersonate_chain`" ) super().__init__(**kwargs)
[docs] def get_dataform_client(self) -> DataformClient: """Retrieve client library object that allow access to Cloud Dataform service.""" return DataformClient(credentials=self.get_credentials())
@GoogleBaseHook.fallback_to_default_project_id
[docs] def wait_for_workflow_invocation( self, workflow_invocation_id: str, repository_id: str, project_id: str, region: str, wait_time: int = 10, timeout: int | None = None, ) -> None: """ Poll a job to check if it finishes. :param workflow_invocation_id: Id of the Workflow Invocation :param repository_id: Id of the Dataform repository :param project_id: Required. The ID of the Google Cloud project the cluster belongs to. :param region: Required. The Cloud Dataproc region in which to handle the request. :param wait_time: Number of seconds between checks :param timeout: How many seconds wait for job to be ready. Used only if ``asynchronous`` is False """ if region is None: raise TypeError("missing 1 required keyword argument: 'region'") state = None start = time.monotonic() while state not in ( WorkflowInvocation.State.FAILED, WorkflowInvocation.State.SUCCEEDED, WorkflowInvocation.State.CANCELLED, ): if timeout and start + timeout < time.monotonic(): raise AirflowException( f"Timeout: workflow invocation {workflow_invocation_id} is not ready after {timeout}s" ) time.sleep(wait_time) try: workflow_invocation = self.get_workflow_invocation( project_id=project_id, region=region, repository_id=repository_id, workflow_invocation_id=workflow_invocation_id, ) state = workflow_invocation.state except Exception as err: self.log.info( "Retrying. Dataform API returned error when waiting for workflow invocation: %s", err ) if state == WorkflowInvocation.State.FAILED: raise AirflowException(f"Workflow Invocation failed:\n{workflow_invocation}") if state == WorkflowInvocation.State.CANCELLED: raise AirflowException(f"Workflow Invocation was cancelled:\n{workflow_invocation}")
@GoogleBaseHook.fallback_to_default_project_id
[docs] def create_compilation_result( self, project_id: str, region: str, repository_id: str, compilation_result: CompilationResult | dict, retry: Retry | _MethodDefault = DEFAULT, timeout: float | None = None, metadata: Sequence[tuple[str, str]] = (), ) -> CompilationResult: """ Create a new CompilationResult in a given project and location. :param project_id: Required. The ID of the Google Cloud project that the task belongs to. :param region: Required. The ID of the Google Cloud region that the task belongs to. :param repository_id: Required. The ID of the Dataform repository that the task belongs to. :param compilation_result: Required. The compilation result to create. :param retry: Designation of what errors, if any, should be retried. :param timeout: The timeout for this request. :param metadata: Strings which should be sent along with the request as metadata. """ client = self.get_dataform_client() parent = f"projects/{project_id}/locations/{region}/repositories/{repository_id}" return client.create_compilation_result( request={ "parent": parent, "compilation_result": compilation_result, }, retry=retry, timeout=timeout, metadata=metadata, )
@GoogleBaseHook.fallback_to_default_project_id
[docs] def get_compilation_result( self, project_id: str, region: str, repository_id: str, compilation_result_id: str, retry: Retry | _MethodDefault = DEFAULT, timeout: float | None = None, metadata: Sequence[tuple[str, str]] = (), ) -> CompilationResult: """ Fetch a single CompilationResult. :param project_id: Required. The ID of the Google Cloud project that the task belongs to. :param region: Required. The ID of the Google Cloud region that the task belongs to. :param repository_id: Required. The ID of the Dataform repository that the task belongs to. :param compilation_result_id: The Id of the Dataform Compilation Result :param retry: Designation of what errors, if any, should be retried. :param timeout: The timeout for this request. :param metadata: Strings which should be sent along with the request as metadata. """ client = self.get_dataform_client() name = ( f"projects/{project_id}/locations/{region}/repositories/" f"{repository_id}/compilationResults/{compilation_result_id}" ) return client.get_compilation_result( request={"name": name}, retry=retry, timeout=timeout, metadata=metadata )
@GoogleBaseHook.fallback_to_default_project_id
[docs] def create_workflow_invocation( self, project_id: str, region: str, repository_id: str, workflow_invocation: WorkflowInvocation | dict, retry: Retry | _MethodDefault = DEFAULT, timeout: float | None = None, metadata: Sequence[tuple[str, str]] = (), ) -> WorkflowInvocation: """ Create a new WorkflowInvocation in a given Repository. :param project_id: Required. The ID of the Google Cloud project that the task belongs to. :param region: Required. The ID of the Google Cloud region that the task belongs to. :param repository_id: Required. The ID of the Dataform repository that the task belongs to. :param workflow_invocation: Required. The workflow invocation resource to create. :param retry: Designation of what errors, if any, should be retried. :param timeout: The timeout for this request. :param metadata: Strings which should be sent along with the request as metadata. """ client = self.get_dataform_client() parent = f"projects/{project_id}/locations/{region}/repositories/{repository_id}" return client.create_workflow_invocation( request={"parent": parent, "workflow_invocation": workflow_invocation}, retry=retry, timeout=timeout, metadata=metadata, )
@GoogleBaseHook.fallback_to_default_project_id
[docs] def get_workflow_invocation( self, project_id: str, region: str, repository_id: str, workflow_invocation_id: str, retry: Retry | _MethodDefault = DEFAULT, timeout: float | None = None, metadata: Sequence[tuple[str, str]] = (), ) -> WorkflowInvocation: """ Fetch a single WorkflowInvocation. :param project_id: Required. The ID of the Google Cloud project that the task belongs to. :param region: Required. The ID of the Google Cloud region that the task belongs to. :param repository_id: Required. The ID of the Dataform repository that the task belongs to. :param workflow_invocation_id: Required. The workflow invocation resource's id. :param retry: Designation of what errors, if any, should be retried. :param timeout: The timeout for this request. :param metadata: Strings which should be sent along with the request as metadata. """ client = self.get_dataform_client() name = ( f"projects/{project_id}/locations/{region}/repositories/" f"{repository_id}/workflowInvocations/{workflow_invocation_id}" ) return client.get_workflow_invocation( request={ "name": name, }, retry=retry, timeout=timeout, metadata=metadata, )
@GoogleBaseHook.fallback_to_default_project_id
[docs] def query_workflow_invocation_actions( self, project_id: str, region: str, repository_id: str, workflow_invocation_id: str, retry: Retry | _MethodDefault = DEFAULT, timeout: float | None = None, metadata: Sequence[tuple[str, str]] = (), ) -> QueryWorkflowInvocationActionsPager: """ Fetch WorkflowInvocation actions. :param project_id: Required. The ID of the Google Cloud project that the task belongs to. :param region: Required. The ID of the Google Cloud region that the task belongs to. :param repository_id: Required. The ID of the Dataform repository that the task belongs to. :param workflow_invocation_id: Required. The workflow invocation resource's id. :param retry: Designation of what errors, if any, should be retried. :param timeout: The timeout for this request. :param metadata: Strings which should be sent along with the request as metadata. """ client = self.get_dataform_client() name = ( f"projects/{project_id}/locations/{region}/repositories/" f"{repository_id}/workflowInvocations/{workflow_invocation_id}" ) response = client.query_workflow_invocation_actions( request={ "name": name, }, retry=retry, timeout=timeout, metadata=metadata, ) return response
@GoogleBaseHook.fallback_to_default_project_id
[docs] def cancel_workflow_invocation( self, project_id: str, region: str, repository_id: str, workflow_invocation_id: str, retry: Retry | _MethodDefault = DEFAULT, timeout: float | None = None, metadata: Sequence[tuple[str, str]] = (), ): """ Request cancellation of a running WorkflowInvocation. :param project_id: Required. The ID of the Google Cloud project that the task belongs to. :param region: Required. The ID of the Google Cloud region that the task belongs to. :param repository_id: Required. The ID of the Dataform repository that the task belongs to. :param workflow_invocation_id: Required. The workflow invocation resource's id. :param retry: Designation of what errors, if any, should be retried. :param timeout: The timeout for this request. :param metadata: Strings which should be sent along with the request as metadata. """ client = self.get_dataform_client() name = ( f"projects/{project_id}/locations/{region}/repositories/" f"{repository_id}/workflowInvocations/{workflow_invocation_id}" ) try: workflow_invocation = self.get_workflow_invocation( project_id=project_id, region=region, repository_id=repository_id, workflow_invocation_id=workflow_invocation_id, ) state = workflow_invocation.state except Exception as err: raise AirflowException( f"Dataform API returned error when waiting for workflow invocation:\n{err}" ) if state == WorkflowInvocation.State.RUNNING: client.cancel_workflow_invocation( request={"name": name}, retry=retry, timeout=timeout, metadata=metadata ) else: self.log.info( "Workflow is not active. Either the execution has already finished or has been canceled. " "Please check the logs above for more details." )
@GoogleBaseHook.fallback_to_default_project_id
[docs] def create_repository( self, *, project_id: str, region: str, repository_id: str, retry: Retry | _MethodDefault = DEFAULT, timeout: float | None = None, metadata: Sequence[tuple[str, str]] = (), ) -> Repository: """ Create repository. :param project_id: Required. The ID of the Google Cloud project where repository should be. :param region: Required. The ID of the Google Cloud region where repository should be. :param repository_id: Required. The ID of the new Dataform repository. :param retry: Designation of what errors, if any, should be retried. :param timeout: The timeout for this request. :param metadata: Strings which should be sent along with the request as metadata. """ client = self.get_dataform_client() parent = f"projects/{project_id}/locations/{region}" request = { "parent": parent, "repository_id": repository_id, } repository = client.create_repository( request=request, retry=retry, timeout=timeout, metadata=metadata, ) return repository
@GoogleBaseHook.fallback_to_default_project_id
[docs] def delete_repository( self, *, project_id: str, region: str, repository_id: str, force: bool = True, retry: Retry | _MethodDefault = DEFAULT, timeout: float | None = None, metadata: Sequence[tuple[str, str]] = (), ) -> None: """ Delete repository. :param project_id: Required. The ID of the Google Cloud project where repository located. :param region: Required. The ID of the Google Cloud region where repository located. :param repository_id: Required. The ID of the Dataform repository that should be deleted. :param force: If set to true, any child resources of this repository will also be deleted. :param retry: Designation of what errors, if any, should be retried. :param timeout: The timeout for this request. :param metadata: Strings which should be sent along with the request as metadata. """ client = self.get_dataform_client() name = f"projects/{project_id}/locations/{region}/repositories/{repository_id}" request = { "name": name, "force": force, } client.delete_repository( request=request, retry=retry, timeout=timeout, metadata=metadata, )
@GoogleBaseHook.fallback_to_default_project_id
[docs] def create_workspace( self, *, project_id: str, region: str, repository_id: str, workspace_id: str, retry: Retry | _MethodDefault = DEFAULT, timeout: float | None = None, metadata: Sequence[tuple[str, str]] = (), ) -> Workspace: """ Create workspace. :param project_id: Required. The ID of the Google Cloud project where workspace should be. :param region: Required. The ID of the Google Cloud region where workspace should be. :param repository_id: Required. The ID of the Dataform repository where workspace should be. :param workspace_id: Required. The ID of the new Dataform workspace. :param retry: Designation of what errors, if any, should be retried. :param timeout: The timeout for this request. :param metadata: Strings which should be sent along with the request as metadata. """ client = self.get_dataform_client() parent = f"projects/{project_id}/locations/{region}/repositories/{repository_id}" request = {"parent": parent, "workspace_id": workspace_id} workspace = client.create_workspace( request=request, retry=retry, timeout=timeout, metadata=metadata, ) return workspace
@GoogleBaseHook.fallback_to_default_project_id
[docs] def delete_workspace( self, *, project_id: str, region: str, repository_id: str, workspace_id: str, retry: Retry | _MethodDefault = DEFAULT, timeout: float | None = None, metadata: Sequence[tuple[str, str]] = (), ): """ Delete workspace. :param project_id: Required. The ID of the Google Cloud project where workspace located. :param region: Required. The ID of the Google Cloud region where workspace located. :param repository_id: Required. The ID of the Dataform repository where workspace located. :param workspace_id: Required. The ID of the Dataform workspace that should be deleted. :param retry: Designation of what errors, if any, should be retried. :param timeout: The timeout for this request. :param metadata: Strings which should be sent along with the request as metadata. """ client = self.get_dataform_client() workspace_path = ( f"projects/{project_id}/locations/{region}/" f"repositories/{repository_id}/workspaces/{workspace_id}" ) request = { "name": workspace_path, } client.delete_workspace( request=request, retry=retry, timeout=timeout, metadata=metadata, )
@GoogleBaseHook.fallback_to_default_project_id
[docs] def write_file( self, *, project_id: str, region: str, repository_id: str, workspace_id: str, filepath: str, contents: bytes, retry: Retry | _MethodDefault = DEFAULT, timeout: float | None = None, metadata: Sequence[tuple[str, str]] = (), ) -> WriteFileResponse: """ Write a new file to the specified workspace. :param project_id: Required. The ID of the Google Cloud project where workspace located. :param region: Required. The ID of the Google Cloud region where workspace located. :param repository_id: Required. The ID of the Dataform repository where workspace located. :param workspace_id: Required. The ID of the Dataform workspace where files should be created. :param filepath: Required. Path to file including name of the file relative to workspace root. :param contents: Required. Content of the file to be written. :param retry: Designation of what errors, if any, should be retried. :param timeout: The timeout for this request. :param metadata: Strings which should be sent along with the request as metadata. """ client = self.get_dataform_client() workspace_path = ( f"projects/{project_id}/locations/{region}/" f"repositories/{repository_id}/workspaces/{workspace_id}" ) request = { "workspace": workspace_path, "path": filepath, "contents": contents, } response = client.write_file( request=request, retry=retry, timeout=timeout, metadata=metadata, ) return response
@GoogleBaseHook.fallback_to_default_project_id
[docs] def make_directory( self, *, project_id: str, region: str, repository_id: str, workspace_id: str, path: str, retry: Retry | _MethodDefault = DEFAULT, timeout: float | None = None, metadata: Sequence[tuple[str, str]] = (), ) -> dict: """ Make new directory in specified workspace. :param project_id: Required. The ID of the Google Cloud project where workspace located. :param region: Required. The ID of the Google Cloud region where workspace located. :param repository_id: Required. The ID of the Dataform repository where workspace located. :param workspace_id: Required. The ID of the Dataform workspace where directory should be created. :param path: Required. The directory's full path including new directory name, relative to the workspace root. :param retry: Designation of what errors, if any, should be retried. :param timeout: The timeout for this request. :param metadata: Strings which should be sent along with the request as metadata. """ client = self.get_dataform_client() workspace_path = ( f"projects/{project_id}/locations/{region}/" f"repositories/{repository_id}/workspaces/{workspace_id}" ) request = { "workspace": workspace_path, "path": path, } response = client.make_directory( request=request, retry=retry, timeout=timeout, metadata=metadata, ) return response
@GoogleBaseHook.fallback_to_default_project_id
[docs] def remove_directory( self, *, project_id: str, region: str, repository_id: str, workspace_id: str, path: str, retry: Retry | _MethodDefault = DEFAULT, timeout: float | None = None, metadata: Sequence[tuple[str, str]] = (), ): """ Remove directory in specified workspace. :param project_id: Required. The ID of the Google Cloud project where workspace located. :param region: Required. The ID of the Google Cloud region where workspace located. :param repository_id: Required. The ID of the Dataform repository where workspace located. :param workspace_id: Required. The ID of the Dataform workspace where directory located. :param path: Required. The directory's full path including directory name, relative to the workspace root. :param retry: Designation of what errors, if any, should be retried. :param timeout: The timeout for this request. :param metadata: Strings which should be sent along with the request as metadata. """ client = self.get_dataform_client() workspace_path = ( f"projects/{project_id}/locations/{region}/" f"repositories/{repository_id}/workspaces/{workspace_id}" ) request = { "workspace": workspace_path, "path": path, } client.remove_directory( request=request, retry=retry, timeout=timeout, metadata=metadata, )
@GoogleBaseHook.fallback_to_default_project_id
[docs] def remove_file( self, *, project_id: str, region: str, repository_id: str, workspace_id: str, filepath: str, retry: Retry | _MethodDefault = DEFAULT, timeout: float | None = None, metadata: Sequence[tuple[str, str]] = (), ): """ Remove file in specified workspace. :param project_id: Required. The ID of the Google Cloud project where workspace located. :param region: Required. The ID of the Google Cloud region where workspace located. :param repository_id: Required. The ID of the Dataform repository where workspace located. :param workspace_id: Required. The ID of the Dataform workspace where directory located. :param filepath: Required. The full path including name of the file, relative to the workspace root. :param retry: Designation of what errors, if any, should be retried. :param timeout: The timeout for this request. :param metadata: Strings which should be sent along with the request as metadata. """ client = self.get_dataform_client() workspace_path = ( f"projects/{project_id}/locations/{region}/" f"repositories/{repository_id}/workspaces/{workspace_id}" ) request = { "workspace": workspace_path, "path": filepath, } client.remove_file( request=request, retry=retry, timeout=timeout, metadata=metadata, )
@GoogleBaseHook.fallback_to_default_project_id
[docs] def install_npm_packages( self, *, project_id: str, region: str, repository_id: str, workspace_id: str, retry: Retry | _MethodDefault = DEFAULT, timeout: float | None = None, metadata: Sequence[tuple[str, str]] = (), ) -> InstallNpmPackagesResponse: """Install NPM dependencies in the provided workspace. Requires "package.json" to be created in the workspace. :param project_id: Required. The ID of the Google Cloud project where workspace located. :param region: Required. The ID of the Google Cloud region where workspace located. :param repository_id: Required. The ID of the Dataform repository where workspace located. :param workspace_id: Required. The ID of the Dataform workspace. :param retry: Designation of what errors, if any, should be retried. :param timeout: The timeout for this request. :param metadata: Strings which should be sent along with the request as metadata. """ client = self.get_dataform_client() workspace_path = ( f"projects/{project_id}/locations/{region}/" f"repositories/{repository_id}/workspaces/{workspace_id}" ) request = { "workspace": workspace_path, } response = client.install_npm_packages( request=request, retry=retry, timeout=timeout, metadata=metadata, ) return response

Was this entry helpful?