Source code for airflow.providers.databricks.operators.databricks_sql

#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""This module contains Databricks operators."""

from __future__ import annotations

import csv
import json
import os
from collections.abc import Sequence
from functools import cached_property
from tempfile import NamedTemporaryFile
from typing import TYPE_CHECKING, Any, ClassVar
from urllib.parse import urlparse

from databricks.sql.utils import ParamEscaper

from airflow.providers.common.compat.sdk import (
    AirflowException,
    AirflowOptionalProviderFeatureException,
    BaseOperator,
)
from airflow.providers.common.sql.operators.sql import SQLExecuteQueryOperator
from airflow.providers.databricks.hooks.databricks_sql import DatabricksSqlHook

if TYPE_CHECKING:
    from airflow.providers.common.compat.sdk import Context



[docs]
class DatabricksSqlOperator(SQLExecuteQueryOperator):
    """
    Executes SQL code in a Databricks SQL endpoint or a Databricks cluster.

    .. seealso::
        For more information on how to use this operator, take a look at the guide:
        :ref:`howto/operator:DatabricksSqlOperator`

    :param databricks_conn_id: Reference to
        :ref:`Databricks connection id<howto/connection:databricks>` (templated)
    :param http_path: Optional string specifying HTTP path of Databricks SQL Endpoint or cluster.
        If not specified, it should be either specified in the Databricks connection's extra parameters,
        or ``sql_endpoint_name`` must be specified.
    :param sql_endpoint_name: Optional name of Databricks SQL Endpoint. If not specified, ``http_path`` must
        be provided as described above.
    :param sql: the SQL code to be executed as a single string, or
        a list of str (sql statements), or a reference to a template file. (templated)
        Template references are recognized by str ending in '.sql'
    :param parameters: (optional) the parameters to render the SQL query with.
    :param session_configuration: An optional dictionary of Spark session parameters. Defaults to None.
        If not specified, it could be specified in the Databricks connection's extra parameters.
    :param client_parameters: Additional parameters internal to Databricks SQL Connector parameters
    :param http_headers: An optional list of (k, v) pairs that will be set as HTTP headers on every request.
         (templated)
    :param catalog: An optional initial catalog to use. Requires DBR version 9.0+ (templated)
    :param schema: An optional initial schema to use. Requires DBR version 9.0+ (templated)
    :param output_path: optional string specifying the file to which write selected data. (templated)
        Supports local file paths and GCS URIs (e.g., ``gs://bucket/path/file.parquet``).
        When using GCS URIs, requires the ``apache-airflow-providers-google`` package.
    :param output_format: format of output data if ``output_path`` is specified.
        Possible values are ``csv``, ``json``, ``jsonl``, ``parquet``, ``avro``. Default is ``csv``.
    :param csv_params: parameters that will be passed to the ``csv.DictWriter`` class used to write CSV data.
    :param gcp_conn_id: The connection ID to use for connecting to Google Cloud when using GCS output path.
        Default is ``google_cloud_default``.
    :param gcs_impersonation_chain: Optional service account to impersonate using short-term
        credentials for GCS upload, or chained list of accounts required to get the access_token
        of the last account in the list, which will be impersonated in the request. (templated)
    """


[docs]
    # Templated attributes: the operator's own fields first, followed by the ones
    # declared on ``SQLExecuteQueryOperator``. ``dict.fromkeys`` removes duplicates
    # while keeping insertion order, so the resulting tuple is the same in every
    # interpreter process (iterating a ``set`` of strings is not, because string
    # hashing is randomized per process).
    template_fields: Sequence[str] = tuple(
        dict.fromkeys(
            (
                "_output_path",
                "schema",
                "catalog",
                "http_headers",
                "databricks_conn_id",
                "_gcs_impersonation_chain",
                *SQLExecuteQueryOperator.template_fields,
            )
        )
    )



[docs]
    # File extension that marks a ``sql`` value as a reference to a template file
    # (the class docstring: "Template references are recognized by str ending in '.sql'").
    template_ext: Sequence[str] = (".sql",)


[docs]
    # Maps the ``sql`` field to the ``sql`` renderer — presumably a hint for how the
    # rendered value is displayed; TODO confirm against the base operator.
    template_fields_renderers: ClassVar[dict] = {"sql": "sql"}


[docs]
    # Name of the attribute that holds the connection id on this operator
    # (``databricks_conn_id`` is assigned in ``__init__``). Presumably consumed by the
    # parent SQL operator — TODO confirm.
    conn_id_field = "databricks_conn_id"


    def __init__(
        self,
        *,
        databricks_conn_id: str = DatabricksSqlHook.default_conn_name,
        http_path: str | None = None,
        sql_endpoint_name: str | None = None,
        session_configuration=None,
        http_headers: list[tuple[str, str]] | None = None,
        catalog: str | None = None,
        schema: str | None = None,
        output_path: str | None = None,
        output_format: str = "csv",
        csv_params: dict[str, Any] | None = None,
        client_parameters: dict[str, Any] | None = None,
        gcp_conn_id: str = "google_cloud_default",
        gcs_impersonation_chain: str | Sequence[str] | None = None,
        **kwargs,
    ) -> None:
        """
        Create a new ``DatabricksSqlOperator``.

        The individual parameters are described in the class docstring. Remaining
        keyword arguments (including ``sql``, ``parameters`` and ``hook_params``) are
        forwarded to the parent operator; the connection id is passed to it as ``conn_id``.
        """
        super().__init__(conn_id=databricks_conn_id, **kwargs)
        self.databricks_conn_id = databricks_conn_id

        # Output settings, read by ``_process_output``.
        self._output_path = output_path
        self._output_format = output_format
        self._csv_params = csv_params

        # Connection settings, handed to ``DatabricksSqlHook`` in ``_hook``.
        self.http_path = http_path
        self.sql_endpoint_name = sql_endpoint_name
        self.session_configuration = session_configuration
        self.client_parameters = {} if client_parameters is None else client_parameters
        # ``hook_params`` was already forwarded to the parent through ``**kwargs``.
        # An explicit ``hook_params=None`` is normalized to an empty dict here because
        # ``_hook`` unpacks this attribute with ``**``, which fails on ``None``.
        self.hook_params = kwargs.pop("hook_params", None) or {}
        self.http_headers = http_headers
        self.catalog = catalog
        self.schema = schema

        # Google Cloud settings, used only when the output path is a ``gs://`` URI.
        self._gcp_conn_id = gcp_conn_id
        self._gcs_impersonation_chain = gcs_impersonation_chain

    @cached_property
    def _hook(self) -> DatabricksSqlHook:
        """
        Build the ``DatabricksSqlHook`` for this operator (computed once per instance).

        Keyword arguments are layered: the operator's own settings first, then
        ``client_parameters``, then ``hook_params`` — a later layer overrides an
        earlier one when the same key appears in both.
        """
        operator_settings = dict(
            http_path=self.http_path,
            session_configuration=self.session_configuration,
            sql_endpoint_name=self.sql_endpoint_name,
            http_headers=self.http_headers,
            catalog=self.catalog,
            schema=self.schema,
            caller="DatabricksSqlOperator",
        )
        hook_kwargs = {**operator_settings, **self.client_parameters, **self.hook_params}
        return DatabricksSqlHook(self.databricks_conn_id, **hook_kwargs)


[docs]
    def get_db_hook(self) -> DatabricksSqlHook:
        return self._hook


    def _should_run_output_processing(self) -> bool:
        return self.do_xcom_push or bool(self._output_path)

    @property
    def _is_gcs_output(self) -> bool:
        """Check if the output path is a GCS URI."""
        return self._output_path.startswith("gs://") if self._output_path else False

    def _parse_gcs_path(self, path: str) -> tuple[str, str]:
        """Parse a GCS URI into bucket and object name."""
        parsed = urlparse(path)
        bucket = parsed.netloc
        object_name = parsed.path.lstrip("/")
        return bucket, object_name

    def _upload_to_gcs(self, local_path: str, gcs_path: str) -> None:
        """
        Upload the local file ``local_path`` to the object addressed by ``gcs_path``.

        :param local_path: path of the file to upload.
        :param gcs_path: destination ``gs://`` URI; split with ``_parse_gcs_path``.
        :raises AirflowOptionalProviderFeatureException: if the Google provider
            package cannot be imported.
        """
        # Imported lazily: the Google provider is an optional dependency.
        try:
            from airflow.providers.google.cloud.hooks.gcs import GCSHook
        except ImportError:
            raise AirflowOptionalProviderFeatureException(
                "The 'apache-airflow-providers-google' package is required for GCS output. "
                "Install it with: pip install apache-airflow-providers-google"
            )

        target_bucket, target_object = self._parse_gcs_path(gcs_path)
        gcs_hook = GCSHook(
            gcp_conn_id=self._gcp_conn_id,
            impersonation_chain=self._gcs_impersonation_chain,
        )
        gcs_hook.upload(bucket_name=target_bucket, object_name=target_object, filename=local_path)
        self.log.info("Uploaded output to %s", gcs_path)

    def _write_parquet(self, file_path: str, field_names: list[str], rows: list[Any]) -> None:
        """
        Write the query result to ``file_path`` as a Parquet file.

        :param file_path: path of the file to create.
        :param field_names: column names to write, in output order.
        :param rows: result rows; each row must provide ``_asdict()``.
        :raises KeyError: if a row does not contain one of ``field_names``.
        """
        import pyarrow as pa
        import pyarrow.parquet as pq

        records = [row._asdict() for row in rows]
        # Exactly one value per row for every distinct column name. Appending inside a
        # loop over ``field_names`` would add a row's value once per occurrence of a
        # repeated name, leaving that column longer than the others.
        columns: dict[str, list] = {name: [record[name] for record in records] for name in field_names}

        pq.write_table(pa.Table.from_pydict(columns), file_path)

    def _write_avro(self, file_path: str, field_names: list[str], rows: list[Any]) -> None:
        """
        Write the query result to ``file_path`` as an Avro file using ``fastavro``.

        The schema is a record named ``QueryResult`` with one nullable field per entry
        of ``field_names``. A field's type is inferred from its first non-null value:
        ``bool`` -> ``boolean``, ``int`` -> ``long``, ``float`` -> ``double``; any other
        value, and a column without non-null values, maps to ``string``.

        :param file_path: path of the file to create.
        :param field_names: column names, in schema order.
        :param rows: result rows; each row must provide ``_asdict()``.
        :raises AirflowOptionalProviderFeatureException: if ``fastavro`` cannot be imported.
        """
        try:
            from fastavro import writer
        except ImportError:
            raise AirflowOptionalProviderFeatureException(
                "The 'fastavro' package is required for Avro output. Install it with: pip install fastavro"
            )

        # Convert each row once; the same dicts drive type inference and are written out.
        records = [row._asdict() for row in rows]

        schema_fields = []
        for name in field_names:
            # Lazily stop at the first non-null value instead of copying the whole column.
            sample_val = next((record[name] for record in records if record[name] is not None), None)
            # ``bool`` is tested before ``int`` because ``bool`` is a subclass of ``int``.
            if isinstance(sample_val, bool):
                value_type = "boolean"
            elif isinstance(sample_val, int):
                value_type = "long"
            elif isinstance(sample_val, float):
                value_type = "double"
            else:
                value_type = "string"
            schema_fields.append({"name": name, "type": ["null", value_type]})

        avro_schema = {
            "type": "record",
            "name": "QueryResult",
            "fields": schema_fields,
        }

        with open(file_path, "wb") as f:
            writer(f, avro_schema, records)

    def _process_output(self, results: list[Any], descriptions: list[Sequence[Sequence] | None]) -> list[Any]:
        if not self._output_path:
            return list(zip(descriptions, results))
        if not self._output_format:
            raise AirflowException("Output format should be specified!")

        last_description = descriptions[-1]
        last_results = results[-1]
        if last_description is None:
            raise AirflowException("There is missing description present for the output file.")
        field_names = [field[0] for field in last_description]

        if self._is_gcs_output:
            suffix = f".{self._output_format.lower()}"
            tmp_file = NamedTemporaryFile(mode="w", suffix=suffix, delete=False, newline="")
            local_path = tmp_file.name
            tmp_file.close()
        else:
            local_path = self._output_path

        try:
            output_format = self._output_format.lower()
            if output_format == "csv":
                with open(local_path, "w", newline="") as file:
                    if self._csv_params:
                        csv_params = self._csv_params.copy()
                    else:
                        csv_params = {}
                    write_header = csv_params.pop("header", True)
                    writer = csv.DictWriter(file, fieldnames=field_names, **csv_params)
                    if write_header:
                        writer.writeheader()
                    for row in last_results:
                        writer.writerow(row._asdict())
            elif output_format == "json":
                with open(local_path, "w") as file:
                    file.write(json.dumps([row._asdict() for row in last_results]))
            elif output_format == "jsonl":
                with open(local_path, "w") as file:
                    for row in last_results:
                        file.write(json.dumps(row._asdict()))
                        file.write("\n")
            elif output_format == "parquet":
                self._write_parquet(local_path, field_names, last_results)
            elif output_format == "avro":
                self._write_avro(local_path, field_names, last_results)
            else:
                raise ValueError(f"Unsupported output format: '{self._output_format}'")

            if self._is_gcs_output:
                self._upload_to_gcs(local_path, self._output_path)
        finally:
            if self._is_gcs_output and os.path.exists(local_path):
                os.unlink(local_path)

        return list(zip(descriptions, results))




[docs]
# File formats accepted for the ``file_format`` argument of ``DatabricksCopyIntoOperator``;
# its constructor rejects any value not listed here (the membership test is case-sensitive).
COPY_INTO_APPROVED_FORMATS = ["CSV", "JSON", "AVRO", "ORC", "PARQUET", "TEXT", "BINARYFILE"]




[docs]
class DatabricksCopyIntoOperator(BaseOperator):
    """
    Executes COPY INTO command in a Databricks SQL endpoint or a Databricks cluster.

    COPY INTO command is constructed from individual pieces, that are described in
    `documentation <https://docs.databricks.com/sql/language-manual/delta-copy-into.html>`_.

    .. seealso::
        For more information on how to use this operator, take a look at the guide:
        :ref:`howto/operator:DatabricksSqlCopyIntoOperator`

    :param table_name: Required name of the table. (templated)
    :param file_location: Required location of files to import. (templated)
    :param file_format: Required file format. Supported formats are
        ``CSV``, ``JSON``, ``AVRO``, ``ORC``, ``PARQUET``, ``TEXT``, ``BINARYFILE``.
    :param databricks_conn_id: Reference to
        :ref:`Databricks connection id<howto/connection:databricks>` (templated)
    :param http_path: Optional string specifying HTTP path of Databricks SQL Endpoint or cluster.
        If not specified, it should be either specified in the Databricks connection's extra parameters,
        or ``sql_endpoint_name`` must be specified.
    :param sql_endpoint_name: Optional name of Databricks SQL Endpoint.
        If not specified, ``http_path`` must be provided as described above.
    :param session_configuration: An optional dictionary of Spark session parameters. Defaults to None.
        If not specified, it could be specified in the Databricks connection's extra parameters.
    :param http_headers: An optional list of (k, v) pairs that will be set as HTTP headers on every request
    :param catalog: An optional initial catalog to use. Requires DBR version 9.0+
    :param schema: An optional initial schema to use. Requires DBR version 9.0+
    :param client_parameters: Additional parameters internal to Databricks SQL Connector parameters
    :param files: optional list of files to import. Can't be specified together with ``pattern``. (templated)
    :param pattern: optional regex string to match file names to import.
        Can't be specified together with ``files``.
    :param expression_list: optional string that will be used in the ``SELECT`` expression.
    :param credential: optional credential configuration for authentication against a source location.
    :param storage_credential: optional Unity Catalog storage credential for destination.
    :param encryption: optional encryption configuration for a specified location.
    :param format_options: optional dictionary with options specific for a given file format.
    :param force_copy: optional bool to control forcing of data import
        (could be also specified in ``copy_options``).
    :param validate: optional configuration for schema & data validation. ``True`` forces validation
        of all rows, integer number - validate only N first rows
    :param copy_options: optional dictionary of copy options. Right now only ``force`` option is supported.
    """


[docs]
    # Attributes declared as templated; these are the public attributes assigned in
    # ``__init__`` under the same names.
    template_fields: Sequence[str] = (
        "file_location",
        "files",
        "table_name",
        "databricks_conn_id",
    )


    def __init__(
        self,
        *,
        table_name: str,
        file_location: str,
        file_format: str,
        databricks_conn_id: str = DatabricksSqlHook.default_conn_name,
        http_path: str | None = None,
        sql_endpoint_name: str | None = None,
        session_configuration=None,
        http_headers: list[tuple[str, str]] | None = None,
        client_parameters: dict[str, Any] | None = None,
        catalog: str | None = None,
        schema: str | None = None,
        files: list[str] | None = None,
        pattern: str | None = None,
        expression_list: str | None = None,
        credential: dict[str, str] | None = None,
        storage_credential: str | None = None,
        encryption: dict[str, str] | None = None,
        format_options: dict[str, str] | None = None,
        force_copy: bool | None = None,
        copy_options: dict[str, str] | None = None,
        validate: bool | int | None = None,
        **kwargs,
    ) -> None:
        """
        Create a new ``DatabricksCopyIntoOperator``.

        The individual parameters are described in the class docstring.

        :raises AirflowException: if both ``files`` and ``pattern`` are given, if
            ``table_name`` or ``file_location`` is an empty string, or if
            ``file_format`` is not one of ``COPY_INTO_APPROVED_FORMATS``.
        """
        super().__init__(**kwargs)
        if files is not None and pattern is not None:
            raise AirflowException("Only one of 'pattern' or 'files' should be specified")
        if table_name == "":
            raise AirflowException("table_name shouldn't be empty")
        if file_location == "":
            raise AirflowException("file_location shouldn't be empty")
        if file_format not in COPY_INTO_APPROVED_FORMATS:
            raise AirflowException(f"file_format '{file_format}' isn't supported")

        self.files = files
        self._pattern = pattern
        self._file_format = file_format
        self.databricks_conn_id = databricks_conn_id
        self._http_path = http_path
        self._sql_endpoint_name = sql_endpoint_name
        self.session_config = session_configuration
        self.table_name = table_name
        self._catalog = catalog
        self._schema = schema
        self.file_location = file_location
        self._expression_list = expression_list
        self._credential = credential
        self._storage_credential = storage_credential
        self._encryption = encryption
        self._format_options = format_options
        # Keep a private copy: ``force_copy`` below writes into this dict, and the
        # dict passed by the caller must not change as a side effect.
        self._copy_options = dict(copy_options) if copy_options else {}
        self._validate = validate
        self._http_headers = http_headers
        self._client_parameters = client_parameters or {}
        if force_copy is not None:
            # An explicit ``force_copy`` overrides any ``force`` entry in ``copy_options``.
            self._copy_options["force"] = "true" if force_copy else "false"
        # Filled in by ``execute`` with the generated statement.
        self._sql: str | None = None

    def _get_hook(self) -> DatabricksSqlHook:
        return self._hook

    @cached_property
    def _hook(self) -> DatabricksSqlHook:
        """
        Build the ``DatabricksSqlHook`` from the settings stored in ``__init__``.

        Being a ``cached_property``, the hook is created on first access and the same
        object is returned afterwards.
        """
        return DatabricksSqlHook(
            self.databricks_conn_id,
            http_path=self._http_path,
            session_configuration=self.session_config,
            sql_endpoint_name=self._sql_endpoint_name,
            http_headers=self._http_headers,
            catalog=self._catalog,
            schema=self._schema,
            caller="DatabricksCopyIntoOperator",
            # Entries of ``client_parameters`` are passed on as additional keyword arguments.
            **self._client_parameters,
        )

    @staticmethod
    def _generate_options(
        name: str,
        escaper: ParamEscaper,
        opts: dict[str, str] | None = None,
        escape_key: bool = True,
    ) -> str:
        formatted_opts = ""
        if opts:
            pairs = [
                f"{escaper.escape_item(k) if escape_key else k} = {escaper.escape_item(v)}"
                for k, v in opts.items()
            ]
            formatted_opts = f"{name} ({', '.join(pairs)})"

        return formatted_opts

    def _create_sql_query(self) -> str:
        """
        Assemble the ``COPY INTO`` statement from the operator's settings.

        String values (location, pattern, file list, option values) go through
        ``ParamEscaper``; ``table_name``, ``expression_list``, ``storage_credential``
        and ``file_format`` are inserted verbatim.

        :return: the statement with leading and trailing whitespace stripped.
        :raises AirflowException: if ``validate`` is a negative integer, or is neither
            ``None``, a ``bool`` nor an ``int``.
        """
        escaper = ParamEscaper()

        # Source location, optionally followed by ``WITH (<credential> <encryption>)``.
        # ``_generate_options`` yields an empty string for an unset option group.
        with_clause = ""
        if not (self._encryption is None and self._credential is None):
            encryption_part = self._generate_options("ENCRYPTION", escaper, self._encryption, False)
            credential_part = self._generate_options("CREDENTIAL", escaper, self._credential, False)
            with_clause = f" WITH ({credential_part} {encryption_part})"
        source = escaper.escape_item(self.file_location) + with_clause
        if self._expression_list is not None:
            source = f"(SELECT {self._expression_list} FROM {source})"

        # ``pattern`` takes precedence; the constructor rejects setting both.
        if self._pattern is not None:
            selector = f"PATTERN = {escaper.escape_item(self._pattern)}\n"
        elif self.files is not None:
            selector = f"FILES = {escaper.escape_item(self.files)}\n"
        else:
            selector = ""

        format_clause = self._generate_options("FORMAT_OPTIONS", escaper, self._format_options) + "\n"
        copy_clause = self._generate_options("COPY_OPTIONS", escaper, self._copy_options) + "\n"
        target_credential = (
            f" WITH (CREDENTIAL {self._storage_credential})" if self._storage_credential else ""
        )

        requested = self._validate
        validate_clause = ""
        # ``bool`` is handled before ``int`` because ``bool`` is a subclass of ``int``.
        if isinstance(requested, bool):
            if requested:
                validate_clause = "VALIDATE ALL\n"
        elif isinstance(requested, int):
            if requested < 0:
                raise AirflowException(
                    f"Number of rows for validation should be positive, got: {requested}"
                )
            validate_clause = f"VALIDATE {requested} ROWS\n"
        elif requested is not None:
            raise AirflowException(f"Incorrect data type for validate parameter: {type(requested)}")

        # TODO: think on how to make sure that table_name and expression_list aren't used for SQL injection
        statement = (
            f"COPY INTO {self.table_name}{target_credential}\n"
            f"FROM {source}\n"
            f"FILEFORMAT = {self._file_format}\n"
            f"{validate_clause}{selector}{format_clause}{copy_clause}\n"
        )
        return statement.strip()


[docs]
    def execute(self, context: Context) -> Any:
        """Generate the ``COPY INTO`` statement, remember it in ``_sql``, log it and run it via the hook."""
        statement = self._create_sql_query()
        self._sql = statement
        self.log.info("Executing: %s", statement)
        self._get_hook().run(statement)



[docs]
    def on_kill(self) -> None:
        # NB: on_kill isn't required for this operator since query cancelling gets
        # handled in `DatabricksSqlHook.run()` method which is called in `execute()`
        ...


    def _build_input_openlineage_dataset(self) -> tuple[Any, list[Any]]:
        """
        Derive the OpenLineage input dataset from ``file_location``.

        :return: ``(dataset, [])`` on success; ``(None, [error])`` when the location
            cannot be parsed or uses a scheme outside the supported list.
        """
        from urllib.parse import urlparse

        from airflow.providers.common.compat.openlineage.facet import Dataset, Error

        # Only process schemes we know produce valid OL datasets with current implementation
        supported_schemes = ("s3", "s3a", "s3n", "gs", "abfss", "wasbs")

        try:
            parsed = urlparse(self.file_location)
            if parsed.scheme not in supported_schemes:
                raise ValueError(f"Unsupported scheme: `{parsed.scheme}` in `{self.file_location}`")

            dataset_namespace = f"{parsed.scheme}://{parsed.netloc}"
            dataset_name = parsed.path.strip("/")
            if dataset_name in ("", "."):
                # A location without a meaningful path is reported as the root.
                dataset_name = "/"
            return Dataset(namespace=dataset_namespace, name=dataset_name), []
        except Exception as err:
            message = str(err)
            self.log.debug("Failed to parse file_location: `%s`, error: %s", self.file_location, message)
            return None, [
                Error(errorMessage=message, stackTrace=None, task=self.file_location, taskNumber=None)
            ]

    def _build_output_openlineage_dataset(self, namespace: str) -> tuple[Any, list[Any]]:
        """
        Derive the OpenLineage output dataset from ``table_name``.

        ``table_name`` is read as ``catalog.schema.table`` or ``schema.table``; with any
        other number of dot-separated parts the whole value is taken as the table.
        Missing schema / catalog fall back to the values reported by the hook.

        :param namespace: namespace to assign to the dataset.
        :return: ``(dataset, [])`` on success; ``(None, [error])`` if anything fails.
        """
        from airflow.providers.common.compat.openlineage.facet import Dataset, Error

        try:
            catalog = schema = None
            table = self.table_name
            parts = self.table_name.split(".")
            if len(parts) == 3:  # catalog.schema.table
                catalog, schema, table = parts
            elif len(parts) == 2:  # schema.table
                schema, table = parts

            hook = self._get_hook()
            schema = schema or hook.get_openlineage_default_schema()  # Fallback to default schema
            catalog = catalog or hook.catalog  # Fallback to default catalog, if provided

            # Final dataset name: optional catalog and schema prefixes, then the table.
            prefixes = [part for part in (catalog, schema) if part]
            fq_name = ".".join([*prefixes, table])

            return Dataset(namespace=namespace, name=fq_name), []
        except Exception as err:
            message = str(err)
            self.log.debug("Failed to construct output dataset: `%s`, error: %s", self.table_name, message)
            return None, [
                Error(errorMessage=message, stackTrace=None, task=self.table_name, taskNumber=None)
            ]


[docs]
    def get_openlineage_facets_on_complete(self, _):
        """
        Build the OpenLineage metadata for the executed ``COPY INTO`` statement.

        Implemented as ``_on_complete`` because the query id is attached. Returns an
        empty ``OperatorLineage`` when no statement was generated; otherwise the lineage
        carries the input / output datasets that could be derived, the normalized SQL as
        a job facet, and run facets for the external query id and extraction errors.
        """
        from airflow.providers.common.compat.openlineage.facet import (
            ExternalQueryRunFacet,
            ExtractionErrorRunFacet,
            SQLJobFacet,
        )
        from airflow.providers.openlineage.extractors import OperatorLineage
        from airflow.providers.openlineage.sqlparser import SQLParser

        if not self._sql:
            self.log.warning("No SQL query found, returning empty OperatorLineage.")
            return OperatorLineage()

        hook = self._get_hook()
        run_facets = {}

        connection = hook.get_connection(self.databricks_conn_id)
        dbx_namespace = SQLParser.create_namespace(hook.get_openlineage_database_info(connection))

        if hook.query_ids:
            # Only the first query id is reported.
            run_facets["externalQuery"] = ExternalQueryRunFacet(
                externalQueryId=hook.query_ids[0], source=dbx_namespace
            )

        input_dataset, input_errors = self._build_input_openlineage_dataset()
        output_dataset, output_errors = self._build_output_openlineage_dataset(dbx_namespace)
        all_errors = [*input_errors, *output_errors]

        if all_errors:
            run_facets["extractionError"] = ExtractionErrorRunFacet(
                totalTasks=1,
                failedTasks=len(all_errors),
                errors=all_errors,
            )

        lineage_inputs = [input_dataset] if input_dataset else []
        lineage_outputs = [output_dataset] if output_dataset else []
        return OperatorLineage(
            inputs=lineage_inputs,
            outputs=lineage_outputs,
            job_facets={"sql": SQLJobFacet(query=SQLParser.normalize_sql(self._sql))},
            run_facets=run_facets,
        )