Source code for airflow.providers.amazon.aws.transfers.dynamodb_to_s3

#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#

"""
This module contains operators to replicate records from
DynamoDB table to S3.
"""
import json
from copy import copy
from os.path import getsize
from tempfile import NamedTemporaryFile
from typing import IO, TYPE_CHECKING, Any, Callable, Dict, Optional
from uuid import uuid4

from airflow.models import BaseOperator
from airflow.providers.amazon.aws.hooks.dynamodb import DynamoDBHook
from airflow.providers.amazon.aws.hooks.s3 import S3Hook

if TYPE_CHECKING:
    from airflow.utils.context import Context


def _convert_item_to_json_bytes(item: Dict[str, Any]) -> bytes:
    return (json.dumps(item) + '\n').encode('utf-8')


def _upload_file_to_s3(
    file_obj: IO, bucket_name: str, s3_key_prefix: str, aws_conn_id: str = 'aws_default'
) -> None:
    s3_client = S3Hook(aws_conn_id=aws_conn_id).get_conn()
    file_obj.seek(0)
    s3_client.upload_file(
        Filename=file_obj.name,
        Bucket=bucket_name,
        Key=s3_key_prefix + str(uuid4()),
    )


[docs]class DynamoDBToS3Operator(BaseOperator): """ Replicates records from a DynamoDB table to S3. It scans a DynamoDB table and write the received records to a file on the local filesystem. It flushes the file to S3 once the file size exceeds the file size limit specified by the user. Users can also specify a filtering criteria using dynamodb_scan_kwargs to only replicate records that satisfy the criteria. To parallelize the replication, users can create multiple tasks of DynamoDBToS3Operator. For instance to replicate with parallelism of 2, create two tasks like: .. code-block:: python op1 = DynamoDBToS3Operator( task_id="replicator-1", dynamodb_table_name="hello", dynamodb_scan_kwargs={ "TotalSegments": 2, "Segment": 0, }, ..., ) op2 = DynamoDBToS3Operator( task_id="replicator-2", dynamodb_table_name="hello", dynamodb_scan_kwargs={ "TotalSegments": 2, "Segment": 1, }, ..., ) :param dynamodb_table_name: Dynamodb table to replicate data from :param s3_bucket_name: S3 bucket to replicate data to :param file_size: Flush file to s3 if file size >= file_size :param dynamodb_scan_kwargs: kwargs pass to <https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/dynamodb.html#DynamoDB.Table.scan> # noqa: E501 :param s3_key_prefix: Prefix of s3 object key :param process_func: How we transforms a dynamodb item to bytes. By default we dump the json :param aws_conn_id: The Airflow connection used for AWS credentials. If this is None or empty then the default boto3 behaviour is used. If running Airflow in a distributed manner and aws_conn_id is None or empty, then default boto3 configuration would be used (and must be maintained on each worker node). """ def __init__( self, *, dynamodb_table_name: str, s3_bucket_name: str, file_size: int, dynamodb_scan_kwargs: Optional[Dict[str, Any]] = None, s3_key_prefix: str = '', process_func: Callable[[Dict[str, Any]], bytes] = _convert_item_to_json_bytes, aws_conn_id: str = 'aws_default', **kwargs, ) -> None: super().__init__(**kwargs) self.file_size = file_size self.process_func = process_func self.dynamodb_table_name = dynamodb_table_name self.dynamodb_scan_kwargs = dynamodb_scan_kwargs self.s3_bucket_name = s3_bucket_name self.s3_key_prefix = s3_key_prefix self.aws_conn_id = aws_conn_id
[docs] def execute(self, context: 'Context') -> None: hook = DynamoDBHook(aws_conn_id=self.aws_conn_id) table = hook.get_conn().Table(self.dynamodb_table_name) scan_kwargs = copy(self.dynamodb_scan_kwargs) if self.dynamodb_scan_kwargs else {} err = None with NamedTemporaryFile() as f: try: f = self._scan_dynamodb_and_upload_to_s3(f, scan_kwargs, table) except Exception as e: err = e raise e finally: if err is None: _upload_file_to_s3(f, self.s3_bucket_name, self.s3_key_prefix, self.aws_conn_id)
def _scan_dynamodb_and_upload_to_s3(self, temp_file: IO, scan_kwargs: dict, table: Any) -> IO: while True: response = table.scan(**scan_kwargs) items = response['Items'] for item in items: temp_file.write(self.process_func(item)) if 'LastEvaluatedKey' not in response: # no more items to scan break last_evaluated_key = response['LastEvaluatedKey'] scan_kwargs['ExclusiveStartKey'] = last_evaluated_key # Upload the file to S3 if reach file size limit if getsize(temp_file.name) >= self.file_size: _upload_file_to_s3(temp_file, self.s3_bucket_name, self.s3_key_prefix, self.aws_conn_id) temp_file.close() temp_file = NamedTemporaryFile() return temp_file

Was this entry helpful?