#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import re
from typing import Callable, List, Optional, Union
from urllib.parse import urlparse
from airflow.exceptions import AirflowException
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.sensors.base import BaseSensorOperator
[docs]class S3KeySensor(BaseSensorOperator):
"""
Waits for a key (a file-like instance on S3) to be present in a S3 bucket.
S3 being a key/value it does not support folders. The path is just a key
a resource.
:param bucket_key: The key being waited on. Supports full s3:// style url
or relative path from root level. When it's specified as a full s3://
url, please leave bucket_name as `None`.
:type bucket_key: str
:param bucket_name: Name of the S3 bucket. Only needed when ``bucket_key``
is not provided as a full s3:// url.
:type bucket_name: str
:param wildcard_match: whether the bucket_key should be interpreted as a
Unix wildcard pattern
:type wildcard_match: bool
:param aws_conn_id: a reference to the s3 connection
:type aws_conn_id: str
:param verify: Whether or not to verify SSL certificates for S3 connection.
By default SSL certificates are verified.
You can provide the following values:
- ``False``: do not validate SSL certificates. SSL will still be used
(unless use_ssl is False), but SSL certificates will not be
verified.
- ``path/to/cert/bundle.pem``: A filename of the CA cert bundle to uses.
You can specify this argument if you want to use a different
CA cert bundle than the one used by botocore.
:type verify: bool or str
"""
[docs] template_fields = ('bucket_key', 'bucket_name')
def __init__(
self,
*,
bucket_key: str,
bucket_name: Optional[str] = None,
wildcard_match: bool = False,
aws_conn_id: str = 'aws_default',
verify: Optional[Union[str, bool]] = None,
**kwargs,
):
super().__init__(**kwargs)
self.bucket_name = bucket_name
self.bucket_key = bucket_key
self.wildcard_match = wildcard_match
self.aws_conn_id = aws_conn_id
self.verify = verify
self.hook: Optional[S3Hook] = None
[docs] def poke(self, context):
if self.bucket_name is None:
parsed_url = urlparse(self.bucket_key)
if parsed_url.netloc == '':
raise AirflowException('If key is a relative path from root, please provide a bucket_name')
self.bucket_name = parsed_url.netloc
self.bucket_key = parsed_url.path.lstrip('/')
else:
parsed_url = urlparse(self.bucket_key)
if parsed_url.scheme != '' or parsed_url.netloc != '':
raise AirflowException(
'If bucket_name is provided, bucket_key'
+ ' should be relative path from root'
+ ' level, rather than a full s3:// url'
)
self.log.info('Poking for key : s3://%s/%s', self.bucket_name, self.bucket_key)
if self.wildcard_match:
return self.get_hook().check_for_wildcard_key(self.bucket_key, self.bucket_name)
return self.get_hook().check_for_key(self.bucket_key, self.bucket_name)
[docs] def get_hook(self) -> S3Hook:
"""Create and return an S3Hook"""
if self.hook:
return self.hook
self.hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
return self.hook
[docs]class S3KeySizeSensor(S3KeySensor):
"""
Waits for a key (a file-like instance on S3) to be present and be more than
some size in a S3 bucket.
S3 being a key/value it does not support folders. The path is just a key
a resource.
:param bucket_key: The key being waited on. Supports full s3:// style url
or relative path from root level. When it's specified as a full s3://
url, please leave bucket_name as `None`.
:type bucket_key: str
:param bucket_name: Name of the S3 bucket. Only needed when ``bucket_key``
is not provided as a full s3:// url.
:type bucket_name: str
:param wildcard_match: whether the bucket_key should be interpreted as a
Unix wildcard pattern
:type wildcard_match: bool
:param aws_conn_id: a reference to the s3 connection
:type aws_conn_id: str
:param verify: Whether or not to verify SSL certificates for S3 connection.
By default SSL certificates are verified.
You can provide the following values:
- ``False``: do not validate SSL certificates. SSL will still be used
(unless use_ssl is False), but SSL certificates will not be
verified.
- ``path/to/cert/bundle.pem``: A filename of the CA cert bundle to uses.
You can specify this argument if you want to use a different
CA cert bundle than the one used by botocore.
:type verify: bool or str
:type check_fn: Optional[Callable[..., bool]]
:param check_fn: Function that receives the list of the S3 objects,
and returns the boolean:
- ``True``: a certain criteria is met
- ``False``: the criteria isn't met
**Example**: Wait for any S3 object size more than 1 megabyte ::
def check_fn(self, data: List) -> bool:
return any(f.get('Size', 0) > 1048576 for f in data if isinstance(f, dict))
:type check_fn: Optional[Callable[..., bool]]
"""
def __init__(
self,
*,
check_fn: Optional[Callable[..., bool]] = None,
**kwargs,
):
super().__init__(**kwargs)
self.check_fn_user = check_fn
[docs] def poke(self, context):
if super().poke(context=context) is False:
return False
s3_objects = self.get_files(s3_hook=self.get_hook())
if not s3_objects:
return False
check_fn = self.check_fn if self.check_fn_user is None else self.check_fn_user
return check_fn(s3_objects)
[docs] def get_files(self, s3_hook: S3Hook, delimiter: Optional[str] = '/') -> List:
"""Gets a list of files in the bucket"""
prefix = self.bucket_key
config = {
'PageSize': None,
'MaxItems': None,
}
if self.wildcard_match:
prefix = re.split(r'[*]', self.bucket_key, 1)[0]
paginator = s3_hook.get_conn().get_paginator('list_objects_v2')
response = paginator.paginate(
Bucket=self.bucket_name, Prefix=prefix, Delimiter=delimiter, PaginationConfig=config
)
keys = []
for page in response:
if 'Contents' in page:
_temp = [k for k in page['Contents'] if isinstance(k.get('Size', None), (int, float))]
keys = keys + _temp
return keys
[docs] def check_fn(self, data: List, object_min_size: Optional[Union[int, float]] = 0) -> bool:
"""Default function for checking that S3 Objects have size more than 0
:param data: List of the objects in S3 bucket.
:type data: list
:param object_min_size: Checks if the objects sizes are greater then this value.
:type object_min_size: int
"""
return all(f.get('Size', 0) > object_min_size for f in data if isinstance(f, dict))