#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
import re
from typing import Callable, List, Optional, Union
from urllib.parse import urlparse
from airflow.exceptions import AirflowException
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.sensors.base import BaseSensorOperator
class S3KeySensor(BaseSensorOperator):
    """
    Waits for a key (a file-like instance on S3) to be present in a S3 bucket.

    S3 being a key/value store, it does not support folders. The path is just
    a key to a resource.

    :param bucket_key: The key being waited on. Supports full s3:// style url
        or relative path from root level. When it's specified as a full s3://
        url, please leave bucket_name as `None`.
    :type bucket_key: str
    :param bucket_name: Name of the S3 bucket. Only needed when ``bucket_key``
        is not provided as a full s3:// url.
    :type bucket_name: str
    :param wildcard_match: whether the bucket_key should be interpreted as a
        Unix wildcard pattern
    :type wildcard_match: bool
    :param aws_conn_id: a reference to the s3 connection
    :type aws_conn_id: str
    :param verify: Whether or not to verify SSL certificates for S3 connection.
        By default SSL certificates are verified.
        You can provide the following values:

        - ``False``: do not validate SSL certificates. SSL will still be used
                 (unless use_ssl is False), but SSL certificates will not be
                 verified.
        - ``path/to/cert/bundle.pem``: A filename of the CA cert bundle to use.
                 You can specify this argument if you want to use a different
                 CA cert bundle than the one used by botocore.
    :type verify: bool or str
    """

    template_fields = ('bucket_key', 'bucket_name')

    def __init__(
        self,
        *,
        bucket_key: str,
        bucket_name: Optional[str] = None,
        wildcard_match: bool = False,
        aws_conn_id: str = 'aws_default',
        verify: Optional[Union[str, bool]] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.bucket_name = bucket_name
        self.bucket_key = bucket_key
        self.wildcard_match = wildcard_match
        self.aws_conn_id = aws_conn_id
        self.verify = verify
        # Created lazily by get_hook() and then reused.
        self.hook: Optional[S3Hook] = None

    def poke(self, context):
        """
        Check whether the awaited key currently exists.

        The bucket/key pair is validated here rather than in ``__init__``
        because both attributes are template fields.

        :param context: the task context (not used by this method)
        :return: the result of the hook's ``check_for_wildcard_key`` when
            ``wildcard_match`` is set, otherwise of ``check_for_key``
        :raises AirflowException: if ``bucket_name`` is ``None`` and
            ``bucket_key`` has no bucket part, or if ``bucket_name`` is set
            and ``bucket_key`` is a full url
        """
        parsed_url = urlparse(self.bucket_key)
        if self.bucket_name is None:
            if parsed_url.netloc == '':
                raise AirflowException('If key is a relative path from root, please provide a bucket_name')
            # Split the full url into its parts and store them on the sensor;
            # subsequent pokes therefore go through the ``else`` branch below.
            self.bucket_name = parsed_url.netloc
            self.bucket_key = parsed_url.path.lstrip('/')
        else:
            if parsed_url.scheme != '' or parsed_url.netloc != '':
                raise AirflowException(
                    'If bucket_name is provided, bucket_key should be relative path from root'
                    ' level, rather than a full s3:// url'
                )

        self.log.info('Poking for key : s3://%s/%s', self.bucket_name, self.bucket_key)
        if self.wildcard_match:
            return self.get_hook().check_for_wildcard_key(self.bucket_key, self.bucket_name)
        return self.get_hook().check_for_key(self.bucket_key, self.bucket_name)

    def get_hook(self) -> S3Hook:
        """Create and return an S3Hook, reusing the one stored on the sensor if any."""
        if not self.hook:
            self.hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
        return self.hook
class S3KeySizeSensor(S3KeySensor):
    """
    Waits for a key (a file-like instance on S3) to be present and be more than
    some size in a S3 bucket.

    S3 being a key/value store, it does not support folders. The path is just
    a key to a resource.

    :param bucket_key: The key being waited on. Supports full s3:// style url
        or relative path from root level. When it's specified as a full s3://
        url, please leave bucket_name as `None`.
    :type bucket_key: str
    :param bucket_name: Name of the S3 bucket. Only needed when ``bucket_key``
        is not provided as a full s3:// url.
    :type bucket_name: str
    :param wildcard_match: whether the bucket_key should be interpreted as a
        Unix wildcard pattern
    :type wildcard_match: bool
    :param aws_conn_id: a reference to the s3 connection
    :type aws_conn_id: str
    :param verify: Whether or not to verify SSL certificates for S3 connection.
        By default SSL certificates are verified.
        You can provide the following values:

        - ``False``: do not validate SSL certificates. SSL will still be used
                 (unless use_ssl is False), but SSL certificates will not be
                 verified.
        - ``path/to/cert/bundle.pem``: A filename of the CA cert bundle to use.
                 You can specify this argument if you want to use a different
                 CA cert bundle than the one used by botocore.
    :type verify: bool or str
    :param check_fn: Function that receives the list of the S3 objects,
        and returns the boolean:

        - ``True``: a certain criteria is met
        - ``False``: the criteria isn't met

        The function is called with the list as its only argument.

        **Example**: Wait for any S3 object size more than 1 megabyte  ::

            def check_fn(data: List) -> bool:
                return any(f.get('Size', 0) > 1048576 for f in data if isinstance(f, dict))
    :type check_fn: Optional[Callable[..., bool]]
    """

    def __init__(
        self,
        *,
        check_fn: Optional[Callable[..., bool]] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Kept under a separate name so it does not shadow the default
        # ``check_fn`` method defined on this class.
        self.check_fn_user = check_fn

    def poke(self, context):
        """
        Check that the key exists and that the listed objects pass the size check.

        :param context: the task context, forwarded to the parent ``poke``
        :return: ``False`` if the parent sensor does not find the key or if
            no objects are listed; otherwise the result of the user supplied
            ``check_fn`` (or of the default :meth:`check_fn` when none was
            given) applied to the listed objects
        """
        if not super().poke(context=context):
            return False

        s3_objects = self.get_files(s3_hook=self.get_hook())
        if not s3_objects:
            return False

        check_fn = self.check_fn if self.check_fn_user is None else self.check_fn_user
        return check_fn(s3_objects)

    def get_files(self, s3_hook: S3Hook, delimiter: Optional[str] = '/') -> List:
        """
        Gets a list of files in the bucket

        :param s3_hook: hook whose connection is used to list the objects
        :type s3_hook: S3Hook
        :param delimiter: delimiter passed to the ``list_objects_v2`` paginator
        :type delimiter: str
        :return: the entries of every page's ``Contents`` whose ``Size`` is an
            ``int`` or a ``float``
        :rtype: list
        """
        prefix = self.bucket_key
        if self.wildcard_match:
            # Only the literal part of the pattern (everything before the first
            # ``[``, ``*`` or ``?``) is used as listing prefix; the pattern
            # itself is not re-applied to the listed objects here.
            prefix = re.split(r'[\[\*\?]', self.bucket_key, maxsplit=1)[0]

        config = {
            'PageSize': None,
            'MaxItems': None,
        }
        paginator = s3_hook.get_conn().get_paginator('list_objects_v2')
        response = paginator.paginate(
            Bucket=self.bucket_name, Prefix=prefix, Delimiter=delimiter, PaginationConfig=config
        )

        keys: List = []
        for page in response:
            if 'Contents' in page:
                # Extend in place instead of rebuilding the list for every page.
                keys.extend(k for k in page['Contents'] if isinstance(k.get('Size', None), (int, float)))
        return keys

    def check_fn(self, data: List, object_min_size: Optional[Union[int, float]] = 0) -> bool:
        """
        Default function for checking that S3 Objects have size more than 0

        Entries of ``data`` that are not dictionaries are ignored, and an entry
        without a ``Size`` key counts as size 0.

        :param data: List of the objects in S3 bucket.
        :type data: list
        :param object_min_size: Checks if the objects sizes are greater than this value.
            ``None`` is treated as 0.
        :type object_min_size: int
        :return: ``True`` if every considered object is larger than
            ``object_min_size`` (also when there is nothing to consider)
        :rtype: bool
        """
        if object_min_size is None:
            # The signature allows None; comparing a size against None would raise.
            object_min_size = 0
        return all(f.get('Size', 0) > object_min_size for f in data if isinstance(f, dict))