Source code for tests.system.providers.amazon.aws.example_comprehend_document_classifier
# Licensed to the Apache Software Foundation (ASF) under one# or more contributor license agreements. See the NOTICE file# distributed with this work for additional information# regarding copyright ownership. The ASF licenses this file# to you under the Apache License, Version 2.0 (the# "License"); you may not use this file except in compliance# with the License. You may obtain a copy of the License at## http://www.apache.org/licenses/LICENSE-2.0## Unless required by applicable law or agreed to in writing,# software distributed under the License is distributed on an# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY# KIND, either express or implied. See the License for the# specific language governing permissions and limitations# under the License.from__future__importannotationsimportosfromdatetimeimportdatetimefromairflowimportDAG,settingsfromairflow.decoratorsimporttask,task_groupfromairflow.modelsimportConnectionfromairflow.models.baseoperatorimportchainfromairflow.providers.amazon.aws.hooks.comprehendimportComprehendHookfromairflow.providers.amazon.aws.operators.comprehendimport(ComprehendCreateDocumentClassifierOperator,)fromairflow.providers.amazon.aws.operators.s3import(S3CreateBucketOperator,S3CreateObjectOperator,S3DeleteBucketOperator,)fromairflow.providers.amazon.aws.sensors.comprehendimport(ComprehendCreateDocumentClassifierCompletedSensor,)fromairflow.providers.amazon.aws.transfers.http_to_s3importHttpToS3Operatorfromairflow.utils.trigger_ruleimportTriggerRulefromtests.system.providers.amazon.aws.utilsimportSystemTestContextBuilder
# To create a custom document classifier, we need a minimum of 10 documents for each label.
# For testing purposes, we will generate 10 copies of each document referenced below.
# NOTE(review): the call site passes this function's return value to chain(),
# so it is presumably decorated (e.g. with @task_group) in the full file -- the
# decorator is not visible in this view; confirm before applying.
def copy_data_to_s3(bucket: str, sources: list[dict], prefix: str, number_of_copies=1):
    """
    Copy some sample data to S3 using HttpToS3Operator.

    EX: If number_of_copies is 2, sources has file name 'file.pdf', and prefix is 'training-docs'.
    Will generate two copies and upload to s3:
        - training-docs/file-0.pdf
        - training-docs/file-1.pdf

    :param bucket: Name of the Amazon S3 bucket to send the data.
    :param sources: Public available data locations; each entry is read for its
        ``endpoint`` and ``fileName`` keys.
    :param prefix: Folder to store the files
    :param number_of_copies: Number of files to create for a document from the sources
    """
    # Split every source file name into (stem, extension) once, instead of
    # re-running basename/splitext twice for each generated key.
    named_sources = [
        (source["endpoint"], *os.path.splitext(os.path.basename(source["fileName"])))
        for source in sources
    ]

    # One mapped-task config per (copy, source) pair; the copy counter is the
    # outer loop, matching the original ordering of the generated configs.
    http_to_s3_configs = [
        {
            "endpoint": endpoint,
            "s3_key": f"{prefix}/{stem}-{counter}{extension}",
        }
        for counter in range(number_of_copies)
        for endpoint, stem, extension in named_sources
    ]

    @task
    def create_connection(conn_id):
        # Register a temporary HTTP connection pointing at GitHub.
        conn = Connection(
            conn_id=conn_id,
            conn_type="http",
            host="https://github.com/",
        )
        session = settings.Session()
        session.add(conn)
        session.commit()

    @task(trigger_rule=TriggerRule.ALL_DONE)
    def delete_connection(conn_id):
        session = settings.Session()
        conn_to_details = session.query(Connection).filter(Connection.conn_id == conn_id).first()
        # This task runs with ALL_DONE, i.e. even when create_connection failed,
        # so the row may not exist; deleting None would raise and hide the real
        # upstream failure.
        if conn_to_details is not None:
            session.delete(conn_to_details)
            session.commit()

    # `http_conn_id` is not a parameter: it is resolved from the enclosing
    # module scope when this function is called.
    http_to_s3_task = HttpToS3Operator.partial(
        task_id="http_to_s3_task",
        http_conn_id=http_conn_id,
        s3_bucket=bucket,
    ).expand_kwargs(http_to_s3_configs)

    chain(create_connection(http_conn_id), http_to_s3_task, delete_connection(http_conn_id))
# NOTE(review): `test_context`, `dag`, ANNOTATION_BUCKET_KEY, TRAINING_DATA_PREFIX,
# ANNOTATIONS, PUBLIC_DATA_SOURCES and document_classifier_workflow are defined
# outside the visible portion of this file. The statements down to the
# `chain(...)` call presumably belong inside the DAG context that is not shown
# here -- confirm the enclosing indentation before applying.

# Every resource name is derived from the per-run environment id.
env_id = test_context["ENV_ID"]
classifier_name = f"{env_id}-custom-document-classifier"
bucket_name = f"{env_id}-comprehend-document-classifier"
http_conn_id = f"{env_id}-git"

# Where the annotation CSV and the training documents live in the bucket, and
# how the documents should be read.
input_data_configurations = {
    "S3Uri": f"s3://{bucket_name}/{ANNOTATION_BUCKET_KEY}",
    "DataFormat": "COMPREHEND_CSV",
    "DocumentType": "SEMI_STRUCTURED_DOCUMENT",
    "Documents": {
        "S3Uri": f"s3://{bucket_name}/{TRAINING_DATA_PREFIX}/",
    },
    "DocumentReaderConfig": {
        "DocumentReadAction": "TEXTRACT_DETECT_DOCUMENT_TEXT",
        "DocumentReadMode": "SERVICE_DEFAULT",
    },
}
output_data_configurations = {
    "S3Uri": f"s3://{bucket_name}/output/",
}
document_classifier_kwargs = {
    "VersionName": "v1",
}

create_bucket = S3CreateBucketOperator(
    bucket_name=bucket_name,
    task_id="create_bucket",
)

upload_annotation_file = S3CreateObjectOperator(
    s3_bucket=bucket_name,
    s3_key=ANNOTATION_BUCKET_KEY,
    data=ANNOTATIONS.encode("utf-8"),
    task_id="upload_annotation_file",
)

# Teardown: ALL_DONE makes the bucket removal run whatever happened upstream.
delete_bucket = S3DeleteBucketOperator(
    bucket_name=bucket_name,
    force_delete=True,
    trigger_rule=TriggerRule.ALL_DONE,
    task_id="delete_bucket",
)

# Build the training-data upload step separately so the chain reads as a
# flat setup / body / teardown list.
copy_training_data = copy_data_to_s3(
    bucket=bucket_name,
    sources=PUBLIC_DATA_SOURCES,
    prefix=TRAINING_DATA_PREFIX,
    number_of_copies=10,
)

chain(
    # TEST SETUP
    test_context,
    create_bucket,
    upload_annotation_file,
    copy_training_data,
    # TEST BODY
    document_classifier_workflow(),
    # TEST TEARDOWN
    delete_bucket,
)

from tests.system.utils.watcher import watcher

# This test needs watcher in order to properly mark success/failure
# when "tearDown" task with trigger rule is part of the DAG
list(dag.tasks) >> watcher()

from tests.system.utils import get_test_run  # noqa: E402

# Needed to run the example DAG with pytest (see: tests/system/README.md#run_via_pytest)