Source code for tests.system.amazon.aws.example_bedrock_evaluation

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations

from datetime import datetime

from airflow.providers.amazon.aws.operators.bedrock import BedrockCreateEvaluationJobOperator
from airflow.providers.amazon.aws.operators.s3 import (
    S3CreateBucketOperator,
    S3CreateObjectOperator,
    S3DeleteBucketOperator,
)
from airflow.providers.common.compat.sdk import DAG, chain

from system.amazon.aws.utils import ENV_ID_KEY, SystemTestContextBuilder
from tests_common.test_utils.version_compat import AIRFLOW_V_3_0_PLUS

if AIRFLOW_V_3_0_PLUS:
    from airflow.sdk import TriggerRule, task
else:
    from airflow.decorators import task  # type: ignore[attr-defined,no-redef]
    from airflow.utils.trigger_rule import TriggerRule  # type: ignore[no-redef,attr-defined]

[docs] DAG_ID = "example_bedrock_evaluation"
[docs] ROLE_ARN_KEY = "ROLE_ARN"
[docs] sys_test_context_task = SystemTestContextBuilder().add_variable(ROLE_ARN_KEY).build()
# Minimal JSONL dataset for Summarization task
[docs] EVAL_DATASET = ( '{"prompt": "Summarize: The quick brown fox jumps over the lazy dog.",' ' "referenceResponse": "A fox jumps over a dog."}\n' )
with DAG( dag_id=DAG_ID, schedule=None, start_date=datetime(2024, 1, 1), catchup=False, ) as dag:
[docs] test_context = sys_test_context_task()
env_id = test_context[ENV_ID_KEY] role_arn = test_context[ROLE_ARN_KEY] bucket_name = "airflow-system-test-bedrock-eval" # TEST SETUP create_bucket = S3CreateBucketOperator( task_id="create_bucket", bucket_name=bucket_name, ) upload_dataset = S3CreateObjectOperator( task_id="upload_dataset", s3_bucket=bucket_name, s3_key="datasets/summarization.jsonl", data=EVAL_DATASET, ) # TEST BODY # [START howto_operator_bedrock_create_evaluation_job] create_evaluation_job = BedrockCreateEvaluationJobOperator( task_id="create_evaluation_job", job_name=f"{env_id}-eval", role_arn=role_arn, evaluation_config={ "automated": { "datasetMetricConfigs": [ { "taskType": "Summarization", "dataset": { "name": "eval-dataset", "datasetLocation": {"s3Uri": f"s3://{bucket_name}/datasets/summarization.jsonl"}, }, "metricNames": ["Builtin.Accuracy"], } ] } }, inference_config={ "models": [ { "bedrockModel": { "modelIdentifier": "us.anthropic.claude-haiku-4-5-20251001-v1:0", "inferenceParams": "{}", } } ] }, output_data_config={"s3Uri": f"s3://{bucket_name}/output/"}, ) # [END howto_operator_bedrock_create_evaluation_job] # TEST TEARDOWN @task(trigger_rule=TriggerRule.ALL_DONE) def stop_evaluation_job(job_arn: str): import contextlib import boto3 with contextlib.suppress(Exception): boto3.client("bedrock").stop_evaluation_job(jobIdentifier=job_arn) delete_bucket = S3DeleteBucketOperator( task_id="delete_bucket", bucket_name=bucket_name, force_delete=True, trigger_rule=TriggerRule.ALL_DONE, ) chain( # TEST SETUP test_context, create_bucket, upload_dataset, # TEST BODY create_evaluation_job, # TEST TEARDOWN stop_evaluation_job(create_evaluation_job.output), delete_bucket, ) from tests_common.test_utils.watcher import watcher list(dag.tasks) >> watcher() from tests_common.test_utils.system_tests import get_test_run # noqa: E402
[docs] test_run = get_test_run(dag)

Was this entry helpful?