diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..31c8f3d Binary files /dev/null and b/.DS_Store differ diff --git a/demo/.DS_Store b/demo/.DS_Store new file mode 100644 index 0000000..fc99d1f Binary files /dev/null and b/demo/.DS_Store differ diff --git a/docs/.DS_Store b/docs/.DS_Store new file mode 100644 index 0000000..96146a4 Binary files /dev/null and b/docs/.DS_Store differ diff --git a/src/.DS_Store b/src/.DS_Store new file mode 100644 index 0000000..fa26f4d Binary files /dev/null and b/src/.DS_Store differ diff --git a/stepfunctions/.DS_Store b/stepfunctions/.DS_Store new file mode 100644 index 0000000..6f6161b Binary files /dev/null and b/stepfunctions/.DS_Store differ diff --git a/stepfunctions/.gitignore b/stepfunctions/.gitignore index 37833f8..c5aed14 100644 --- a/stepfunctions/.gitignore +++ b/stepfunctions/.gitignore @@ -4,6 +4,7 @@ __pycache__ .pytest_cache .venv *.egg-info +*.dist-info # CDK asset staging directory .cdk.staging diff --git a/stepfunctions/agent_evaluation-0.2.0.dist-info/INSTALLER b/stepfunctions/agent_evaluation-0.2.0.dist-info/INSTALLER deleted file mode 100644 index a1b589e..0000000 --- a/stepfunctions/agent_evaluation-0.2.0.dist-info/INSTALLER +++ /dev/null @@ -1 +0,0 @@ -pip diff --git a/stepfunctions/agent_evaluation-0.2.0.dist-info/LICENSE b/stepfunctions/agent_evaluation-0.2.0.dist-info/LICENSE deleted file mode 100644 index 67db858..0000000 --- a/stepfunctions/agent_evaluation-0.2.0.dist-info/LICENSE +++ /dev/null @@ -1,175 +0,0 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. diff --git a/stepfunctions/agent_evaluation-0.2.0.dist-info/METADATA b/stepfunctions/agent_evaluation-0.2.0.dist-info/METADATA deleted file mode 100644 index 1070391..0000000 --- a/stepfunctions/agent_evaluation-0.2.0.dist-info/METADATA +++ /dev/null @@ -1,74 +0,0 @@ -Metadata-Version: 2.1 -Name: agent-evaluation -Version: 0.2.0 -Summary: A generative AI-powered framework for testing virtual agents. -Home-page: https://awslabs.github.io/agent-evaluation/ -Author: Amazon Web Services -Author-email: agent-evaluation-oss-core-team@amazon.com -License: Apache 2.0 -Classifier: Development Status :: 4 - Beta -Classifier: Intended Audience :: Developers -Classifier: Topic :: Utilities -Classifier: Topic :: Software Development :: Testing -Classifier: License :: OSI Approved :: Apache Software License -Classifier: Programming Language :: Python :: 3 :: Only -Classifier: Programming Language :: Python :: 3.9 -Classifier: Programming Language :: Python :: 3.10 -Classifier: Programming Language :: Python :: 3.11 -Classifier: Programming Language :: Python :: 3.12 -Requires-Python: >=3.9 -Description-Content-Type: text/markdown -License-File: LICENSE -License-File: NOTICE -Requires-Dist: pyyaml ~=6.0 -Requires-Dist: boto3 <2.0,>=1.34.20 -Requires-Dist: click ~=8.0 -Requires-Dist: pydantic <3.0,>=2.1.0 -Requires-Dist: rich <14.0,>=13.7.0 -Requires-Dist: jinja2 <4.0,>=3.1.3 -Requires-Dist: jsonpath-ng <2.0,>=1.6.1 -Provides-Extra: dev -Requires-Dist: flake8 ; extra == 'dev' -Requires-Dist: black ; extra == 'dev' -Requires-Dist: isort ; extra == 'dev' -Requires-Dist: pytest ; extra == 'dev' -Requires-Dist: pytest-cov ; extra == 'dev' -Requires-Dist: pytest-mock ; extra == 'dev' -Requires-Dist: mkdocs ; extra == 'dev' -Requires-Dist: mkdocs-material ; extra == 'dev' -Requires-Dist: mkdocstrings[python] ; extra == 'dev' -Requires-Dist: mkdocs-click ; extra == 'dev' -Requires-Dist: bandit ; extra == 'dev' -Requires-Dist: pip-audit ; extra == 'dev' - -![PyPI - Version](https://img.shields.io/pypi/v/agent-evaluation) -![PyPI - Python Version](https://img.shields.io/pypi/pyversions/agent-evaluation) -![GitHub License](https://img.shields.io/github/license/awslabs/agent-evaluation) -[![security: bandit](https://img.shields.io/badge/security-bandit-yellow.svg)](https://github.com/PyCQA/bandit) -[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) -[![Built with Material for MkDocs](https://img.shields.io/badge/Material_for_MkDocs-526CFE?style=for-the-badge&logo=MaterialForMkDocs&logoColor=white)](https://squidfunk.github.io/mkdocs-material/) - -# Agent Evaluation - -Agent Evaluation is a generative AI-powered framework for testing virtual agents. - -Internally, Agent Evaluation implements an LLM agent (evaluator) that will orchestrate conversations with your own agent (target) and evaluate the responses during the conversation. - -## ✨ Key features - -- Built-in support for popular AWS services including [Amazon Bedrock](https://aws.amazon.com/bedrock/), [Amazon Q Business](https://aws.amazon.com/q/business/), and [Amazon SageMaker](https://aws.amazon.com/sagemaker/). You can also [bring your own agent](https://awslabs.github.io/agent-evaluation/targets/custom_targets/) to test using Agent Evaluation. -- Orchestrate concurrent, multi-turn conversations with your agent while evaluating its responses. -- Define [hooks](https://awslabs.github.io/agent-evaluation/hooks/) to perform additional tasks such as integration testing. -- Can be incorporated into CI/CD pipelines to expedite the time to delivery while maintaining the stability of agents in production environments. - -## 📚 Documentation - -To get started, please visit the full documentation [here](https://awslabs.github.io/agent-evaluation/). To contribute, please refer to [CONTRIBUTING.md](./CONTRIBUTING.md) - -## 👏 Contributors - -Shout out to these awesome contributors: - - - - diff --git a/stepfunctions/agent_evaluation-0.2.0.dist-info/NOTICE b/stepfunctions/agent_evaluation-0.2.0.dist-info/NOTICE deleted file mode 100644 index 616fc58..0000000 --- a/stepfunctions/agent_evaluation-0.2.0.dist-info/NOTICE +++ /dev/null @@ -1 +0,0 @@ -Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. diff --git a/stepfunctions/agent_evaluation-0.2.0.dist-info/RECORD b/stepfunctions/agent_evaluation-0.2.0.dist-info/RECORD deleted file mode 100644 index fcc2eac..0000000 --- a/stepfunctions/agent_evaluation-0.2.0.dist-info/RECORD +++ /dev/null @@ -1,87 +0,0 @@ -../../../bin/agenteval,sha256=sKahy-HYfncxw3pVqCLLgxIokhvln3Qm9eDSvskMrV8,250 -agent_evaluation-0.2.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4 -agent_evaluation-0.2.0.dist-info/LICENSE,sha256=CeipvOyAZxBGUsFoaFqwkx54aPnIKEtm9a5u2uXxEws,10142 -agent_evaluation-0.2.0.dist-info/METADATA,sha256=WOBzlzyr7ozBQpU_A99iEq8w2O-j-Zii-Q2al9A3D_Y,3759 -agent_evaluation-0.2.0.dist-info/NOTICE,sha256=1CkO1kwu3Q_OHYTj-d-yiBJA_lNN73a4zSntavaD4oc,67 -agent_evaluation-0.2.0.dist-info/RECORD,, -agent_evaluation-0.2.0.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -agent_evaluation-0.2.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92 -agent_evaluation-0.2.0.dist-info/entry_points.txt,sha256=DxxfiIbldqK82WRgaMOL4BjPJcnr7JOkDYTch6xNahs,48 -agent_evaluation-0.2.0.dist-info/top_level.txt,sha256=k6izISLxVoNnLxZHqnS3X-0eDdUD8LsV-OoM0afYdew,10 -agenteval/__init__.py,sha256=JQm11m01-rh2PjCw1OVqgy1rkU725Q6vMqfDtLbRH2U,1150 -agenteval/__pycache__/__init__.cpython-311.pyc,, -agenteval/__pycache__/cli.cpython-311.pyc,, -agenteval/__pycache__/conversation.cpython-311.pyc,, -agenteval/__pycache__/defaults.cpython-311.pyc,, -agenteval/__pycache__/hook.cpython-311.pyc,, -agenteval/__pycache__/plan.cpython-311.pyc,, -agenteval/__pycache__/target_response.cpython-311.pyc,, -agenteval/__pycache__/test.cpython-311.pyc,, -agenteval/__pycache__/test_result.cpython-311.pyc,, -agenteval/__pycache__/trace.cpython-311.pyc,, -agenteval/cli.py,sha256=wl0z_pCCKuu9lJgUWrS8cuHkvMYclhd-aCmCB6MN8u8,2807 -agenteval/conversation.py,sha256=r3fvnsnVI7zRoi_RS2JjPahUtLXF9vhnZYJcx1RMe3M,1030 -agenteval/defaults.py,sha256=PB1UniJ-uyiBn0WWSA3EI1UxcfpU2wlvsJZmhWgdV5E,280 -agenteval/evaluators/__init__.py,sha256=U6uQ6THgK0yxMnqVKL5l7_zUUxepoo11W1fPLa9xgNE,247 -agenteval/evaluators/__pycache__/__init__.cpython-311.pyc,, -agenteval/evaluators/__pycache__/base_evaluator.cpython-311.pyc,, -agenteval/evaluators/__pycache__/evaluator_factory.cpython-311.pyc,, -agenteval/evaluators/base_evaluator.py,sha256=zpWFBhQdaB-__TdiM7lFlkvQFX27KSFvzDFQ1KBvuLw,5052 -agenteval/evaluators/claude_3/__init__.py,sha256=mKv_FTRrhYIIS86zqxzj5edy-tKREHsn3nXUBmck71Q,180 -agenteval/evaluators/claude_3/__pycache__/__init__.cpython-311.pyc,, -agenteval/evaluators/claude_3/__pycache__/evaluator.cpython-311.pyc,, -agenteval/evaluators/claude_3/__pycache__/model_configs.cpython-311.pyc,, -agenteval/evaluators/claude_3/evaluator.py,sha256=k-ZXtKBtywVYy1XEAkSufb9LYXlAElaklV8Wao-udLo,7751 -agenteval/evaluators/claude_3/model_configs.py,sha256=KUf0C5Bbgc-c05ZZlokVgjHVH4WGdoOfKtwQWwuQFLY,635 -agenteval/evaluators/evaluator_factory.py,sha256=JCTVoN62QNMcKR68KY2Li8zpm55HNvYwVBXZ0Yi3rhQ,712 -agenteval/hook.py,sha256=z8UfREnySi2E6tRwjeklI3CwjWQ5MMk59wLHj6TK9C0,1049 -agenteval/plan.py,sha256=tIXTXepcVZEA8JX0yoEzsSuLDVpqSYvBdKsGJYYCVbU,3236 -agenteval/runner/__init__.py,sha256=6f0fmworOJ0fn2MNzDg52zbip4osTovhwetT6ZQnI74,157 -agenteval/runner/__pycache__/__init__.cpython-311.pyc,, -agenteval/runner/__pycache__/runner.cpython-311.pyc,, -agenteval/runner/__pycache__/summary.cpython-311.pyc,, -agenteval/runner/runner.py,sha256=wSYcX82WTMwmMFCfqoHjxq8NTnV1_UdPr4A1fnmkD_U,3937 -agenteval/runner/summary.py,sha256=jTdFRFo7zAaE-PTA6Cy3n1cndgFB14vA20MDO9FeJyE,872 -agenteval/target_response.py,sha256=R_Gy-655vPEsSO7X2siU2GNiFPRl1CkRetiON8WYEGM,285 -agenteval/targets/__init__.py,sha256=JmGtuue6VQYkK5jAiArxlbnRQsA23p8NgDTMvnCWyGU,282 -agenteval/targets/__pycache__/__init__.cpython-311.pyc,, -agenteval/targets/__pycache__/base_target.cpython-311.pyc,, -agenteval/targets/__pycache__/boto3_target.cpython-311.pyc,, -agenteval/targets/__pycache__/target_factory.cpython-311.pyc,, -agenteval/targets/base_target.py,sha256=aYW5dLAlbKgscdf8XTcV9Bppbay-pz-c_y5RtCgdBD0,743 -agenteval/targets/bedrock_agent/__init__.py,sha256=2B5TCxdyQAXuQRtji0lclk5odB7xgT5Hi_dBwjErIzo,73 -agenteval/targets/bedrock_agent/__pycache__/__init__.cpython-311.pyc,, -agenteval/targets/bedrock_agent/__pycache__/target.cpython-311.pyc,, -agenteval/targets/bedrock_agent/target.py,sha256=GRfn4dOGkARF_3_DBupgoHrbiYQZADfqwXO65Z2-RDM,1332 -agenteval/targets/bedrock_knowledge_base/__init__.py,sha256=tYJixJ0x9ohkM7oker8eX7U4vkkxqV_xVlA4CsWIuec,89 -agenteval/targets/bedrock_knowledge_base/__pycache__/__init__.cpython-311.pyc,, -agenteval/targets/bedrock_knowledge_base/__pycache__/target.cpython-311.pyc,, -agenteval/targets/bedrock_knowledge_base/target.py,sha256=jOsAixfOSy6jEQF6p_uCwDLP7M1WB64F6K49CbtiSYc,1401 -agenteval/targets/boto3_target.py,sha256=qNukrm2GZOrG81pJc61BrJEFcNB_f80cvvWQyMFRQiA,1271 -agenteval/targets/q_business/__init__.py,sha256=1KT5BdoA_KD2fX3gNLvSyg9K5x0OfWBN8X15nxJf13U,67 -agenteval/targets/q_business/__pycache__/__init__.cpython-311.pyc,, -agenteval/targets/q_business/__pycache__/target.cpython-311.pyc,, -agenteval/targets/q_business/target.py,sha256=Bv9YiXcnBoUmXFN3nfCh2FNLNP9vMm_1ruWVlDGsXXs,1014 -agenteval/targets/sagemaker_endpoint/__init__.py,sha256=whoMO69GOhPMNOrbQAfYzVmIXuxhxt8dHJGABnR4_Ck,83 -agenteval/targets/sagemaker_endpoint/__pycache__/__init__.cpython-311.pyc,, -agenteval/targets/sagemaker_endpoint/__pycache__/target.cpython-311.pyc,, -agenteval/targets/sagemaker_endpoint/target.py,sha256=zLsgkOljavYzrjrVnY3qDOjc-zsKFPSIdqugsZZy6po,2677 -agenteval/targets/target_factory.py,sha256=W8mzSy3E44jpYJs6XLD2WaLAaXXZ_T_WGw49CyPLigQ,1092 -agenteval/templates/evaluators/claude_3/generate_evaluation.jinja,sha256=aaTBZnr-3J29SpdernWW8bmQzF7lV0-bed1glZk36Yk,287 -agenteval/templates/evaluators/claude_3/generate_initial_prompt.jinja,sha256=wIhfhNUsTVdeIDBJNH1QWIBQWVE8h0Lc958vuuNU_eE,43 -agenteval/templates/evaluators/claude_3/generate_test_status.jinja,sha256=2T9HuihEVtGvq-ncxl6hLrTZXi2wAYu3cQhCUl0F_qY,238 -agenteval/templates/evaluators/claude_3/generate_user_response.jinja,sha256=2T9HuihEVtGvq-ncxl6hLrTZXi2wAYu3cQhCUl0F_qY,238 -agenteval/templates/evaluators/claude_3/system/generate_evaluation.jinja,sha256=3ihPICeDofWljtl6YpUJQM-lJSPNeWjhjgGndKM1wYQ,554 -agenteval/templates/evaluators/claude_3/system/generate_initial_prompt.jinja,sha256=DR1UaUvn0u_8MD0cSHAWSPLfEIwnGCKlEFPkuUAKLDQ,566 -agenteval/templates/evaluators/claude_3/system/generate_test_status.jinja,sha256=akAKahEda6A3-XhVjXpacGR3e48HrbqE4UT4ONlqVZg,587 -agenteval/templates/evaluators/claude_3/system/generate_user_response.jinja,sha256=yCy-IkJRM2y9-pPbaZaNrT-_4J7x9YM9kMgMXeYf5D4,800 -agenteval/templates/summary/agenteval_summary.md.jinja,sha256=Ri9B_lIpewlBtvs0ggj4IO9FbIZlMq70aDBZg_-xfQk,1107 -agenteval/test.py,sha256=mMbZWI5Yv6oQDS4xh5gCUvAj_IOih3vurqsMJs_9KbM,806 -agenteval/test_result.py,sha256=pDdXfrhIQtgO3au0XaxNLY1uql-POqZrlgu2vtNa0fc,738 -agenteval/trace.py,sha256=9JhT1i295AbKk1Zaj7Qa9EiXW1IJu-GsbOZ1hs8kiEU,2090 -agenteval/utils/__init__.py,sha256=xgJ0V8V34ju5tDEaX-WDBwXLTwMjFBztdYJ5lk2Y-OE,230 -agenteval/utils/__pycache__/__init__.cpython-311.pyc,, -agenteval/utils/__pycache__/aws.cpython-311.pyc,, -agenteval/utils/__pycache__/imports.cpython-311.pyc,, -agenteval/utils/aws.py,sha256=z6YjWUK1MhMl0Z6J-vxZiRBaHv8d444avFxEMjicq0c,1115 -agenteval/utils/imports.py,sha256=i-cd9Ze6LWeaBktGHgZkWLa6W_iUa11vTOBc5CQrfzA,1106 diff --git a/stepfunctions/agent_evaluation-0.2.0.dist-info/REQUESTED b/stepfunctions/agent_evaluation-0.2.0.dist-info/REQUESTED deleted file mode 100644 index e69de29..0000000 diff --git a/stepfunctions/agent_evaluation-0.2.0.dist-info/WHEEL b/stepfunctions/agent_evaluation-0.2.0.dist-info/WHEEL deleted file mode 100644 index bab98d6..0000000 --- a/stepfunctions/agent_evaluation-0.2.0.dist-info/WHEEL +++ /dev/null @@ -1,5 +0,0 @@ -Wheel-Version: 1.0 -Generator: bdist_wheel (0.43.0) -Root-Is-Purelib: true -Tag: py3-none-any - diff --git a/stepfunctions/agent_evaluation-0.2.0.dist-info/entry_points.txt b/stepfunctions/agent_evaluation-0.2.0.dist-info/entry_points.txt deleted file mode 100644 index 6919bf1..0000000 --- a/stepfunctions/agent_evaluation-0.2.0.dist-info/entry_points.txt +++ /dev/null @@ -1,2 +0,0 @@ -[console_scripts] -agenteval = agenteval.cli:cli diff --git a/stepfunctions/agent_evaluation-0.2.0.dist-info/top_level.txt b/stepfunctions/agent_evaluation-0.2.0.dist-info/top_level.txt deleted file mode 100644 index 060c7ea..0000000 --- a/stepfunctions/agent_evaluation-0.2.0.dist-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -agenteval diff --git a/stepfunctions/agenteval/__init__.py b/stepfunctions/agenteval/__init__.py deleted file mode 100644 index cd7bf51..0000000 --- a/stepfunctions/agenteval/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from importlib.metadata import version - -import logging -import os - -from jinja2 import Environment, PackageLoader, select_autoescape -from rich.logging import RichHandler - -from .hook import Hook -from .target_response import TargetResponse - -__all__ = ["Hook", "TargetResponse"] -__version__ = version("agent-evaluation") - - -_LOG_LEVEL_ENV = "LOG_LEVEL" - - -def configure_logger(): - # supress logs from botocore - logging.getLogger("botocore").setLevel(logging.CRITICAL) - - # configure logging using rich - formatter = logging.Formatter("%(message)s", datefmt="[%X]") - handler = RichHandler(markup=True, show_level=True, rich_tracebacks=True) - handler.setFormatter(formatter) - - logger = logging.getLogger(__name__) - - logger.setLevel(os.environ.get(_LOG_LEVEL_ENV, logging.INFO)) - logger.addHandler(handler) - - -configure_logger() - -jinja_env = Environment( - loader=PackageLoader(__name__), - autoescape=select_autoescape( - disabled_extensions=["jinja"], - default_for_string=True, - default=True, - ), -) diff --git a/stepfunctions/agenteval/cli.py b/stepfunctions/agenteval/cli.py deleted file mode 100644 index 940f621..0000000 --- a/stepfunctions/agenteval/cli.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -import logging -import os -from typing import Optional - -import click - -from agenteval.plan import Plan -from agenteval.runner import Runner - -logger = logging.getLogger(__name__) - - -def validate_directory(directory): - if not os.path.isdir(directory): - raise NotADirectoryError(f"{directory} is not a directory") - if not os.access(directory, os.R_OK) or not os.access(directory, os.W_OK): - raise PermissionError(f"No read/write permissions for {directory}") - - -@click.group() -def cli(): - pass - - -@cli.command(help="Initialize a test plan.") -@click.option( - "--plan-dir", - type=str, - required=False, - help="The destination directory for storing the test plan. If unspecified, then the test plan is saved to the current working directory.", -) -def init(plan_dir: Optional[str]): - if plan_dir: - validate_directory(plan_dir) - try: - path = Plan.init_plan(plan_dir) - logger.info(f"[green]Test plan created at {path}") - - except FileExistsError as e: - logger.error(f"[red]{e}") - exit(1) - - -@cli.command(help="Run test plan.") -@click.option( - "--filter", - type=str, - required=False, - help="Specifies the test(s) to run. Multiple tests should be seperated using a comma. If unspecified, all tests from the test plan will be run.", -) -@click.option( - "--plan-dir", - type=str, - required=False, - help="The directory where the test plan is stored. If unspecified, then the current working directory is used.", -) -@click.option( - "--verbose", - is_flag=True, - type=bool, - default=False, - help="Controls the verbosity of the terminal logs.", -) -@click.option( - "--num-threads", - type=int, - required=False, - help="Number of threads (and thus tests) to run concurrently. If unspecified, number of threads will be capped at 45.", -) -@click.option( - "--work-dir", - type=str, - required=False, - help="The directory where the test result and trace will be generated. If unspecified, then the current working directory is used.", -) -def run( - filter: Optional[str], - plan_dir: Optional[str], - verbose: bool, - num_threads: Optional[int], - work_dir: Optional[str], -): - try: - plan = Plan.load(plan_dir, filter) - if work_dir: - validate_directory(work_dir) - runner = Runner( - plan, - verbose, - num_threads, - work_dir, - ) - num_failed = runner.run() - _num_failed_exit(num_failed) - - except Exception as e: - _exception_exit(e) - - -def _num_failed_exit(num_failed): - exit(1 if num_failed else 0) - - -def _exception_exit(e): - logger.exception(f"Error running test: {e}") - exit(1) diff --git a/stepfunctions/agenteval/conversation.py b/stepfunctions/agenteval/conversation.py deleted file mode 100644 index 59e4304..0000000 --- a/stepfunctions/agenteval/conversation.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -_USER = "USER" -_AGENT = "AGENT" -_START_TURN_COUNT = 0 - - -class Conversation: - """Captures the interaction between a user and an agent. - - Attributes: - messages (list): A list of tuples of the form (role, message). - turns (int): The number of turns in the conversation. - """ - - def __init__(self): - self.messages = [] - self.turns = _START_TURN_COUNT - - def __iter__(self): - """Allow iteration over conversation messages.""" - return iter(self.messages) - - def add_turn(self, user_message: str, agent_response: str): - """Record a turn in the conversation. - - Args: - user_message (str): The users's message - agent_response (str): The agent's response to the user's message - - Increments the `turn` counter by `1`. - """ - self.messages.extend([(_USER, user_message), (_AGENT, agent_response)]) - self.turns += 1 diff --git a/stepfunctions/agenteval/defaults.py b/stepfunctions/agenteval/defaults.py deleted file mode 100644 index 929c675..0000000 --- a/stepfunctions/agenteval/defaults.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -MAX_TURNS = 2 - -# Default max number of threads not exceeding Bedrock service quota: -# https://docs.aws.amazon.com/bedrock/latest/userguide/quotas.html -MAX_NUM_THREADS = 45 diff --git a/stepfunctions/agenteval/evaluators/__init__.py b/stepfunctions/agenteval/evaluators/__init__.py deleted file mode 100644 index 8e52702..0000000 --- a/stepfunctions/agenteval/evaluators/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from .base_evaluator import BaseEvaluator -from .evaluator_factory import EvaluatorFactory - -__all__ = ["BaseEvaluator", "EvaluatorFactory"] diff --git a/stepfunctions/agenteval/evaluators/base_evaluator.py b/stepfunctions/agenteval/evaluators/base_evaluator.py deleted file mode 100644 index e1bd4c9..0000000 --- a/stepfunctions/agenteval/evaluators/base_evaluator.py +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -import json -from abc import ABC, abstractmethod -from typing import Optional - -from agenteval.conversation import Conversation -from agenteval.hook import Hook -from agenteval.targets import BaseTarget -from agenteval.test import Test -from agenteval.test_result import TestResult -from agenteval.trace import Trace -from agenteval.utils import create_boto3_client, import_class - -_DEFAULT_MAX_RETRY = 10 -_BOTO3_SERVICE_NAME = "bedrock-runtime" - - -class BaseEvaluator(ABC): - """The `BaseEvaluator` abstract base class defines the common interface for evaluator - classes. - - Attributes: - test (Test): The test case. - target (BaseTarget): The target agent being evaluated. - conversation (Conversation): Captures the interaction between a user and an agent. - trace (Trace): Captures steps during evaluation. - test_result (TestResult): The result of the test which is set in `BaseEvaluator.run`. - input_token_count (int): Number of input tokens processed by the evaluator. - output_token_count (int): Number of output tokens generated by the evaluator. - model_id (str): The ID of the Bedrock model used to run evaluation. If `provisioned_throughput_arn` is provided, - then this will be set to the ARN of the provisioned throughput. - boto3_client (BaseClient): A `boto3` client representing Amazon Bedrock Runtime. - """ - - def __init__( - self, - test: Test, - target: BaseTarget, - work_dir: str, - model_id: str, - provisioned_throughput_arn: Optional[str] = None, - aws_profile: Optional[str] = None, - aws_region: Optional[str] = None, - endpoint_url: Optional[str] = None, - max_retry: int = _DEFAULT_MAX_RETRY, - ): - """Initialize the evaluator instance for a given `Test` and `Target`. - - Args: - test (Test): The test case. - target (BaseTarget): The target agent being evaluated. - work_dir (str): The work directory. - model_id (str): The ID of the Bedrock model used to run evaluation. - provisioned_throughput_arn (str, optional): The ARN of the provisioned throughput. - aws_profile (str, optional): The AWS profile name. - aws_region (str, optional): The AWS region. - endpoint_url (str, optional): The endpoint URL for the AWS service. - max_retry (int, optional): The maximum number of retry attempts. - """ - self.test = test - self.target = target - self.conversation = Conversation() - self.trace = Trace(work_dir=work_dir, test_name=test.name) - self.test_result = None - self.input_token_count = 0 - self.output_token_count = 0 - self.model_id = provisioned_throughput_arn or model_id - self.bedrock_runtime_client = create_boto3_client( - boto3_service_name=_BOTO3_SERVICE_NAME, - aws_profile=aws_profile, - aws_region=aws_region, - endpoint_url=endpoint_url, - max_retry=max_retry, - ) - - @abstractmethod - def evaluate(self) -> TestResult: - """Conduct a test. - - Returns: - TestResult: The result of the test. - """ - pass - - def _get_hook_cls(self, hook: Optional[str]) -> Optional[type[Hook]]: - if hook: - hook_cls = import_class(hook, parent_class=Hook) - return hook_cls - - def invoke_model(self, request_body: dict) -> dict: - """ - Invoke the Bedrock model using the `boto3_client`. This method will convert - a request dictionary to a JSON string before passing it to the `InvokeModel` API. - - Refer to the `boto3` documentation for more details. - - Args: - request_body (dict): The request payload as a dictionary. - - Returns: - dict: The response from the model invocation. - - """ - response = self.bedrock_runtime_client.invoke_model( - modelId=self.model_id, body=json.dumps(request_body) - ) - - self._incr_token_counts(response) - - return response - - def _incr_token_counts(self, response: dict): - headers = response["ResponseMetadata"]["HTTPHeaders"] - - self.input_token_count += int( - headers.get("x-amzn-bedrock-input-token-count", 0) - ) - self.output_token_count += int( - headers.get("x-amzn-bedrock-output-token-count", 0) - ) - - def run(self) -> TestResult: - """ - Run the evaluator within a trace context manager and run hooks - if provided. - """ - - hook_cls = self._get_hook_cls(self.test.hook) - - with self.trace: - if hook_cls: - hook_cls.pre_evaluate(self.test, self.trace) - self.test_result = self.evaluate() - if hook_cls: - hook_cls.post_evaluate(self.test, self.test_result, self.trace) - - return self.test_result diff --git a/stepfunctions/agenteval/evaluators/claude_3/__init__.py b/stepfunctions/agenteval/evaluators/claude_3/__init__.py deleted file mode 100644 index 338be7d..0000000 --- a/stepfunctions/agenteval/evaluators/claude_3/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from .evaluator import Claude3Evaluator - -__all__ = ["Claude3Evaluator"] diff --git a/stepfunctions/agenteval/evaluators/claude_3/evaluator.py b/stepfunctions/agenteval/evaluators/claude_3/evaluator.py deleted file mode 100644 index cc8b3ae..0000000 --- a/stepfunctions/agenteval/evaluators/claude_3/evaluator.py +++ /dev/null @@ -1,244 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -import json -import logging -import os -import re -from typing import Tuple - -from agenteval import jinja_env -from agenteval.evaluators import BaseEvaluator -from agenteval.evaluators.claude_3 import model_configs -from agenteval.test_result import TestResult - -logger = logging.getLogger(__name__) - -_PROMPT_TEMPLATE_ROOT = "evaluators/claude_3" -_SYSTEM_PROMPT_DIR = "system" -_PROMPT_TEMPLATE_NAMES = [ - "generate_initial_prompt", - "generate_user_response", - "generate_test_status", - "generate_evaluation", -] - -# enable backwards-compatible StrEnum -try: - from enum import StrEnum -except ImportError: - from enum import Enum - - class StrEnum(str, Enum): - pass - - -class TestStatusCategories(StrEnum): - ALL_STEPS_ATTEMPTED = "A" - NOT_ALL_STEPS_ATTEMPTED = "B" - - -class EvaluationCategories(StrEnum): - ALL_EXPECTED_RESULTS_OBSERVED = "A" - NOT_ALL_EXPECTED_RESULTS_OBSERVED = "B" - - -class Results(StrEnum): - MAX_TURNS_REACHED = "Maximum turns reached." - ALL_EXPECTED_RESULTS_OBSERVED = ( - "All of the expected results can be observed in the conversation." - ) - NOT_ALL_EXPECTED_RESULTS_OBSERVED = ( - "Not all of the expected results can be observed in the conversation." - ) - - -class Claude3Evaluator(BaseEvaluator): - def __init__( - self, - **kwargs, - ): - super().__init__(model_id=model_configs.MODEL_ID, **kwargs) - - self._prompt_template_map = { - name: { - "system": jinja_env.get_template( - os.path.join( - _PROMPT_TEMPLATE_ROOT, _SYSTEM_PROMPT_DIR, f"{name}.jinja" - ) - ), - "prompt": jinja_env.get_template( - os.path.join(_PROMPT_TEMPLATE_ROOT, f"{name}.jinja") - ), - } - for name in _PROMPT_TEMPLATE_NAMES - } - - @staticmethod - def _extract_content_from_xml(xml_data: str, element_names: list[str]) -> Tuple: - content = [] - for e in element_names: - pattern = rf"<{e}>(.*?)" - match = re.search(pattern, xml_data, re.DOTALL) - content.append(match.group(1).strip() if match else None) - return tuple(content) - - def _generate( - self, - system_prompt: str, - prompt: str, - output_xml_element: str, - ) -> str: - request_body = model_configs.REQUEST_BODY - request_body["system"] = system_prompt - request_body["messages"][0]["content"][0]["text"] = prompt - - response = self.invoke_model(request_body=request_body) - response_body = response.get("body").read() - completion = json.loads(response_body)["content"][0]["text"] - - logger.debug( - f"[{self.test.name}]\n[PROMPT]\n{prompt}\n[COMPLETION]\n{completion}" - ) - - output, reasoning = self._extract_content_from_xml( - completion, [output_xml_element, "thinking"] - ) - - return output, reasoning - - def _generate_initial_prompt(self) -> str: - system_prompt = self._prompt_template_map["generate_initial_prompt"][ - "system" - ].render() - prompt = self._prompt_template_map["generate_initial_prompt"]["prompt"].render( - step=self.test.steps[0] - ) - - initial_prompt, reasoning = self._generate( - system_prompt=system_prompt, - prompt=prompt, - output_xml_element="initial_prompt", - ) - - self.trace.add_step( - system_prompt=system_prompt, - prompt=prompt, - initial_prompt=initial_prompt, - reasoning=reasoning, - ) - return initial_prompt - - def _generate_test_status(self) -> str: - system_prompt = self._prompt_template_map["generate_test_status"][ - "system" - ].render() - prompt = self._prompt_template_map["generate_test_status"]["prompt"].render( - steps=self.test.steps, conversation=self.conversation - ) - test_status, reasoning = self._generate( - system_prompt=system_prompt, - prompt=prompt, - output_xml_element="category", - ) - self.trace.add_step( - system_prompt=system_prompt, - prompt=prompt, - test_status=test_status, - reasoning=reasoning, - ) - return test_status - - def _generate_evaluation(self) -> tuple[str, str]: - system_prompt = self._prompt_template_map["generate_evaluation"][ - "system" - ].render() - prompt = self._prompt_template_map["generate_evaluation"]["prompt"].render( - expected_results=self.test.expected_results, - conversation=self.conversation, - ) - - evaluation, reasoning = self._generate( - system_prompt=system_prompt, - prompt=prompt, - output_xml_element="category", - ) - self.trace.add_step( - system_prompt=system_prompt, - prompt=prompt, - evaluation=evaluation, - reasoning=reasoning, - ) - - return evaluation, reasoning - - def _generate_user_response(self) -> str: - system_prompt = self._prompt_template_map["generate_user_response"][ - "system" - ].render() - prompt = self._prompt_template_map["generate_user_response"]["prompt"].render( - steps=self.test.steps, conversation=self.conversation - ) - - user_response, reasoning = self._generate( - system_prompt=system_prompt, - prompt=prompt, - output_xml_element="user_response", - ) - - self.trace.add_step( - system_prompt=system_prompt, - prompt=prompt, - user_response=user_response, - reasoning=reasoning, - ) - return user_response - - def _invoke_target(self, user_input) -> str: - target_response = self.target.invoke(user_input) - self.trace.add_step(data=target_response.data) - - return target_response.response - - def evaluate(self) -> TestResult: - success = False - result = Results.MAX_TURNS_REACHED.value - reasoning = "" - - while self.conversation.turns < self.test.max_turns: - if self.conversation.turns == 0: - # start conversation - if self.test.initial_prompt: - user_input = self.test.initial_prompt - else: - user_input = self._generate_initial_prompt() - else: - # generate next user response - user_input = self._generate_user_response() - - # add turn to the conversation - self.conversation.add_turn(user_input, self._invoke_target(user_input)) - - # get test status - test_status = self._generate_test_status() - if test_status == TestStatusCategories.ALL_STEPS_ATTEMPTED: - # evaluate conversation - eval_category, reasoning = self._generate_evaluation() - if ( - eval_category - == EvaluationCategories.NOT_ALL_EXPECTED_RESULTS_OBSERVED.value # noqa: W503 - ): - result = Results.NOT_ALL_EXPECTED_RESULTS_OBSERVED.value - else: - result = Results.ALL_EXPECTED_RESULTS_OBSERVED.value - success = True - - break - - return TestResult( - test_name=self.test.name, - success=success, - result=result, - reasoning=reasoning, - conversation=self.conversation, - ) diff --git a/stepfunctions/agenteval/evaluators/claude_3/model_configs.py b/stepfunctions/agenteval/evaluators/claude_3/model_configs.py deleted file mode 100644 index e6bc2fc..0000000 --- a/stepfunctions/agenteval/evaluators/claude_3/model_configs.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -MODEL_ID = "anthropic.claude-3-sonnet-20240229-v1:0" -ANTHROPIC_VERSION = "bedrock-2023-05-31" -ROLE = "user" -MAX_TOKENS_TO_SAMPLE = 300 -TEMPERATURE = 0 -TOP_K = 250 -TOP_P = 1 -REQUEST_BODY = { - "anthropic_version": ANTHROPIC_VERSION, - "max_tokens": MAX_TOKENS_TO_SAMPLE, - "system": None, - "messages": [ - { - "role": ROLE, - "content": [ - {"type": "text", "text": None}, - ], - } - ], - "temperature": TEMPERATURE, - "top_p": TOP_P, - "top_k": TOP_K, -} diff --git a/stepfunctions/agenteval/evaluators/claude_3/most_updated_prompt_2607.txt b/stepfunctions/agenteval/evaluators/claude_3/most_updated_prompt_2607.txt deleted file mode 100644 index fce3738..0000000 --- a/stepfunctions/agenteval/evaluators/claude_3/most_updated_prompt_2607.txt +++ /dev/null @@ -1,67 +0,0 @@ - -You are an energy advisor with twenty years of experience at the UK's leading energy providers. You are empathetic and compassionate, you understand that rising energy prices can be a source of strain. You are pragmatic. Ask the user clarifying questions to understand their personal situation and to ensure you are giving personalised advice. Do not make information up, if you do not know how to answer be honest. Before answering, please think about all the information you would need before answering the user's question. - - - - - - - -You are a compassionate and empathetic customer-facing energy advisor with twenty years of experience at the UK's leading energy providers. You have the important role of preventing customers from debt or payment difficulties, whilst also providing tailored support to hose already struggling with energy costs. Most importantly, you assess each customer's unique needs and provide support that's tailored to their individual situation. - - - - -Your approach is to: -1) Create a profile of the customer by asking a few clarifying questions, one at a time, about their situation, energy usage and any challenges they are facing. -2) Based on their responses, provide a personalised recommendation to resolve their issue or improve their circumstance and ensure they are being energy efficient. - -Some example questions include: - - - -* Does the customer have a smart meter? -* Are they aware of Energy Hub? -* Are they on the right tariff? -* How many people are in their household? -* What is their current living situation (apartment, house, etc.)? - - - -Some examples of recommendations include: - - -* Smart meter installation for better usage monitoring -* Checking their eligibility for financial assistance including debt relief or the Warm Home Discount - - - -Always greet the customer with a salutation, even if they do not use one themselves. Approach each question with care. Do not make information up - if you do not know the answer - please be honest. Always remember to keep a conversational tone, especially when providing the recommendations. Ask the customer questions one at a time. Once you have enough information to provide the user with a helpful recommendation, then provide it. - - -Here is an example interaction: - - -A: how can I reduce my energy bill? - -B: Hi there, I understand you want to reduce your energy bill. I want to give you advice that is personal to your situation. So will ask some questions to understand you better. Is that okay? - -A: Yes - -B: What kind of house do you live in and with how many people? - -A: I live in a one-bedroom apartment with my partner? - -B: Thank you, and how do you measure your energy use? - -A: I send meter readings? - -B: Okay, so to confirm you don’t have a smart meter? - -A: No - -B: My first recommendation would be a smart meter. A smart meter is a way to ensure that your energy readings are always up to date and can assist with your payment if you are overpaying at some points in the year. Would you like some more recommendations? -... -[continues dialogue to gather more details if required and then provide a personalized recommendation] - - diff --git a/stepfunctions/agenteval/evaluators/evaluator_factory.py b/stepfunctions/agenteval/evaluators/evaluator_factory.py deleted file mode 100644 index d42f8e3..0000000 --- a/stepfunctions/agenteval/evaluators/evaluator_factory.py +++ /dev/null @@ -1,27 +0,0 @@ -from typing import Optional - -from pydantic import BaseModel - -from agenteval.evaluators import BaseEvaluator -from agenteval.evaluators.claude_3 import Claude3Evaluator -from agenteval.targets import BaseTarget -from agenteval.test import Test - -_EVALUATOR_MAP = { - "claude-3": Claude3Evaluator, -} - - -class EvaluatorFactory(BaseModel): - config: dict - - def create( - self, test: Test, target: BaseTarget, work_dir: Optional[str] - ) -> BaseEvaluator: - evaluator_cls = _EVALUATOR_MAP[self.config["model"]] - return evaluator_cls( - test=test, - target=target, - work_dir=work_dir, - **{k: v for k, v in self.config.items() if k != "model"} - ) diff --git a/stepfunctions/agenteval/hook.py b/stepfunctions/agenteval/hook.py deleted file mode 100644 index a1386e6..0000000 --- a/stepfunctions/agenteval/hook.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from agenteval.test import Test -from agenteval.test_result import TestResult -from agenteval.trace import Trace - - -class Hook: - """An evaluation hook.""" - - def pre_evaluate(test: Test, trace: Trace) -> None: - """ - Method called before evaluation. Can be used to perform any setup tasks. - - Args: - test (Test): The test case. - trace (Trace): Captures steps during evaluation. - """ - pass - - def post_evaluate(test: Test, test_result: TestResult, trace: Trace) -> None: - """ - Method called after evaluation. This may be used to perform integration testing - or clean up tasks. - - Args: - test (Test): The test case. - test_result (TestResult): The result of the test, which can be overriden - by updating the attributes of this object. - trace (Trace): Captures steps during evaluation. - """ - pass diff --git a/stepfunctions/agenteval/plan.py b/stepfunctions/agenteval/plan.py deleted file mode 100644 index 73a3107..0000000 --- a/stepfunctions/agenteval/plan.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import logging -import os -import sys -from typing import Optional - -import yaml -from pydantic import BaseModel, model_validator - -from agenteval import defaults -from agenteval.evaluators import EvaluatorFactory -from agenteval.targets import TargetFactory -from agenteval.test import Test - -_PLAN_FILE_NAME = "agenteval.yml" - -_INIT_PLAN = { - "evaluator": {"model": "claude-3"}, - "target": { - "type": "bedrock-agent", - "bedrock_agent_id": None, - "bedrock_agent_alias_id": None, - }, - "tests": { - "retrieve_missing_documents": { - "steps": ["Ask agent for a list of missing documents for claim-006."], - "expected_results": ["The agent returns a list of missing documents."], - } - }, -} - - -sys.path.append(".") -logger = logging.getLogger(__name__) - - -class Plan(BaseModel, validate_assignment=True, arbitrary_types_allowed=True): - evaluator_factory: EvaluatorFactory - target_factory: TargetFactory - tests: list[Test] - - @model_validator(mode="after") - def check_test_names_unique(self) -> Plan: - unique_names = len(set(test.name for test in self.tests)) - - if unique_names != len(self.tests): - raise ValueError("Test names must be unique") - - return self - - @classmethod - def load(cls, plan_dir: Optional[str], filter: str) -> Plan: - plan_path = os.path.join(plan_dir or os.getcwd(), _PLAN_FILE_NAME) - plan = cls._load_yaml(plan_path) - - return cls( - evaluator_factory=EvaluatorFactory(config=plan["evaluator"]), - target_factory=TargetFactory(config=plan["target"]), - tests=cls._load_tests(plan["tests"], filter), - ) - - @staticmethod - def _load_yaml(path: str) -> dict: - with open(path) as stream: - return yaml.safe_load(stream) - - @staticmethod - def _load_tests(test_config: list[dict], filter: str) -> list[Test]: - tests = [] - - if filter: - names = Plan._parse_filter(filter) - else: - names = test_config.keys() - - for name in names: - config = test_config[name] - tests.append( - Test( - name=name, - steps=config["steps"], - expected_results=config["expected_results"], - initial_prompt=config.get("initial_prompt"), - max_turns=config.get("max_turns", defaults.MAX_TURNS), - hook=config.get("hook"), - ) - ) - - return tests - - @staticmethod - def _parse_filter(filter: str) -> list[str]: - return [n.strip() for n in filter.split(",")] - - @staticmethod - def init_plan(plan_dir: Optional[str]) -> str: - plan_path = os.path.join(plan_dir or os.getcwd(), _PLAN_FILE_NAME) - - # check if plan exists - if os.path.exists(plan_path): - raise FileExistsError(f"Test plan already exists at {plan_path}") - - with open(plan_path, "w") as stream: - yaml.safe_dump(_INIT_PLAN, stream, sort_keys=False) - - return plan_path diff --git a/stepfunctions/agenteval/runner/__init__.py b/stepfunctions/agenteval/runner/__init__.py deleted file mode 100644 index 32377b3..0000000 --- a/stepfunctions/agenteval/runner/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from .runner import Runner - -__all__ = ["Runner"] diff --git a/stepfunctions/agenteval/runner/runner.py b/stepfunctions/agenteval/runner/runner.py deleted file mode 100644 index c3e0803..0000000 --- a/stepfunctions/agenteval/runner/runner.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -import concurrent.futures -import logging -import os -import time -from typing import Optional - -from rich.progress import Progress - -from agenteval.defaults import MAX_NUM_THREADS -from agenteval.plan import Plan -from agenteval.runner.summary import create_markdown_summary - -logger = logging.getLogger(__name__) - - -class Runner: - def __init__( - self, - plan: Plan, - verbose: bool, - num_threads: Optional[int], - work_dir: Optional[str], - ): - self.plan = plan - self.work_dir = work_dir if work_dir else os.getcwd() - self.num_tests = len(self.plan.tests) - self.verbose = verbose - self.num_threads = num_threads - if not self.num_threads: - self.num_threads = min(self.num_tests, MAX_NUM_THREADS) - self.results = {test.name: None for test in self.plan.tests} - self.num_failed = 0 - self.evaluator_input_token_counts = [] - self.evaluator_output_token_counts = [] - - def run(self) -> int: - self._log_run_start() - - self.start_time = time.time() - with Progress(transient=True) as self.progress: - self.tracker = self.progress.add_task("running...", total=self.num_tests) - - with concurrent.futures.ThreadPoolExecutor( - max_workers=self.num_tests - ) as executor: - futures = [ - executor.submit(self.run_test, test) for test in self.plan.tests - ] - for future in concurrent.futures.as_completed(futures): - try: - future.result() - except Exception as e: - raise e - - self._log_run_end() - - create_markdown_summary( - self.work_dir, self.plan.tests, list(self.results.values()), self.verbose - ) - - return self.num_failed - - def run_test(self, test): - target = self.plan.target_factory.create() - evaluator = self.plan.evaluator_factory.create( - test=test, - target=target, - work_dir=self.work_dir, - ) - - result = evaluator.run() - if result.success is False: - self.num_failed += 1 - - self.progress.update(self.tracker, advance=1) - self.results[test.name] = result - self.evaluator_input_token_counts.append(evaluator.input_token_count) - self.evaluator_output_token_counts.append(evaluator.output_token_count) - - def _log_run_start(self): - logger.info(f"Starting {self.num_tests} tests with {self.num_threads} threads.") - - def _log_run_end(self): - self._log_pass_fail_count() - logger.info(f"Completed in {round(time.time() - self.start_time, 2)} seconds.") - if self.verbose: - self._log_test_result() - self._log_evaluator_token_io() - - def _log_test_result(self): - for _, result in self.results.items(): - logger_func = logger.info if result.success else logger.error - logger_func( - f"[bold {'green' if result.success else 'red'}]{result.test_name}...{'PASSED' if result.success else 'FAILED'}", - ) - - def _log_pass_fail_count(self): - passed_count = self.num_tests - self.num_failed - status_str = ( - f"[red]{passed_count} passed, {self.num_failed} failed." - if self.num_failed - else f"[green]{self.num_tests} passed." - ) - logger_func = logger.error if self.num_failed else logger.info - logger_func(status_str) - - def _log_evaluator_token_io(self): - logger.info( - f"Input tokens processed by evaluator: {sum(self.evaluator_input_token_counts)}" - ) - logger.info( - f"Output tokens generated by evaluator: {sum(self.evaluator_output_token_counts)}" - ) diff --git a/stepfunctions/agenteval/runner/summary.py b/stepfunctions/agenteval/runner/summary.py deleted file mode 100644 index 1abfaad..0000000 --- a/stepfunctions/agenteval/runner/summary.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -import logging -import os - -from agenteval import jinja_env -from agenteval.test import Test -from agenteval.test_result import TestResult - -logger = logging.getLogger(__name__) - -_TEMPLATE_ROOT = "summary" -_TEMPLATE_FILE_NAME = "agenteval_summary.md.jinja" - - -def create_markdown_summary( - work_dir: str, tests: list[Test], test_results: list[TestResult], verbose: bool -): - template = jinja_env.get_template(os.path.join(_TEMPLATE_ROOT, _TEMPLATE_FILE_NAME)) - - summary_path = os.path.join(work_dir, os.path.splitext(_TEMPLATE_FILE_NAME)[0]) - - rendered = template.render(tests=tests, results=test_results, zip=zip) - - with open(summary_path, "w+") as f: - f.write(rendered) - - if verbose: - logger.info(f"Summary available at {summary_path}") diff --git a/stepfunctions/agenteval/target_response.py b/stepfunctions/agenteval/target_response.py deleted file mode 100644 index 417543f..0000000 --- a/stepfunctions/agenteval/target_response.py +++ /dev/null @@ -1,15 +0,0 @@ -from typing import Optional - -from pydantic import BaseModel - - -class TargetResponse(BaseModel): - """A target's response. - - Attributes: - response: The response string. - data: Additional data (if applicable). - """ - - response: str - data: Optional[dict] = None diff --git a/stepfunctions/agenteval/targets/__init__.py b/stepfunctions/agenteval/targets/__init__.py deleted file mode 100644 index 910e303..0000000 --- a/stepfunctions/agenteval/targets/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from .base_target import BaseTarget -from .boto3_target import Boto3Target -from .target_factory import TargetFactory - -__all__ = ["BaseTarget", "TargetFactory", "Boto3Target"] diff --git a/stepfunctions/agenteval/targets/base_target.py b/stepfunctions/agenteval/targets/base_target.py deleted file mode 100644 index f8fbaa8..0000000 --- a/stepfunctions/agenteval/targets/base_target.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -from abc import ABC, abstractmethod - -from agenteval import TargetResponse - - -class BaseTarget(ABC): - """The `BaseTarget` abstract base class defines the common interface for target - classes. - """ - - @abstractmethod - def invoke(self, prompt: str) -> TargetResponse: - """Invoke the target with a prompt and return a response as a string. - - Args: - prompt: The prompt string to pass to the target. - - Returns: - A TargetResponse object containing the target's response string and - any trace data (if applicable). - """ - pass diff --git a/stepfunctions/agenteval/targets/bedrock_agent/__init__.py b/stepfunctions/agenteval/targets/bedrock_agent/__init__.py deleted file mode 100644 index 4d393ff..0000000 --- a/stepfunctions/agenteval/targets/bedrock_agent/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .target import BedrockAgentTarget - -__all__ = ["BedrockAgentTarget"] diff --git a/stepfunctions/agenteval/targets/bedrock_agent/target.py b/stepfunctions/agenteval/targets/bedrock_agent/target.py deleted file mode 100644 index f7e6f9c..0000000 --- a/stepfunctions/agenteval/targets/bedrock_agent/target.py +++ /dev/null @@ -1,41 +0,0 @@ -import uuid - -from agenteval import TargetResponse -from agenteval.targets import Boto3Target - -_SERVICE_NAME = "bedrock-agent-runtime" - - -class BedrockAgentTarget(Boto3Target): - def __init__(self, bedrock_agent_id: str, bedrock_agent_alias_id: str, **kwargs): - super().__init__(boto3_service_name=_SERVICE_NAME, **kwargs) - self._bedrock_agent_id = bedrock_agent_id - self._bedrock_agent_alias_id = bedrock_agent_alias_id - self._session_id: str = str(uuid.uuid4()) - - def invoke(self, prompt: str) -> TargetResponse: - args = { - "agentId": self._bedrock_agent_id, - "agentAliasId": self._bedrock_agent_alias_id, - "sessionId": self._session_id, - "inputText": prompt, - "enableTrace": True, - } - - response = self.boto3_client.invoke_agent(**args) - - stream = response["completion"] - completion = "" - trace_data = [] - - for event in stream: - chunk = event.get("chunk") - event_trace = event.get("trace") - if chunk: - completion += chunk.get("bytes").decode() - if event_trace: - trace_data.append(event_trace.get("trace")) - - return TargetResponse( - response=completion, data={"bedrock_agent_trace": trace_data} - ) diff --git a/stepfunctions/agenteval/targets/bedrock_knowledge_base/__init__.py b/stepfunctions/agenteval/targets/bedrock_knowledge_base/__init__.py deleted file mode 100644 index d56ea6f..0000000 --- a/stepfunctions/agenteval/targets/bedrock_knowledge_base/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .target import BedrockKnowledgeBaseTarget - -__all__ = ["BedrockKnowledgeBaseTarget"] diff --git a/stepfunctions/agenteval/targets/bedrock_knowledge_base/target.py b/stepfunctions/agenteval/targets/bedrock_knowledge_base/target.py deleted file mode 100644 index a9491e2..0000000 --- a/stepfunctions/agenteval/targets/bedrock_knowledge_base/target.py +++ /dev/null @@ -1,38 +0,0 @@ -from agenteval import TargetResponse -from agenteval.targets import Boto3Target - -_SERVICE_NAME = "bedrock-agent-runtime" - - -class BedrockKnowledgeBaseTarget(Boto3Target): - def __init__(self, knowledge_base_id: str, model_id: str, **kwargs): - super().__init__(boto3_service_name=_SERVICE_NAME, **kwargs) - aws_region = self.boto3_client.meta.region_name - self._knowledge_base_id = knowledge_base_id - self._model_arn = f"arn:aws:bedrock:{aws_region}::foundation-model/{model_id}" - self._session_id: str = None - - def invoke(self, prompt: str) -> TargetResponse: - args = { - "input": { - "text": prompt, - }, - "retrieveAndGenerateConfiguration": { - "type": "KNOWLEDGE_BASE", - "knowledgeBaseConfiguration": { - "knowledgeBaseId": self._knowledge_base_id, - "modelArn": self._model_arn, - }, - }, - } - if self._session_id: - args["sessionId"] = self._session_id - - response = self.boto3_client.retrieve_and_generate(**args) - generated_text = response["output"]["text"] - citations = response["citations"] - self._session_id = response["sessionId"] - - return TargetResponse( - response=generated_text, data={"bedrock_knowledgebase_citations": citations} - ) diff --git a/stepfunctions/agenteval/targets/boto3_target.py b/stepfunctions/agenteval/targets/boto3_target.py deleted file mode 100644 index e47e8cb..0000000 --- a/stepfunctions/agenteval/targets/boto3_target.py +++ /dev/null @@ -1,41 +0,0 @@ -from typing import Optional - -from agenteval.targets import BaseTarget -from agenteval.utils import create_boto3_client - -_DEFAULT_MAX_RETRY = 10 - - -class Boto3Target(BaseTarget): - """A target that can be interfaced with via the `boto3` library. - - Attributes: - boto3_client (BaseClient): A `boto3` client. - """ - - def __init__( - self, - boto3_service_name: str, - aws_profile: Optional[str] = None, - aws_region: Optional[str] = None, - endpoint_url: Optional[str] = None, - max_retry: int = _DEFAULT_MAX_RETRY, - ): - """ - Initialize the AWS target. - - Args: - boto3_service_name (str): The `boto3` service name (e.g `"bedrock-agent-runtime"`). - aws_profile (str, optional): The AWS profile name. - aws_region (str, optional): The AWS region. - endpoint_url (str, optional): The endpoint URL for the AWS service. - max_retry (int, optional): The maximum number of retry attempts. - """ - - self.boto3_client = create_boto3_client( - boto3_service_name=boto3_service_name, - aws_profile=aws_profile, - aws_region=aws_region, - endpoint_url=endpoint_url, - max_retry=max_retry, - ) diff --git a/stepfunctions/agenteval/targets/q_business/__init__.py b/stepfunctions/agenteval/targets/q_business/__init__.py deleted file mode 100644 index 3f621e5..0000000 --- a/stepfunctions/agenteval/targets/q_business/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .target import QBusinessTarget - -__all__ = ["QBusinessTarget"] diff --git a/stepfunctions/agenteval/targets/q_business/target.py b/stepfunctions/agenteval/targets/q_business/target.py deleted file mode 100644 index 8fd59be..0000000 --- a/stepfunctions/agenteval/targets/q_business/target.py +++ /dev/null @@ -1,32 +0,0 @@ -from typing import Optional - -from agenteval import TargetResponse -from agenteval.targets import Boto3Target - -_SERVICE_NAME = "qbusiness" - - -class QBusinessTarget(Boto3Target): - def __init__( - self, - q_business_application_id: str, - q_business_user_id: Optional[str] = None, - **kwargs - ): - super().__init__(boto3_service_name=_SERVICE_NAME, **kwargs) - - self._chat_sync_args = {"applicationId": q_business_application_id} - if q_business_user_id: - self._chat_sync_args["userId"] = q_business_user_id - - def invoke(self, prompt: str) -> str: - self._chat_sync_args["userMessage"] = prompt - - response = self.boto3_client.chat_sync(**self._chat_sync_args) - - if "conversationId" not in self._chat_sync_args: - self._chat_sync_args["conversationId"] = response["conversationId"] - - self._chat_sync_args["parentMessageId"] = response["systemMessageId"] - - return TargetResponse(response=response["systemMessage"]) diff --git a/stepfunctions/agenteval/targets/sagemaker_endpoint/__init__.py b/stepfunctions/agenteval/targets/sagemaker_endpoint/__init__.py deleted file mode 100644 index 8c9adc2..0000000 --- a/stepfunctions/agenteval/targets/sagemaker_endpoint/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .target import SageMakerEndpointTarget - -__all__ = ["SageMakerEndpointTarget"] diff --git a/stepfunctions/agenteval/targets/sagemaker_endpoint/target.py b/stepfunctions/agenteval/targets/sagemaker_endpoint/target.py deleted file mode 100644 index 74d2056..0000000 --- a/stepfunctions/agenteval/targets/sagemaker_endpoint/target.py +++ /dev/null @@ -1,85 +0,0 @@ -import json -from typing import Optional - -from jsonpath_ng import parse - -from agenteval import TargetResponse -from agenteval.targets import Boto3Target - -_SERVICE_NAME = "sagemaker-runtime" -_CONTENT_TYPE = "application/json" -_ACCEPT = "application/json" - - -class SageMakerEndpointTarget(Boto3Target): - def __init__( - self, - endpoint_name: str, - request_body: dict, - input_path: str, - output_path: str, - custom_attributes: Optional[str] = None, - target_model: Optional[str] = None, - target_variant: Optional[str] = None, - target_container_hostname: Optional[str] = None, - inference_component_name: Optional[str] = None, - **kwargs - ): - super().__init__(boto3_service_name=_SERVICE_NAME, **kwargs) - - self._request_body = request_body - self._input_jp_expr = parse(input_path) - self._output_jp_expr = parse(output_path) - - self._args = self._create_base_args( - endpoint_name, - custom_attributes, - target_model, - target_variant, - target_container_hostname, - inference_component_name, - ) - - @staticmethod - def _create_base_args( - endpoint_name: str, - custom_attributes: Optional[str], - target_model: Optional[str], - target_variant: Optional[str], - target_container_hostname: Optional[str], - inference_component_name: Optional[str], - ): - args = { - "EndpointName": endpoint_name, - "ContentType": _CONTENT_TYPE, - "Accept": _ACCEPT, - **{ - key: value - for key, value in { - "CustomAttributes": custom_attributes, - "TargetModel": target_model, - "TargetVariant": target_variant, - "TargetContainerHostname": target_container_hostname, - "InferenceComponentName": inference_component_name, - }.items() - if value is not None - }, - } - - return args - - def _update_request(self, prompt: str): - self._input_jp_expr.update(self._request_body, prompt) - self._args["Body"] = json.dumps(self._request_body) - - def _query_response(self, response_body: dict) -> str: - return self._output_jp_expr.find(response_body)[0].value - - def invoke(self, prompt: str) -> str: - self._update_request(prompt) - - response = self.boto3_client.invoke_endpoint(**self._args) - - response_body = json.loads(response.get("Body").read()) - - return TargetResponse(response=self._query_response(response_body)) diff --git a/stepfunctions/agenteval/targets/target_factory.py b/stepfunctions/agenteval/targets/target_factory.py deleted file mode 100644 index a8e7e9c..0000000 --- a/stepfunctions/agenteval/targets/target_factory.py +++ /dev/null @@ -1,32 +0,0 @@ -from pydantic import BaseModel - -from agenteval.targets import BaseTarget -from agenteval.targets.bedrock_agent import BedrockAgentTarget -from agenteval.targets.bedrock_knowledge_base import BedrockKnowledgeBaseTarget -from agenteval.targets.q_business import QBusinessTarget -from agenteval.targets.sagemaker_endpoint import SageMakerEndpointTarget -from agenteval.utils import import_class - -_TARGET_MAP = { - "bedrock-agent": BedrockAgentTarget, - "q-business": QBusinessTarget, - "sagemaker-endpoint": SageMakerEndpointTarget, - "bedrock-knowledgebase": BedrockKnowledgeBaseTarget, -} - - -class TargetFactory(BaseModel): - config: dict - - def create(self) -> BaseTarget: - target_cls = self._get_target_class() - - return target_cls(**{k: v for k, v in self.config.items() if k != "type"}) - - def _get_target_class(self) -> type[BaseTarget]: - if self.config["type"] in _TARGET_MAP: - target_cls = _TARGET_MAP[self.config["type"]] - else: - target_cls = import_class(self.config["type"], parent_class=BaseTarget) - - return target_cls diff --git a/stepfunctions/agenteval/templates/evaluators/claude_3/generate_evaluation.jinja b/stepfunctions/agenteval/templates/evaluators/claude_3/generate_evaluation.jinja deleted file mode 100644 index 9cd9dd4..0000000 --- a/stepfunctions/agenteval/templates/evaluators/claude_3/generate_evaluation.jinja +++ /dev/null @@ -1,13 +0,0 @@ -Here are the expected results and conversation: - - -{% for result in expected_results -%} -{{ loop.index }}. {{ result }} -{% endfor -%} - - - -{% for sender, message in conversation -%} -{{ sender }}: {{ message }} -{% endfor -%} - \ No newline at end of file diff --git a/stepfunctions/agenteval/templates/evaluators/claude_3/generate_initial_prompt.jinja b/stepfunctions/agenteval/templates/evaluators/claude_3/generate_initial_prompt.jinja deleted file mode 100644 index 832ba37..0000000 --- a/stepfunctions/agenteval/templates/evaluators/claude_3/generate_initial_prompt.jinja +++ /dev/null @@ -1,5 +0,0 @@ -Here is the step: - - -{{ step }} - \ No newline at end of file diff --git a/stepfunctions/agenteval/templates/evaluators/claude_3/generate_test_status.jinja b/stepfunctions/agenteval/templates/evaluators/claude_3/generate_test_status.jinja deleted file mode 100644 index 79ad0df..0000000 --- a/stepfunctions/agenteval/templates/evaluators/claude_3/generate_test_status.jinja +++ /dev/null @@ -1,13 +0,0 @@ -Here are the steps and conversation: - - -{% for step in steps -%} -{{ loop.index }}. {{ step }} -{% endfor -%} - - - -{% for sender, message in conversation -%} -{{ sender }}: {{ message }} -{% endfor -%} - \ No newline at end of file diff --git a/stepfunctions/agenteval/templates/evaluators/claude_3/generate_user_response.jinja b/stepfunctions/agenteval/templates/evaluators/claude_3/generate_user_response.jinja deleted file mode 100644 index 79ad0df..0000000 --- a/stepfunctions/agenteval/templates/evaluators/claude_3/generate_user_response.jinja +++ /dev/null @@ -1,13 +0,0 @@ -Here are the steps and conversation: - - -{% for step in steps -%} -{{ loop.index }}. {{ step }} -{% endfor -%} - - - -{% for sender, message in conversation -%} -{{ sender }}: {{ message }} -{% endfor -%} - \ No newline at end of file diff --git a/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_evaluation.jinja b/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_evaluation.jinja deleted file mode 100644 index 22cace3..0000000 --- a/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_evaluation.jinja +++ /dev/null @@ -1,12 +0,0 @@ -You are a quality assurance engineer evaluating a conversation between an USER and an AGENT. - -Your job is to analyze the conversation in tags and a list of expected results -in tags. - -You will classify the the conversation into the following categories: - -- A: All of the expected results can be observed in the conversation. -- B: Not all of the expected results can be observed in the conversation. - -Please think hard about the response in tags before providing only the category letter -within tags. \ No newline at end of file diff --git a/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_initial_prompt.jinja b/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_initial_prompt.jinja deleted file mode 100644 index d0e8e23..0000000 --- a/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_initial_prompt.jinja +++ /dev/null @@ -1,13 +0,0 @@ -You are role playing as an USER in a conversastion with an AGENT. - -You will be given a step that is wrapped in tags. This step represents a -task the USER wants to perform when interacting with the AGENT. - -Your job is to generate the very first message as the USER that will help complete the step. - -Make sure this message is concise and to the point. - -Do not provide any information if it is expected that the AGENT will eventually ask for it. - -Please think hard about the response in tags before providing the message -within tags. \ No newline at end of file diff --git a/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_test_status.jinja b/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_test_status.jinja deleted file mode 100644 index 7bb8e6b..0000000 --- a/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_test_status.jinja +++ /dev/null @@ -1,13 +0,0 @@ -You are a quality assurance engineer evaluating a conversation between an USER and an AGENT. - -You will be given an ordered list of steps wrapped in tags. Each step represents a task -that the USER wants to perform when interacting with the AGENT. - -Your job is analyze the running conversation in tags and classify it into the following -categories: - -- A: The USER has attempted all the steps. -- B: The USER has not yet attempted all the steps. - -Please think hard about the response in tags before providing only the category letter -within tags. \ No newline at end of file diff --git a/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_user_response.jinja b/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_user_response.jinja deleted file mode 100644 index e670420..0000000 --- a/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_user_response.jinja +++ /dev/null @@ -1,15 +0,0 @@ -You are role playing as an USER in a conversastion with an AGENT. - -You will be given an ordered list of steps wrapped in tags. Each step represents -a task that the USER wants to perform when interacting with the AGENT. - -Using the list of steps, your job is analyze the running conversation in the - tags and generate the next appropriate response as the USER. - -Do not include any information from a step unless the AGENT asks for it. - -If the AGENT was unable to help or did not understand the last request, just move on to -the next step. Do not attempt to rephrase the request in the next response as the USER. - -Please think hard about the response in tags before providing the response -within tags. Do not include the string "USER:" in your response. \ No newline at end of file diff --git a/stepfunctions/agenteval/templates/summary/agenteval_summary.md.jinja b/stepfunctions/agenteval/templates/summary/agenteval_summary.md.jinja deleted file mode 100644 index a624303..0000000 --- a/stepfunctions/agenteval/templates/summary/agenteval_summary.md.jinja +++ /dev/null @@ -1,49 +0,0 @@ -# Test Summary ---- -This document provides a summary of the tests executed by Agent Evaluation. - -> :warning: This tool utilizes generative AI to assess virtual agents and its evaluations may contain errors. **Please thoroughly examine the results below prior to deciding whether to implement an agent.** ---- -## Tests -{% for test, result in zip(tests, results) -%} -- [{% if result.success %}:green_circle:{% else %}:red_circle:{% endif %} {{ test.name }}](#{{ test.name | replace(' ', '-') }}) -{% endfor %} - ---- - - -{% for test, result in zip(tests, results) -%} -## {% if result.success %}:green_circle:{% else %}:red_circle:{% endif %} {{ test.name }} - -**Steps** -{% for step in test.steps -%} -{{ loop.index }}. {{ step }} -{% endfor %} - -**Expected results** -{% for result in test.expected_results -%} -{{ loop.index }}. {{ result }} -{% endfor %} - -**Conversation** -``` -{% for sender, message in result.conversation -%} -[{{ sender }}] {{ message }} -{% endfor -%} -``` - -**Result** -{{ result.result }} - -**Reasoning** -``` -{{ result.reasoning }} -``` - ---- -{% endfor %} - - - - - diff --git a/stepfunctions/agenteval/test.py b/stepfunctions/agenteval/test.py deleted file mode 100644 index 695f2fe..0000000 --- a/stepfunctions/agenteval/test.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from typing import Optional - -from pydantic import BaseModel - - -class Test(BaseModel, validate_assignment=True): - """A test case for an agent. - - Attributes: - name: Name of the test. - steps: List of step to perform for the test. - expected_results: List of expected results for the test. - initial_prompt: Optional initial prompt. - max_turns: Maximum number of turns allowed for the test. - hook: The module path to an evaluation hook. - """ - - # do not collect as a test - __test__ = False - - name: str - steps: list[str] - expected_results: list[str] - initial_prompt: Optional[str] = None - max_turns: int - hook: Optional[str] = None diff --git a/stepfunctions/agenteval/test_result.py b/stepfunctions/agenteval/test_result.py deleted file mode 100644 index 5258aef..0000000 --- a/stepfunctions/agenteval/test_result.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from pydantic import BaseModel - -from agenteval.conversation import Conversation - - -class TestResult(BaseModel, arbitrary_types_allowed=True): - """The result of a test. - - Attributes: - test_name: Name of the test. - result: Description of the test result. - reasoning: The rationale for the test result. - success: `True` if the test passed, otherwise `False`. - conversation: Captures the interaction between a user and an agent. - """ - - # do not collect as a test - __test__ = False - - test_name: str - result: str - reasoning: str - success: bool - conversation: Conversation diff --git a/stepfunctions/agenteval/trace.py b/stepfunctions/agenteval/trace.py deleted file mode 100644 index 25d477a..0000000 --- a/stepfunctions/agenteval/trace.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -import inspect -import json -import os -from datetime import datetime, timezone -from typing import Optional - -_TRACE_DIR = "agenteval_traces" - - -class Trace: - """Captures steps during evaluation. - - Attributes: - test_name (str): Name of the test. - trace_dir (str): Directory to store the trace. - start_time (datetime): Start time of the trace. - end_time (datetime): End time of the trace. - steps (list): List of steps in the trace. - - """ - - def __init__(self, test_name: str, work_dir: str): - """ - Initialize the trace handler. - - Args: - test_name (str): Name of the trace - """ - self.test_name = test_name - self.trace_dir = os.path.join(work_dir, _TRACE_DIR) - self.start_time = None - self.end_time = None - self.steps = [] - - def __enter__(self): - self.start_time = datetime.now(timezone.utc) - return self - - def __exit__(self, *exc): - self.end_time = datetime.now(timezone.utc) - self._dump_trace() - - def _dump_trace(self): - """Dump the trace to a JSON file.""" - - os.makedirs(self.trace_dir, exist_ok=True) - - with open(os.path.join(self.trace_dir, f"{self.test_name}.json"), "w") as f: - json.dump(self._get_trace(), f, default=str) - - def _get_trace(self) -> str: - return { - "test_name": self.test_name, - "start_time": self.start_time, - "end_time": self.end_time, - "steps": self.steps, - } - - def add_step(self, step_name: Optional[str] = None, **kwargs): - """Add a step to the trace. - - Args: - step_name (str, optional): The name of the step. Defaults to - the name of the caller function - """ - step_name = step_name or inspect.stack()[1].function - step = {"timestamp": datetime.now(timezone.utc), "step_name": step_name} - step.update(kwargs) - self.steps.append(step) diff --git a/stepfunctions/agenteval/utils/__init__.py b/stepfunctions/agenteval/utils/__init__.py deleted file mode 100644 index 5f80a10..0000000 --- a/stepfunctions/agenteval/utils/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from .aws import create_boto3_client -from .imports import import_class - -__all__ = ["import_class", "create_boto3_client"] diff --git a/stepfunctions/agenteval/utils/aws.py b/stepfunctions/agenteval/utils/aws.py deleted file mode 100644 index 4d5d4dd..0000000 --- a/stepfunctions/agenteval/utils/aws.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from typing import Optional - -import boto3 -from botocore.client import BaseClient -from botocore.config import Config - -_RETRY_MODE = "adaptive" - - -def create_boto3_client( - boto3_service_name: str, - aws_profile: Optional[str], - aws_region: Optional[str], - endpoint_url: Optional[str], - max_retry: int, -) -> BaseClient: - """Create a `boto3` client. - - Args: - boto3_service_name (str): The `boto3` service name (e.g `"bedrock-runtime"`). - aws_profile (str, optional): The AWS profile name. - aws_region (str, optional): The AWS region. - endpoint_url (str, optional): The endpoint URL for the AWS service. - max_retry (int, optional): The maximum number of retry attempts. - - Returns: - BaseClient - """ - - config = Config(retries={"max_attempts": max_retry, "mode": _RETRY_MODE}) - - session = boto3.Session(profile_name=aws_profile, region_name=aws_region) - return session.client(boto3_service_name, endpoint_url=endpoint_url, config=config) diff --git a/stepfunctions/agenteval/utils/imports.py b/stepfunctions/agenteval/utils/imports.py deleted file mode 100644 index f0e2685..0000000 --- a/stepfunctions/agenteval/utils/imports.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from importlib import import_module -from typing import Optional - -_ALLOWED_MODULE_NAME_SUFFIX = ["_hook", "_target"] - - -def import_class(module_path: str, parent_class: Optional[type] = None) -> type: - name, class_name = module_path.rsplit(".", 1) - - # make sure module name starts with one of the allowed suffixes - _validate_module_name(name.split(".")[-1]) - - module = import_module(name) - cls = getattr(module, class_name) - - if parent_class: - # make sure the imported class is a subclass - _validate_subclass(cls, parent_class) - - return cls - - -def _validate_module_name(name: str) -> None: - if not any(name.endswith(suffix) for suffix in _ALLOWED_MODULE_NAME_SUFFIX): - raise ValueError(f"Invalid module name: {name}") - - -def _validate_subclass(child_class: type, parent_class: type) -> None: - if not issubclass(child_class, parent_class): - raise TypeError( - f"{child_class.__name__} is not a {parent_class.__name__} subclass" - ) diff --git a/stepfunctions/app.py b/stepfunctions/app.py index 5751ac6..bf4ff67 100644 --- a/stepfunctions/app.py +++ b/stepfunctions/app.py @@ -20,7 +20,7 @@ # Uncomment the next line if you know exactly what Account and Region you # want to deploy the stack to. */ - env=cdk.Environment(region='us-east-1'), + # env=cdk.Environment(region='us-east-1'), # For more information, see https://docs.aws.amazon.com/cdk/latest/guide/environments.html ) diff --git a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/INSTALLER b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/INSTALLER deleted file mode 100644 index a1b589e..0000000 --- a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/INSTALLER +++ /dev/null @@ -1 +0,0 @@ -pip diff --git a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/LICENSE b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/LICENSE deleted file mode 100644 index 67db858..0000000 --- a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/LICENSE +++ /dev/null @@ -1,175 +0,0 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. diff --git a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/METADATA b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/METADATA deleted file mode 100644 index 1070391..0000000 --- a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/METADATA +++ /dev/null @@ -1,74 +0,0 @@ -Metadata-Version: 2.1 -Name: agent-evaluation -Version: 0.2.0 -Summary: A generative AI-powered framework for testing virtual agents. -Home-page: https://awslabs.github.io/agent-evaluation/ -Author: Amazon Web Services -Author-email: agent-evaluation-oss-core-team@amazon.com -License: Apache 2.0 -Classifier: Development Status :: 4 - Beta -Classifier: Intended Audience :: Developers -Classifier: Topic :: Utilities -Classifier: Topic :: Software Development :: Testing -Classifier: License :: OSI Approved :: Apache Software License -Classifier: Programming Language :: Python :: 3 :: Only -Classifier: Programming Language :: Python :: 3.9 -Classifier: Programming Language :: Python :: 3.10 -Classifier: Programming Language :: Python :: 3.11 -Classifier: Programming Language :: Python :: 3.12 -Requires-Python: >=3.9 -Description-Content-Type: text/markdown -License-File: LICENSE -License-File: NOTICE -Requires-Dist: pyyaml ~=6.0 -Requires-Dist: boto3 <2.0,>=1.34.20 -Requires-Dist: click ~=8.0 -Requires-Dist: pydantic <3.0,>=2.1.0 -Requires-Dist: rich <14.0,>=13.7.0 -Requires-Dist: jinja2 <4.0,>=3.1.3 -Requires-Dist: jsonpath-ng <2.0,>=1.6.1 -Provides-Extra: dev -Requires-Dist: flake8 ; extra == 'dev' -Requires-Dist: black ; extra == 'dev' -Requires-Dist: isort ; extra == 'dev' -Requires-Dist: pytest ; extra == 'dev' -Requires-Dist: pytest-cov ; extra == 'dev' -Requires-Dist: pytest-mock ; extra == 'dev' -Requires-Dist: mkdocs ; extra == 'dev' -Requires-Dist: mkdocs-material ; extra == 'dev' -Requires-Dist: mkdocstrings[python] ; extra == 'dev' -Requires-Dist: mkdocs-click ; extra == 'dev' -Requires-Dist: bandit ; extra == 'dev' -Requires-Dist: pip-audit ; extra == 'dev' - -![PyPI - Version](https://img.shields.io/pypi/v/agent-evaluation) -![PyPI - Python Version](https://img.shields.io/pypi/pyversions/agent-evaluation) -![GitHub License](https://img.shields.io/github/license/awslabs/agent-evaluation) -[![security: bandit](https://img.shields.io/badge/security-bandit-yellow.svg)](https://github.com/PyCQA/bandit) -[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) -[![Built with Material for MkDocs](https://img.shields.io/badge/Material_for_MkDocs-526CFE?style=for-the-badge&logo=MaterialForMkDocs&logoColor=white)](https://squidfunk.github.io/mkdocs-material/) - -# Agent Evaluation - -Agent Evaluation is a generative AI-powered framework for testing virtual agents. - -Internally, Agent Evaluation implements an LLM agent (evaluator) that will orchestrate conversations with your own agent (target) and evaluate the responses during the conversation. - -## ✨ Key features - -- Built-in support for popular AWS services including [Amazon Bedrock](https://aws.amazon.com/bedrock/), [Amazon Q Business](https://aws.amazon.com/q/business/), and [Amazon SageMaker](https://aws.amazon.com/sagemaker/). You can also [bring your own agent](https://awslabs.github.io/agent-evaluation/targets/custom_targets/) to test using Agent Evaluation. -- Orchestrate concurrent, multi-turn conversations with your agent while evaluating its responses. -- Define [hooks](https://awslabs.github.io/agent-evaluation/hooks/) to perform additional tasks such as integration testing. -- Can be incorporated into CI/CD pipelines to expedite the time to delivery while maintaining the stability of agents in production environments. - -## 📚 Documentation - -To get started, please visit the full documentation [here](https://awslabs.github.io/agent-evaluation/). To contribute, please refer to [CONTRIBUTING.md](./CONTRIBUTING.md) - -## 👏 Contributors - -Shout out to these awesome contributors: - - - - diff --git a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/NOTICE b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/NOTICE deleted file mode 100644 index 616fc58..0000000 --- a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/NOTICE +++ /dev/null @@ -1 +0,0 @@ -Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. diff --git a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/RECORD b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/RECORD deleted file mode 100644 index fcc2eac..0000000 --- a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/RECORD +++ /dev/null @@ -1,87 +0,0 @@ -../../../bin/agenteval,sha256=sKahy-HYfncxw3pVqCLLgxIokhvln3Qm9eDSvskMrV8,250 -agent_evaluation-0.2.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4 -agent_evaluation-0.2.0.dist-info/LICENSE,sha256=CeipvOyAZxBGUsFoaFqwkx54aPnIKEtm9a5u2uXxEws,10142 -agent_evaluation-0.2.0.dist-info/METADATA,sha256=WOBzlzyr7ozBQpU_A99iEq8w2O-j-Zii-Q2al9A3D_Y,3759 -agent_evaluation-0.2.0.dist-info/NOTICE,sha256=1CkO1kwu3Q_OHYTj-d-yiBJA_lNN73a4zSntavaD4oc,67 -agent_evaluation-0.2.0.dist-info/RECORD,, -agent_evaluation-0.2.0.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -agent_evaluation-0.2.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92 -agent_evaluation-0.2.0.dist-info/entry_points.txt,sha256=DxxfiIbldqK82WRgaMOL4BjPJcnr7JOkDYTch6xNahs,48 -agent_evaluation-0.2.0.dist-info/top_level.txt,sha256=k6izISLxVoNnLxZHqnS3X-0eDdUD8LsV-OoM0afYdew,10 -agenteval/__init__.py,sha256=JQm11m01-rh2PjCw1OVqgy1rkU725Q6vMqfDtLbRH2U,1150 -agenteval/__pycache__/__init__.cpython-311.pyc,, -agenteval/__pycache__/cli.cpython-311.pyc,, -agenteval/__pycache__/conversation.cpython-311.pyc,, -agenteval/__pycache__/defaults.cpython-311.pyc,, -agenteval/__pycache__/hook.cpython-311.pyc,, -agenteval/__pycache__/plan.cpython-311.pyc,, -agenteval/__pycache__/target_response.cpython-311.pyc,, -agenteval/__pycache__/test.cpython-311.pyc,, -agenteval/__pycache__/test_result.cpython-311.pyc,, -agenteval/__pycache__/trace.cpython-311.pyc,, -agenteval/cli.py,sha256=wl0z_pCCKuu9lJgUWrS8cuHkvMYclhd-aCmCB6MN8u8,2807 -agenteval/conversation.py,sha256=r3fvnsnVI7zRoi_RS2JjPahUtLXF9vhnZYJcx1RMe3M,1030 -agenteval/defaults.py,sha256=PB1UniJ-uyiBn0WWSA3EI1UxcfpU2wlvsJZmhWgdV5E,280 -agenteval/evaluators/__init__.py,sha256=U6uQ6THgK0yxMnqVKL5l7_zUUxepoo11W1fPLa9xgNE,247 -agenteval/evaluators/__pycache__/__init__.cpython-311.pyc,, -agenteval/evaluators/__pycache__/base_evaluator.cpython-311.pyc,, -agenteval/evaluators/__pycache__/evaluator_factory.cpython-311.pyc,, -agenteval/evaluators/base_evaluator.py,sha256=zpWFBhQdaB-__TdiM7lFlkvQFX27KSFvzDFQ1KBvuLw,5052 -agenteval/evaluators/claude_3/__init__.py,sha256=mKv_FTRrhYIIS86zqxzj5edy-tKREHsn3nXUBmck71Q,180 -agenteval/evaluators/claude_3/__pycache__/__init__.cpython-311.pyc,, -agenteval/evaluators/claude_3/__pycache__/evaluator.cpython-311.pyc,, -agenteval/evaluators/claude_3/__pycache__/model_configs.cpython-311.pyc,, -agenteval/evaluators/claude_3/evaluator.py,sha256=k-ZXtKBtywVYy1XEAkSufb9LYXlAElaklV8Wao-udLo,7751 -agenteval/evaluators/claude_3/model_configs.py,sha256=KUf0C5Bbgc-c05ZZlokVgjHVH4WGdoOfKtwQWwuQFLY,635 -agenteval/evaluators/evaluator_factory.py,sha256=JCTVoN62QNMcKR68KY2Li8zpm55HNvYwVBXZ0Yi3rhQ,712 -agenteval/hook.py,sha256=z8UfREnySi2E6tRwjeklI3CwjWQ5MMk59wLHj6TK9C0,1049 -agenteval/plan.py,sha256=tIXTXepcVZEA8JX0yoEzsSuLDVpqSYvBdKsGJYYCVbU,3236 -agenteval/runner/__init__.py,sha256=6f0fmworOJ0fn2MNzDg52zbip4osTovhwetT6ZQnI74,157 -agenteval/runner/__pycache__/__init__.cpython-311.pyc,, -agenteval/runner/__pycache__/runner.cpython-311.pyc,, -agenteval/runner/__pycache__/summary.cpython-311.pyc,, -agenteval/runner/runner.py,sha256=wSYcX82WTMwmMFCfqoHjxq8NTnV1_UdPr4A1fnmkD_U,3937 -agenteval/runner/summary.py,sha256=jTdFRFo7zAaE-PTA6Cy3n1cndgFB14vA20MDO9FeJyE,872 -agenteval/target_response.py,sha256=R_Gy-655vPEsSO7X2siU2GNiFPRl1CkRetiON8WYEGM,285 -agenteval/targets/__init__.py,sha256=JmGtuue6VQYkK5jAiArxlbnRQsA23p8NgDTMvnCWyGU,282 -agenteval/targets/__pycache__/__init__.cpython-311.pyc,, -agenteval/targets/__pycache__/base_target.cpython-311.pyc,, -agenteval/targets/__pycache__/boto3_target.cpython-311.pyc,, -agenteval/targets/__pycache__/target_factory.cpython-311.pyc,, -agenteval/targets/base_target.py,sha256=aYW5dLAlbKgscdf8XTcV9Bppbay-pz-c_y5RtCgdBD0,743 -agenteval/targets/bedrock_agent/__init__.py,sha256=2B5TCxdyQAXuQRtji0lclk5odB7xgT5Hi_dBwjErIzo,73 -agenteval/targets/bedrock_agent/__pycache__/__init__.cpython-311.pyc,, -agenteval/targets/bedrock_agent/__pycache__/target.cpython-311.pyc,, -agenteval/targets/bedrock_agent/target.py,sha256=GRfn4dOGkARF_3_DBupgoHrbiYQZADfqwXO65Z2-RDM,1332 -agenteval/targets/bedrock_knowledge_base/__init__.py,sha256=tYJixJ0x9ohkM7oker8eX7U4vkkxqV_xVlA4CsWIuec,89 -agenteval/targets/bedrock_knowledge_base/__pycache__/__init__.cpython-311.pyc,, -agenteval/targets/bedrock_knowledge_base/__pycache__/target.cpython-311.pyc,, -agenteval/targets/bedrock_knowledge_base/target.py,sha256=jOsAixfOSy6jEQF6p_uCwDLP7M1WB64F6K49CbtiSYc,1401 -agenteval/targets/boto3_target.py,sha256=qNukrm2GZOrG81pJc61BrJEFcNB_f80cvvWQyMFRQiA,1271 -agenteval/targets/q_business/__init__.py,sha256=1KT5BdoA_KD2fX3gNLvSyg9K5x0OfWBN8X15nxJf13U,67 -agenteval/targets/q_business/__pycache__/__init__.cpython-311.pyc,, -agenteval/targets/q_business/__pycache__/target.cpython-311.pyc,, -agenteval/targets/q_business/target.py,sha256=Bv9YiXcnBoUmXFN3nfCh2FNLNP9vMm_1ruWVlDGsXXs,1014 -agenteval/targets/sagemaker_endpoint/__init__.py,sha256=whoMO69GOhPMNOrbQAfYzVmIXuxhxt8dHJGABnR4_Ck,83 -agenteval/targets/sagemaker_endpoint/__pycache__/__init__.cpython-311.pyc,, -agenteval/targets/sagemaker_endpoint/__pycache__/target.cpython-311.pyc,, -agenteval/targets/sagemaker_endpoint/target.py,sha256=zLsgkOljavYzrjrVnY3qDOjc-zsKFPSIdqugsZZy6po,2677 -agenteval/targets/target_factory.py,sha256=W8mzSy3E44jpYJs6XLD2WaLAaXXZ_T_WGw49CyPLigQ,1092 -agenteval/templates/evaluators/claude_3/generate_evaluation.jinja,sha256=aaTBZnr-3J29SpdernWW8bmQzF7lV0-bed1glZk36Yk,287 -agenteval/templates/evaluators/claude_3/generate_initial_prompt.jinja,sha256=wIhfhNUsTVdeIDBJNH1QWIBQWVE8h0Lc958vuuNU_eE,43 -agenteval/templates/evaluators/claude_3/generate_test_status.jinja,sha256=2T9HuihEVtGvq-ncxl6hLrTZXi2wAYu3cQhCUl0F_qY,238 -agenteval/templates/evaluators/claude_3/generate_user_response.jinja,sha256=2T9HuihEVtGvq-ncxl6hLrTZXi2wAYu3cQhCUl0F_qY,238 -agenteval/templates/evaluators/claude_3/system/generate_evaluation.jinja,sha256=3ihPICeDofWljtl6YpUJQM-lJSPNeWjhjgGndKM1wYQ,554 -agenteval/templates/evaluators/claude_3/system/generate_initial_prompt.jinja,sha256=DR1UaUvn0u_8MD0cSHAWSPLfEIwnGCKlEFPkuUAKLDQ,566 -agenteval/templates/evaluators/claude_3/system/generate_test_status.jinja,sha256=akAKahEda6A3-XhVjXpacGR3e48HrbqE4UT4ONlqVZg,587 -agenteval/templates/evaluators/claude_3/system/generate_user_response.jinja,sha256=yCy-IkJRM2y9-pPbaZaNrT-_4J7x9YM9kMgMXeYf5D4,800 -agenteval/templates/summary/agenteval_summary.md.jinja,sha256=Ri9B_lIpewlBtvs0ggj4IO9FbIZlMq70aDBZg_-xfQk,1107 -agenteval/test.py,sha256=mMbZWI5Yv6oQDS4xh5gCUvAj_IOih3vurqsMJs_9KbM,806 -agenteval/test_result.py,sha256=pDdXfrhIQtgO3au0XaxNLY1uql-POqZrlgu2vtNa0fc,738 -agenteval/trace.py,sha256=9JhT1i295AbKk1Zaj7Qa9EiXW1IJu-GsbOZ1hs8kiEU,2090 -agenteval/utils/__init__.py,sha256=xgJ0V8V34ju5tDEaX-WDBwXLTwMjFBztdYJ5lk2Y-OE,230 -agenteval/utils/__pycache__/__init__.cpython-311.pyc,, -agenteval/utils/__pycache__/aws.cpython-311.pyc,, -agenteval/utils/__pycache__/imports.cpython-311.pyc,, -agenteval/utils/aws.py,sha256=z6YjWUK1MhMl0Z6J-vxZiRBaHv8d444avFxEMjicq0c,1115 -agenteval/utils/imports.py,sha256=i-cd9Ze6LWeaBktGHgZkWLa6W_iUa11vTOBc5CQrfzA,1106 diff --git a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/REQUESTED b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/REQUESTED deleted file mode 100644 index e69de29..0000000 diff --git a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/WHEEL b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/WHEEL deleted file mode 100644 index bab98d6..0000000 --- a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/WHEEL +++ /dev/null @@ -1,5 +0,0 @@ -Wheel-Version: 1.0 -Generator: bdist_wheel (0.43.0) -Root-Is-Purelib: true -Tag: py3-none-any - diff --git a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/entry_points.txt b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/entry_points.txt deleted file mode 100644 index 6919bf1..0000000 --- a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/entry_points.txt +++ /dev/null @@ -1,2 +0,0 @@ -[console_scripts] -agenteval = agenteval.cli:cli diff --git a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/top_level.txt b/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/top_level.txt deleted file mode 100644 index 060c7ea..0000000 --- a/stepfunctions/stepfunctions/agent_evaluation-0.2.0.dist-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -agenteval diff --git a/stepfunctions/stepfunctions/agenteval/__init__.py b/stepfunctions/stepfunctions/agenteval/__init__.py deleted file mode 100644 index cd7bf51..0000000 --- a/stepfunctions/stepfunctions/agenteval/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from importlib.metadata import version - -import logging -import os - -from jinja2 import Environment, PackageLoader, select_autoescape -from rich.logging import RichHandler - -from .hook import Hook -from .target_response import TargetResponse - -__all__ = ["Hook", "TargetResponse"] -__version__ = version("agent-evaluation") - - -_LOG_LEVEL_ENV = "LOG_LEVEL" - - -def configure_logger(): - # supress logs from botocore - logging.getLogger("botocore").setLevel(logging.CRITICAL) - - # configure logging using rich - formatter = logging.Formatter("%(message)s", datefmt="[%X]") - handler = RichHandler(markup=True, show_level=True, rich_tracebacks=True) - handler.setFormatter(formatter) - - logger = logging.getLogger(__name__) - - logger.setLevel(os.environ.get(_LOG_LEVEL_ENV, logging.INFO)) - logger.addHandler(handler) - - -configure_logger() - -jinja_env = Environment( - loader=PackageLoader(__name__), - autoescape=select_autoescape( - disabled_extensions=["jinja"], - default_for_string=True, - default=True, - ), -) diff --git a/stepfunctions/stepfunctions/agenteval/cli.py b/stepfunctions/stepfunctions/agenteval/cli.py deleted file mode 100644 index 940f621..0000000 --- a/stepfunctions/stepfunctions/agenteval/cli.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -import logging -import os -from typing import Optional - -import click - -from agenteval.plan import Plan -from agenteval.runner import Runner - -logger = logging.getLogger(__name__) - - -def validate_directory(directory): - if not os.path.isdir(directory): - raise NotADirectoryError(f"{directory} is not a directory") - if not os.access(directory, os.R_OK) or not os.access(directory, os.W_OK): - raise PermissionError(f"No read/write permissions for {directory}") - - -@click.group() -def cli(): - pass - - -@cli.command(help="Initialize a test plan.") -@click.option( - "--plan-dir", - type=str, - required=False, - help="The destination directory for storing the test plan. If unspecified, then the test plan is saved to the current working directory.", -) -def init(plan_dir: Optional[str]): - if plan_dir: - validate_directory(plan_dir) - try: - path = Plan.init_plan(plan_dir) - logger.info(f"[green]Test plan created at {path}") - - except FileExistsError as e: - logger.error(f"[red]{e}") - exit(1) - - -@cli.command(help="Run test plan.") -@click.option( - "--filter", - type=str, - required=False, - help="Specifies the test(s) to run. Multiple tests should be seperated using a comma. If unspecified, all tests from the test plan will be run.", -) -@click.option( - "--plan-dir", - type=str, - required=False, - help="The directory where the test plan is stored. If unspecified, then the current working directory is used.", -) -@click.option( - "--verbose", - is_flag=True, - type=bool, - default=False, - help="Controls the verbosity of the terminal logs.", -) -@click.option( - "--num-threads", - type=int, - required=False, - help="Number of threads (and thus tests) to run concurrently. If unspecified, number of threads will be capped at 45.", -) -@click.option( - "--work-dir", - type=str, - required=False, - help="The directory where the test result and trace will be generated. If unspecified, then the current working directory is used.", -) -def run( - filter: Optional[str], - plan_dir: Optional[str], - verbose: bool, - num_threads: Optional[int], - work_dir: Optional[str], -): - try: - plan = Plan.load(plan_dir, filter) - if work_dir: - validate_directory(work_dir) - runner = Runner( - plan, - verbose, - num_threads, - work_dir, - ) - num_failed = runner.run() - _num_failed_exit(num_failed) - - except Exception as e: - _exception_exit(e) - - -def _num_failed_exit(num_failed): - exit(1 if num_failed else 0) - - -def _exception_exit(e): - logger.exception(f"Error running test: {e}") - exit(1) diff --git a/stepfunctions/stepfunctions/agenteval/conversation.py b/stepfunctions/stepfunctions/agenteval/conversation.py deleted file mode 100644 index 59e4304..0000000 --- a/stepfunctions/stepfunctions/agenteval/conversation.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -_USER = "USER" -_AGENT = "AGENT" -_START_TURN_COUNT = 0 - - -class Conversation: - """Captures the interaction between a user and an agent. - - Attributes: - messages (list): A list of tuples of the form (role, message). - turns (int): The number of turns in the conversation. - """ - - def __init__(self): - self.messages = [] - self.turns = _START_TURN_COUNT - - def __iter__(self): - """Allow iteration over conversation messages.""" - return iter(self.messages) - - def add_turn(self, user_message: str, agent_response: str): - """Record a turn in the conversation. - - Args: - user_message (str): The users's message - agent_response (str): The agent's response to the user's message - - Increments the `turn` counter by `1`. - """ - self.messages.extend([(_USER, user_message), (_AGENT, agent_response)]) - self.turns += 1 diff --git a/stepfunctions/stepfunctions/agenteval/defaults.py b/stepfunctions/stepfunctions/agenteval/defaults.py deleted file mode 100644 index 929c675..0000000 --- a/stepfunctions/stepfunctions/agenteval/defaults.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -MAX_TURNS = 2 - -# Default max number of threads not exceeding Bedrock service quota: -# https://docs.aws.amazon.com/bedrock/latest/userguide/quotas.html -MAX_NUM_THREADS = 45 diff --git a/stepfunctions/stepfunctions/agenteval/evaluators/__init__.py b/stepfunctions/stepfunctions/agenteval/evaluators/__init__.py deleted file mode 100644 index 8e52702..0000000 --- a/stepfunctions/stepfunctions/agenteval/evaluators/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from .base_evaluator import BaseEvaluator -from .evaluator_factory import EvaluatorFactory - -__all__ = ["BaseEvaluator", "EvaluatorFactory"] diff --git a/stepfunctions/stepfunctions/agenteval/evaluators/base_evaluator.py b/stepfunctions/stepfunctions/agenteval/evaluators/base_evaluator.py deleted file mode 100644 index e1bd4c9..0000000 --- a/stepfunctions/stepfunctions/agenteval/evaluators/base_evaluator.py +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -import json -from abc import ABC, abstractmethod -from typing import Optional - -from agenteval.conversation import Conversation -from agenteval.hook import Hook -from agenteval.targets import BaseTarget -from agenteval.test import Test -from agenteval.test_result import TestResult -from agenteval.trace import Trace -from agenteval.utils import create_boto3_client, import_class - -_DEFAULT_MAX_RETRY = 10 -_BOTO3_SERVICE_NAME = "bedrock-runtime" - - -class BaseEvaluator(ABC): - """The `BaseEvaluator` abstract base class defines the common interface for evaluator - classes. - - Attributes: - test (Test): The test case. - target (BaseTarget): The target agent being evaluated. - conversation (Conversation): Captures the interaction between a user and an agent. - trace (Trace): Captures steps during evaluation. - test_result (TestResult): The result of the test which is set in `BaseEvaluator.run`. - input_token_count (int): Number of input tokens processed by the evaluator. - output_token_count (int): Number of output tokens generated by the evaluator. - model_id (str): The ID of the Bedrock model used to run evaluation. If `provisioned_throughput_arn` is provided, - then this will be set to the ARN of the provisioned throughput. - boto3_client (BaseClient): A `boto3` client representing Amazon Bedrock Runtime. - """ - - def __init__( - self, - test: Test, - target: BaseTarget, - work_dir: str, - model_id: str, - provisioned_throughput_arn: Optional[str] = None, - aws_profile: Optional[str] = None, - aws_region: Optional[str] = None, - endpoint_url: Optional[str] = None, - max_retry: int = _DEFAULT_MAX_RETRY, - ): - """Initialize the evaluator instance for a given `Test` and `Target`. - - Args: - test (Test): The test case. - target (BaseTarget): The target agent being evaluated. - work_dir (str): The work directory. - model_id (str): The ID of the Bedrock model used to run evaluation. - provisioned_throughput_arn (str, optional): The ARN of the provisioned throughput. - aws_profile (str, optional): The AWS profile name. - aws_region (str, optional): The AWS region. - endpoint_url (str, optional): The endpoint URL for the AWS service. - max_retry (int, optional): The maximum number of retry attempts. - """ - self.test = test - self.target = target - self.conversation = Conversation() - self.trace = Trace(work_dir=work_dir, test_name=test.name) - self.test_result = None - self.input_token_count = 0 - self.output_token_count = 0 - self.model_id = provisioned_throughput_arn or model_id - self.bedrock_runtime_client = create_boto3_client( - boto3_service_name=_BOTO3_SERVICE_NAME, - aws_profile=aws_profile, - aws_region=aws_region, - endpoint_url=endpoint_url, - max_retry=max_retry, - ) - - @abstractmethod - def evaluate(self) -> TestResult: - """Conduct a test. - - Returns: - TestResult: The result of the test. - """ - pass - - def _get_hook_cls(self, hook: Optional[str]) -> Optional[type[Hook]]: - if hook: - hook_cls = import_class(hook, parent_class=Hook) - return hook_cls - - def invoke_model(self, request_body: dict) -> dict: - """ - Invoke the Bedrock model using the `boto3_client`. This method will convert - a request dictionary to a JSON string before passing it to the `InvokeModel` API. - - Refer to the `boto3` documentation for more details. - - Args: - request_body (dict): The request payload as a dictionary. - - Returns: - dict: The response from the model invocation. - - """ - response = self.bedrock_runtime_client.invoke_model( - modelId=self.model_id, body=json.dumps(request_body) - ) - - self._incr_token_counts(response) - - return response - - def _incr_token_counts(self, response: dict): - headers = response["ResponseMetadata"]["HTTPHeaders"] - - self.input_token_count += int( - headers.get("x-amzn-bedrock-input-token-count", 0) - ) - self.output_token_count += int( - headers.get("x-amzn-bedrock-output-token-count", 0) - ) - - def run(self) -> TestResult: - """ - Run the evaluator within a trace context manager and run hooks - if provided. - """ - - hook_cls = self._get_hook_cls(self.test.hook) - - with self.trace: - if hook_cls: - hook_cls.pre_evaluate(self.test, self.trace) - self.test_result = self.evaluate() - if hook_cls: - hook_cls.post_evaluate(self.test, self.test_result, self.trace) - - return self.test_result diff --git a/stepfunctions/stepfunctions/agenteval/evaluators/claude_3/__init__.py b/stepfunctions/stepfunctions/agenteval/evaluators/claude_3/__init__.py deleted file mode 100644 index 338be7d..0000000 --- a/stepfunctions/stepfunctions/agenteval/evaluators/claude_3/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from .evaluator import Claude3Evaluator - -__all__ = ["Claude3Evaluator"] diff --git a/stepfunctions/stepfunctions/agenteval/evaluators/claude_3/evaluator.py b/stepfunctions/stepfunctions/agenteval/evaluators/claude_3/evaluator.py deleted file mode 100644 index cc8b3ae..0000000 --- a/stepfunctions/stepfunctions/agenteval/evaluators/claude_3/evaluator.py +++ /dev/null @@ -1,244 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -import json -import logging -import os -import re -from typing import Tuple - -from agenteval import jinja_env -from agenteval.evaluators import BaseEvaluator -from agenteval.evaluators.claude_3 import model_configs -from agenteval.test_result import TestResult - -logger = logging.getLogger(__name__) - -_PROMPT_TEMPLATE_ROOT = "evaluators/claude_3" -_SYSTEM_PROMPT_DIR = "system" -_PROMPT_TEMPLATE_NAMES = [ - "generate_initial_prompt", - "generate_user_response", - "generate_test_status", - "generate_evaluation", -] - -# enable backwards-compatible StrEnum -try: - from enum import StrEnum -except ImportError: - from enum import Enum - - class StrEnum(str, Enum): - pass - - -class TestStatusCategories(StrEnum): - ALL_STEPS_ATTEMPTED = "A" - NOT_ALL_STEPS_ATTEMPTED = "B" - - -class EvaluationCategories(StrEnum): - ALL_EXPECTED_RESULTS_OBSERVED = "A" - NOT_ALL_EXPECTED_RESULTS_OBSERVED = "B" - - -class Results(StrEnum): - MAX_TURNS_REACHED = "Maximum turns reached." - ALL_EXPECTED_RESULTS_OBSERVED = ( - "All of the expected results can be observed in the conversation." - ) - NOT_ALL_EXPECTED_RESULTS_OBSERVED = ( - "Not all of the expected results can be observed in the conversation." - ) - - -class Claude3Evaluator(BaseEvaluator): - def __init__( - self, - **kwargs, - ): - super().__init__(model_id=model_configs.MODEL_ID, **kwargs) - - self._prompt_template_map = { - name: { - "system": jinja_env.get_template( - os.path.join( - _PROMPT_TEMPLATE_ROOT, _SYSTEM_PROMPT_DIR, f"{name}.jinja" - ) - ), - "prompt": jinja_env.get_template( - os.path.join(_PROMPT_TEMPLATE_ROOT, f"{name}.jinja") - ), - } - for name in _PROMPT_TEMPLATE_NAMES - } - - @staticmethod - def _extract_content_from_xml(xml_data: str, element_names: list[str]) -> Tuple: - content = [] - for e in element_names: - pattern = rf"<{e}>(.*?)" - match = re.search(pattern, xml_data, re.DOTALL) - content.append(match.group(1).strip() if match else None) - return tuple(content) - - def _generate( - self, - system_prompt: str, - prompt: str, - output_xml_element: str, - ) -> str: - request_body = model_configs.REQUEST_BODY - request_body["system"] = system_prompt - request_body["messages"][0]["content"][0]["text"] = prompt - - response = self.invoke_model(request_body=request_body) - response_body = response.get("body").read() - completion = json.loads(response_body)["content"][0]["text"] - - logger.debug( - f"[{self.test.name}]\n[PROMPT]\n{prompt}\n[COMPLETION]\n{completion}" - ) - - output, reasoning = self._extract_content_from_xml( - completion, [output_xml_element, "thinking"] - ) - - return output, reasoning - - def _generate_initial_prompt(self) -> str: - system_prompt = self._prompt_template_map["generate_initial_prompt"][ - "system" - ].render() - prompt = self._prompt_template_map["generate_initial_prompt"]["prompt"].render( - step=self.test.steps[0] - ) - - initial_prompt, reasoning = self._generate( - system_prompt=system_prompt, - prompt=prompt, - output_xml_element="initial_prompt", - ) - - self.trace.add_step( - system_prompt=system_prompt, - prompt=prompt, - initial_prompt=initial_prompt, - reasoning=reasoning, - ) - return initial_prompt - - def _generate_test_status(self) -> str: - system_prompt = self._prompt_template_map["generate_test_status"][ - "system" - ].render() - prompt = self._prompt_template_map["generate_test_status"]["prompt"].render( - steps=self.test.steps, conversation=self.conversation - ) - test_status, reasoning = self._generate( - system_prompt=system_prompt, - prompt=prompt, - output_xml_element="category", - ) - self.trace.add_step( - system_prompt=system_prompt, - prompt=prompt, - test_status=test_status, - reasoning=reasoning, - ) - return test_status - - def _generate_evaluation(self) -> tuple[str, str]: - system_prompt = self._prompt_template_map["generate_evaluation"][ - "system" - ].render() - prompt = self._prompt_template_map["generate_evaluation"]["prompt"].render( - expected_results=self.test.expected_results, - conversation=self.conversation, - ) - - evaluation, reasoning = self._generate( - system_prompt=system_prompt, - prompt=prompt, - output_xml_element="category", - ) - self.trace.add_step( - system_prompt=system_prompt, - prompt=prompt, - evaluation=evaluation, - reasoning=reasoning, - ) - - return evaluation, reasoning - - def _generate_user_response(self) -> str: - system_prompt = self._prompt_template_map["generate_user_response"][ - "system" - ].render() - prompt = self._prompt_template_map["generate_user_response"]["prompt"].render( - steps=self.test.steps, conversation=self.conversation - ) - - user_response, reasoning = self._generate( - system_prompt=system_prompt, - prompt=prompt, - output_xml_element="user_response", - ) - - self.trace.add_step( - system_prompt=system_prompt, - prompt=prompt, - user_response=user_response, - reasoning=reasoning, - ) - return user_response - - def _invoke_target(self, user_input) -> str: - target_response = self.target.invoke(user_input) - self.trace.add_step(data=target_response.data) - - return target_response.response - - def evaluate(self) -> TestResult: - success = False - result = Results.MAX_TURNS_REACHED.value - reasoning = "" - - while self.conversation.turns < self.test.max_turns: - if self.conversation.turns == 0: - # start conversation - if self.test.initial_prompt: - user_input = self.test.initial_prompt - else: - user_input = self._generate_initial_prompt() - else: - # generate next user response - user_input = self._generate_user_response() - - # add turn to the conversation - self.conversation.add_turn(user_input, self._invoke_target(user_input)) - - # get test status - test_status = self._generate_test_status() - if test_status == TestStatusCategories.ALL_STEPS_ATTEMPTED: - # evaluate conversation - eval_category, reasoning = self._generate_evaluation() - if ( - eval_category - == EvaluationCategories.NOT_ALL_EXPECTED_RESULTS_OBSERVED.value # noqa: W503 - ): - result = Results.NOT_ALL_EXPECTED_RESULTS_OBSERVED.value - else: - result = Results.ALL_EXPECTED_RESULTS_OBSERVED.value - success = True - - break - - return TestResult( - test_name=self.test.name, - success=success, - result=result, - reasoning=reasoning, - conversation=self.conversation, - ) diff --git a/stepfunctions/stepfunctions/agenteval/evaluators/claude_3/model_configs.py b/stepfunctions/stepfunctions/agenteval/evaluators/claude_3/model_configs.py deleted file mode 100644 index e6bc2fc..0000000 --- a/stepfunctions/stepfunctions/agenteval/evaluators/claude_3/model_configs.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -MODEL_ID = "anthropic.claude-3-sonnet-20240229-v1:0" -ANTHROPIC_VERSION = "bedrock-2023-05-31" -ROLE = "user" -MAX_TOKENS_TO_SAMPLE = 300 -TEMPERATURE = 0 -TOP_K = 250 -TOP_P = 1 -REQUEST_BODY = { - "anthropic_version": ANTHROPIC_VERSION, - "max_tokens": MAX_TOKENS_TO_SAMPLE, - "system": None, - "messages": [ - { - "role": ROLE, - "content": [ - {"type": "text", "text": None}, - ], - } - ], - "temperature": TEMPERATURE, - "top_p": TOP_P, - "top_k": TOP_K, -} diff --git a/stepfunctions/stepfunctions/agenteval/evaluators/claude_3/most_updated_prompt_2607.txt b/stepfunctions/stepfunctions/agenteval/evaluators/claude_3/most_updated_prompt_2607.txt deleted file mode 100644 index fce3738..0000000 --- a/stepfunctions/stepfunctions/agenteval/evaluators/claude_3/most_updated_prompt_2607.txt +++ /dev/null @@ -1,67 +0,0 @@ - -You are an energy advisor with twenty years of experience at the UK's leading energy providers. You are empathetic and compassionate, you understand that rising energy prices can be a source of strain. You are pragmatic. Ask the user clarifying questions to understand their personal situation and to ensure you are giving personalised advice. Do not make information up, if you do not know how to answer be honest. Before answering, please think about all the information you would need before answering the user's question. - - - - - - - -You are a compassionate and empathetic customer-facing energy advisor with twenty years of experience at the UK's leading energy providers. You have the important role of preventing customers from debt or payment difficulties, whilst also providing tailored support to hose already struggling with energy costs. Most importantly, you assess each customer's unique needs and provide support that's tailored to their individual situation. - - - - -Your approach is to: -1) Create a profile of the customer by asking a few clarifying questions, one at a time, about their situation, energy usage and any challenges they are facing. -2) Based on their responses, provide a personalised recommendation to resolve their issue or improve their circumstance and ensure they are being energy efficient. - -Some example questions include: - - - -* Does the customer have a smart meter? -* Are they aware of Energy Hub? -* Are they on the right tariff? -* How many people are in their household? -* What is their current living situation (apartment, house, etc.)? - - - -Some examples of recommendations include: - - -* Smart meter installation for better usage monitoring -* Checking their eligibility for financial assistance including debt relief or the Warm Home Discount - - - -Always greet the customer with a salutation, even if they do not use one themselves. Approach each question with care. Do not make information up - if you do not know the answer - please be honest. Always remember to keep a conversational tone, especially when providing the recommendations. Ask the customer questions one at a time. Once you have enough information to provide the user with a helpful recommendation, then provide it. - - -Here is an example interaction: - - -A: how can I reduce my energy bill? - -B: Hi there, I understand you want to reduce your energy bill. I want to give you advice that is personal to your situation. So will ask some questions to understand you better. Is that okay? - -A: Yes - -B: What kind of house do you live in and with how many people? - -A: I live in a one-bedroom apartment with my partner? - -B: Thank you, and how do you measure your energy use? - -A: I send meter readings? - -B: Okay, so to confirm you don’t have a smart meter? - -A: No - -B: My first recommendation would be a smart meter. A smart meter is a way to ensure that your energy readings are always up to date and can assist with your payment if you are overpaying at some points in the year. Would you like some more recommendations? -... -[continues dialogue to gather more details if required and then provide a personalized recommendation] - - diff --git a/stepfunctions/stepfunctions/agenteval/evaluators/evaluator_factory.py b/stepfunctions/stepfunctions/agenteval/evaluators/evaluator_factory.py deleted file mode 100644 index d42f8e3..0000000 --- a/stepfunctions/stepfunctions/agenteval/evaluators/evaluator_factory.py +++ /dev/null @@ -1,27 +0,0 @@ -from typing import Optional - -from pydantic import BaseModel - -from agenteval.evaluators import BaseEvaluator -from agenteval.evaluators.claude_3 import Claude3Evaluator -from agenteval.targets import BaseTarget -from agenteval.test import Test - -_EVALUATOR_MAP = { - "claude-3": Claude3Evaluator, -} - - -class EvaluatorFactory(BaseModel): - config: dict - - def create( - self, test: Test, target: BaseTarget, work_dir: Optional[str] - ) -> BaseEvaluator: - evaluator_cls = _EVALUATOR_MAP[self.config["model"]] - return evaluator_cls( - test=test, - target=target, - work_dir=work_dir, - **{k: v for k, v in self.config.items() if k != "model"} - ) diff --git a/stepfunctions/stepfunctions/agenteval/hook.py b/stepfunctions/stepfunctions/agenteval/hook.py deleted file mode 100644 index a1386e6..0000000 --- a/stepfunctions/stepfunctions/agenteval/hook.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from agenteval.test import Test -from agenteval.test_result import TestResult -from agenteval.trace import Trace - - -class Hook: - """An evaluation hook.""" - - def pre_evaluate(test: Test, trace: Trace) -> None: - """ - Method called before evaluation. Can be used to perform any setup tasks. - - Args: - test (Test): The test case. - trace (Trace): Captures steps during evaluation. - """ - pass - - def post_evaluate(test: Test, test_result: TestResult, trace: Trace) -> None: - """ - Method called after evaluation. This may be used to perform integration testing - or clean up tasks. - - Args: - test (Test): The test case. - test_result (TestResult): The result of the test, which can be overriden - by updating the attributes of this object. - trace (Trace): Captures steps during evaluation. - """ - pass diff --git a/stepfunctions/stepfunctions/agenteval/plan.py b/stepfunctions/stepfunctions/agenteval/plan.py deleted file mode 100644 index 73a3107..0000000 --- a/stepfunctions/stepfunctions/agenteval/plan.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import logging -import os -import sys -from typing import Optional - -import yaml -from pydantic import BaseModel, model_validator - -from agenteval import defaults -from agenteval.evaluators import EvaluatorFactory -from agenteval.targets import TargetFactory -from agenteval.test import Test - -_PLAN_FILE_NAME = "agenteval.yml" - -_INIT_PLAN = { - "evaluator": {"model": "claude-3"}, - "target": { - "type": "bedrock-agent", - "bedrock_agent_id": None, - "bedrock_agent_alias_id": None, - }, - "tests": { - "retrieve_missing_documents": { - "steps": ["Ask agent for a list of missing documents for claim-006."], - "expected_results": ["The agent returns a list of missing documents."], - } - }, -} - - -sys.path.append(".") -logger = logging.getLogger(__name__) - - -class Plan(BaseModel, validate_assignment=True, arbitrary_types_allowed=True): - evaluator_factory: EvaluatorFactory - target_factory: TargetFactory - tests: list[Test] - - @model_validator(mode="after") - def check_test_names_unique(self) -> Plan: - unique_names = len(set(test.name for test in self.tests)) - - if unique_names != len(self.tests): - raise ValueError("Test names must be unique") - - return self - - @classmethod - def load(cls, plan_dir: Optional[str], filter: str) -> Plan: - plan_path = os.path.join(plan_dir or os.getcwd(), _PLAN_FILE_NAME) - plan = cls._load_yaml(plan_path) - - return cls( - evaluator_factory=EvaluatorFactory(config=plan["evaluator"]), - target_factory=TargetFactory(config=plan["target"]), - tests=cls._load_tests(plan["tests"], filter), - ) - - @staticmethod - def _load_yaml(path: str) -> dict: - with open(path) as stream: - return yaml.safe_load(stream) - - @staticmethod - def _load_tests(test_config: list[dict], filter: str) -> list[Test]: - tests = [] - - if filter: - names = Plan._parse_filter(filter) - else: - names = test_config.keys() - - for name in names: - config = test_config[name] - tests.append( - Test( - name=name, - steps=config["steps"], - expected_results=config["expected_results"], - initial_prompt=config.get("initial_prompt"), - max_turns=config.get("max_turns", defaults.MAX_TURNS), - hook=config.get("hook"), - ) - ) - - return tests - - @staticmethod - def _parse_filter(filter: str) -> list[str]: - return [n.strip() for n in filter.split(",")] - - @staticmethod - def init_plan(plan_dir: Optional[str]) -> str: - plan_path = os.path.join(plan_dir or os.getcwd(), _PLAN_FILE_NAME) - - # check if plan exists - if os.path.exists(plan_path): - raise FileExistsError(f"Test plan already exists at {plan_path}") - - with open(plan_path, "w") as stream: - yaml.safe_dump(_INIT_PLAN, stream, sort_keys=False) - - return plan_path diff --git a/stepfunctions/stepfunctions/agenteval/runner/__init__.py b/stepfunctions/stepfunctions/agenteval/runner/__init__.py deleted file mode 100644 index 32377b3..0000000 --- a/stepfunctions/stepfunctions/agenteval/runner/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from .runner import Runner - -__all__ = ["Runner"] diff --git a/stepfunctions/stepfunctions/agenteval/runner/runner.py b/stepfunctions/stepfunctions/agenteval/runner/runner.py deleted file mode 100644 index c3e0803..0000000 --- a/stepfunctions/stepfunctions/agenteval/runner/runner.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -import concurrent.futures -import logging -import os -import time -from typing import Optional - -from rich.progress import Progress - -from agenteval.defaults import MAX_NUM_THREADS -from agenteval.plan import Plan -from agenteval.runner.summary import create_markdown_summary - -logger = logging.getLogger(__name__) - - -class Runner: - def __init__( - self, - plan: Plan, - verbose: bool, - num_threads: Optional[int], - work_dir: Optional[str], - ): - self.plan = plan - self.work_dir = work_dir if work_dir else os.getcwd() - self.num_tests = len(self.plan.tests) - self.verbose = verbose - self.num_threads = num_threads - if not self.num_threads: - self.num_threads = min(self.num_tests, MAX_NUM_THREADS) - self.results = {test.name: None for test in self.plan.tests} - self.num_failed = 0 - self.evaluator_input_token_counts = [] - self.evaluator_output_token_counts = [] - - def run(self) -> int: - self._log_run_start() - - self.start_time = time.time() - with Progress(transient=True) as self.progress: - self.tracker = self.progress.add_task("running...", total=self.num_tests) - - with concurrent.futures.ThreadPoolExecutor( - max_workers=self.num_tests - ) as executor: - futures = [ - executor.submit(self.run_test, test) for test in self.plan.tests - ] - for future in concurrent.futures.as_completed(futures): - try: - future.result() - except Exception as e: - raise e - - self._log_run_end() - - create_markdown_summary( - self.work_dir, self.plan.tests, list(self.results.values()), self.verbose - ) - - return self.num_failed - - def run_test(self, test): - target = self.plan.target_factory.create() - evaluator = self.plan.evaluator_factory.create( - test=test, - target=target, - work_dir=self.work_dir, - ) - - result = evaluator.run() - if result.success is False: - self.num_failed += 1 - - self.progress.update(self.tracker, advance=1) - self.results[test.name] = result - self.evaluator_input_token_counts.append(evaluator.input_token_count) - self.evaluator_output_token_counts.append(evaluator.output_token_count) - - def _log_run_start(self): - logger.info(f"Starting {self.num_tests} tests with {self.num_threads} threads.") - - def _log_run_end(self): - self._log_pass_fail_count() - logger.info(f"Completed in {round(time.time() - self.start_time, 2)} seconds.") - if self.verbose: - self._log_test_result() - self._log_evaluator_token_io() - - def _log_test_result(self): - for _, result in self.results.items(): - logger_func = logger.info if result.success else logger.error - logger_func( - f"[bold {'green' if result.success else 'red'}]{result.test_name}...{'PASSED' if result.success else 'FAILED'}", - ) - - def _log_pass_fail_count(self): - passed_count = self.num_tests - self.num_failed - status_str = ( - f"[red]{passed_count} passed, {self.num_failed} failed." - if self.num_failed - else f"[green]{self.num_tests} passed." - ) - logger_func = logger.error if self.num_failed else logger.info - logger_func(status_str) - - def _log_evaluator_token_io(self): - logger.info( - f"Input tokens processed by evaluator: {sum(self.evaluator_input_token_counts)}" - ) - logger.info( - f"Output tokens generated by evaluator: {sum(self.evaluator_output_token_counts)}" - ) diff --git a/stepfunctions/stepfunctions/agenteval/runner/summary.py b/stepfunctions/stepfunctions/agenteval/runner/summary.py deleted file mode 100644 index 1abfaad..0000000 --- a/stepfunctions/stepfunctions/agenteval/runner/summary.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -import logging -import os - -from agenteval import jinja_env -from agenteval.test import Test -from agenteval.test_result import TestResult - -logger = logging.getLogger(__name__) - -_TEMPLATE_ROOT = "summary" -_TEMPLATE_FILE_NAME = "agenteval_summary.md.jinja" - - -def create_markdown_summary( - work_dir: str, tests: list[Test], test_results: list[TestResult], verbose: bool -): - template = jinja_env.get_template(os.path.join(_TEMPLATE_ROOT, _TEMPLATE_FILE_NAME)) - - summary_path = os.path.join(work_dir, os.path.splitext(_TEMPLATE_FILE_NAME)[0]) - - rendered = template.render(tests=tests, results=test_results, zip=zip) - - with open(summary_path, "w+") as f: - f.write(rendered) - - if verbose: - logger.info(f"Summary available at {summary_path}") diff --git a/stepfunctions/stepfunctions/agenteval/target_response.py b/stepfunctions/stepfunctions/agenteval/target_response.py deleted file mode 100644 index 417543f..0000000 --- a/stepfunctions/stepfunctions/agenteval/target_response.py +++ /dev/null @@ -1,15 +0,0 @@ -from typing import Optional - -from pydantic import BaseModel - - -class TargetResponse(BaseModel): - """A target's response. - - Attributes: - response: The response string. - data: Additional data (if applicable). - """ - - response: str - data: Optional[dict] = None diff --git a/stepfunctions/stepfunctions/agenteval/targets/__init__.py b/stepfunctions/stepfunctions/agenteval/targets/__init__.py deleted file mode 100644 index 910e303..0000000 --- a/stepfunctions/stepfunctions/agenteval/targets/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from .base_target import BaseTarget -from .boto3_target import Boto3Target -from .target_factory import TargetFactory - -__all__ = ["BaseTarget", "TargetFactory", "Boto3Target"] diff --git a/stepfunctions/stepfunctions/agenteval/targets/base_target.py b/stepfunctions/stepfunctions/agenteval/targets/base_target.py deleted file mode 100644 index f8fbaa8..0000000 --- a/stepfunctions/stepfunctions/agenteval/targets/base_target.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -from abc import ABC, abstractmethod - -from agenteval import TargetResponse - - -class BaseTarget(ABC): - """The `BaseTarget` abstract base class defines the common interface for target - classes. - """ - - @abstractmethod - def invoke(self, prompt: str) -> TargetResponse: - """Invoke the target with a prompt and return a response as a string. - - Args: - prompt: The prompt string to pass to the target. - - Returns: - A TargetResponse object containing the target's response string and - any trace data (if applicable). - """ - pass diff --git a/stepfunctions/stepfunctions/agenteval/targets/bedrock_agent/__init__.py b/stepfunctions/stepfunctions/agenteval/targets/bedrock_agent/__init__.py deleted file mode 100644 index 4d393ff..0000000 --- a/stepfunctions/stepfunctions/agenteval/targets/bedrock_agent/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .target import BedrockAgentTarget - -__all__ = ["BedrockAgentTarget"] diff --git a/stepfunctions/stepfunctions/agenteval/targets/bedrock_agent/target.py b/stepfunctions/stepfunctions/agenteval/targets/bedrock_agent/target.py deleted file mode 100644 index f7e6f9c..0000000 --- a/stepfunctions/stepfunctions/agenteval/targets/bedrock_agent/target.py +++ /dev/null @@ -1,41 +0,0 @@ -import uuid - -from agenteval import TargetResponse -from agenteval.targets import Boto3Target - -_SERVICE_NAME = "bedrock-agent-runtime" - - -class BedrockAgentTarget(Boto3Target): - def __init__(self, bedrock_agent_id: str, bedrock_agent_alias_id: str, **kwargs): - super().__init__(boto3_service_name=_SERVICE_NAME, **kwargs) - self._bedrock_agent_id = bedrock_agent_id - self._bedrock_agent_alias_id = bedrock_agent_alias_id - self._session_id: str = str(uuid.uuid4()) - - def invoke(self, prompt: str) -> TargetResponse: - args = { - "agentId": self._bedrock_agent_id, - "agentAliasId": self._bedrock_agent_alias_id, - "sessionId": self._session_id, - "inputText": prompt, - "enableTrace": True, - } - - response = self.boto3_client.invoke_agent(**args) - - stream = response["completion"] - completion = "" - trace_data = [] - - for event in stream: - chunk = event.get("chunk") - event_trace = event.get("trace") - if chunk: - completion += chunk.get("bytes").decode() - if event_trace: - trace_data.append(event_trace.get("trace")) - - return TargetResponse( - response=completion, data={"bedrock_agent_trace": trace_data} - ) diff --git a/stepfunctions/stepfunctions/agenteval/targets/bedrock_knowledge_base/__init__.py b/stepfunctions/stepfunctions/agenteval/targets/bedrock_knowledge_base/__init__.py deleted file mode 100644 index d56ea6f..0000000 --- a/stepfunctions/stepfunctions/agenteval/targets/bedrock_knowledge_base/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .target import BedrockKnowledgeBaseTarget - -__all__ = ["BedrockKnowledgeBaseTarget"] diff --git a/stepfunctions/stepfunctions/agenteval/targets/bedrock_knowledge_base/target.py b/stepfunctions/stepfunctions/agenteval/targets/bedrock_knowledge_base/target.py deleted file mode 100644 index a9491e2..0000000 --- a/stepfunctions/stepfunctions/agenteval/targets/bedrock_knowledge_base/target.py +++ /dev/null @@ -1,38 +0,0 @@ -from agenteval import TargetResponse -from agenteval.targets import Boto3Target - -_SERVICE_NAME = "bedrock-agent-runtime" - - -class BedrockKnowledgeBaseTarget(Boto3Target): - def __init__(self, knowledge_base_id: str, model_id: str, **kwargs): - super().__init__(boto3_service_name=_SERVICE_NAME, **kwargs) - aws_region = self.boto3_client.meta.region_name - self._knowledge_base_id = knowledge_base_id - self._model_arn = f"arn:aws:bedrock:{aws_region}::foundation-model/{model_id}" - self._session_id: str = None - - def invoke(self, prompt: str) -> TargetResponse: - args = { - "input": { - "text": prompt, - }, - "retrieveAndGenerateConfiguration": { - "type": "KNOWLEDGE_BASE", - "knowledgeBaseConfiguration": { - "knowledgeBaseId": self._knowledge_base_id, - "modelArn": self._model_arn, - }, - }, - } - if self._session_id: - args["sessionId"] = self._session_id - - response = self.boto3_client.retrieve_and_generate(**args) - generated_text = response["output"]["text"] - citations = response["citations"] - self._session_id = response["sessionId"] - - return TargetResponse( - response=generated_text, data={"bedrock_knowledgebase_citations": citations} - ) diff --git a/stepfunctions/stepfunctions/agenteval/targets/boto3_target.py b/stepfunctions/stepfunctions/agenteval/targets/boto3_target.py deleted file mode 100644 index e47e8cb..0000000 --- a/stepfunctions/stepfunctions/agenteval/targets/boto3_target.py +++ /dev/null @@ -1,41 +0,0 @@ -from typing import Optional - -from agenteval.targets import BaseTarget -from agenteval.utils import create_boto3_client - -_DEFAULT_MAX_RETRY = 10 - - -class Boto3Target(BaseTarget): - """A target that can be interfaced with via the `boto3` library. - - Attributes: - boto3_client (BaseClient): A `boto3` client. - """ - - def __init__( - self, - boto3_service_name: str, - aws_profile: Optional[str] = None, - aws_region: Optional[str] = None, - endpoint_url: Optional[str] = None, - max_retry: int = _DEFAULT_MAX_RETRY, - ): - """ - Initialize the AWS target. - - Args: - boto3_service_name (str): The `boto3` service name (e.g `"bedrock-agent-runtime"`). - aws_profile (str, optional): The AWS profile name. - aws_region (str, optional): The AWS region. - endpoint_url (str, optional): The endpoint URL for the AWS service. - max_retry (int, optional): The maximum number of retry attempts. - """ - - self.boto3_client = create_boto3_client( - boto3_service_name=boto3_service_name, - aws_profile=aws_profile, - aws_region=aws_region, - endpoint_url=endpoint_url, - max_retry=max_retry, - ) diff --git a/stepfunctions/stepfunctions/agenteval/targets/q_business/__init__.py b/stepfunctions/stepfunctions/agenteval/targets/q_business/__init__.py deleted file mode 100644 index 3f621e5..0000000 --- a/stepfunctions/stepfunctions/agenteval/targets/q_business/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .target import QBusinessTarget - -__all__ = ["QBusinessTarget"] diff --git a/stepfunctions/stepfunctions/agenteval/targets/q_business/target.py b/stepfunctions/stepfunctions/agenteval/targets/q_business/target.py deleted file mode 100644 index 8fd59be..0000000 --- a/stepfunctions/stepfunctions/agenteval/targets/q_business/target.py +++ /dev/null @@ -1,32 +0,0 @@ -from typing import Optional - -from agenteval import TargetResponse -from agenteval.targets import Boto3Target - -_SERVICE_NAME = "qbusiness" - - -class QBusinessTarget(Boto3Target): - def __init__( - self, - q_business_application_id: str, - q_business_user_id: Optional[str] = None, - **kwargs - ): - super().__init__(boto3_service_name=_SERVICE_NAME, **kwargs) - - self._chat_sync_args = {"applicationId": q_business_application_id} - if q_business_user_id: - self._chat_sync_args["userId"] = q_business_user_id - - def invoke(self, prompt: str) -> str: - self._chat_sync_args["userMessage"] = prompt - - response = self.boto3_client.chat_sync(**self._chat_sync_args) - - if "conversationId" not in self._chat_sync_args: - self._chat_sync_args["conversationId"] = response["conversationId"] - - self._chat_sync_args["parentMessageId"] = response["systemMessageId"] - - return TargetResponse(response=response["systemMessage"]) diff --git a/stepfunctions/stepfunctions/agenteval/targets/sagemaker_endpoint/__init__.py b/stepfunctions/stepfunctions/agenteval/targets/sagemaker_endpoint/__init__.py deleted file mode 100644 index 8c9adc2..0000000 --- a/stepfunctions/stepfunctions/agenteval/targets/sagemaker_endpoint/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .target import SageMakerEndpointTarget - -__all__ = ["SageMakerEndpointTarget"] diff --git a/stepfunctions/stepfunctions/agenteval/targets/sagemaker_endpoint/target.py b/stepfunctions/stepfunctions/agenteval/targets/sagemaker_endpoint/target.py deleted file mode 100644 index 74d2056..0000000 --- a/stepfunctions/stepfunctions/agenteval/targets/sagemaker_endpoint/target.py +++ /dev/null @@ -1,85 +0,0 @@ -import json -from typing import Optional - -from jsonpath_ng import parse - -from agenteval import TargetResponse -from agenteval.targets import Boto3Target - -_SERVICE_NAME = "sagemaker-runtime" -_CONTENT_TYPE = "application/json" -_ACCEPT = "application/json" - - -class SageMakerEndpointTarget(Boto3Target): - def __init__( - self, - endpoint_name: str, - request_body: dict, - input_path: str, - output_path: str, - custom_attributes: Optional[str] = None, - target_model: Optional[str] = None, - target_variant: Optional[str] = None, - target_container_hostname: Optional[str] = None, - inference_component_name: Optional[str] = None, - **kwargs - ): - super().__init__(boto3_service_name=_SERVICE_NAME, **kwargs) - - self._request_body = request_body - self._input_jp_expr = parse(input_path) - self._output_jp_expr = parse(output_path) - - self._args = self._create_base_args( - endpoint_name, - custom_attributes, - target_model, - target_variant, - target_container_hostname, - inference_component_name, - ) - - @staticmethod - def _create_base_args( - endpoint_name: str, - custom_attributes: Optional[str], - target_model: Optional[str], - target_variant: Optional[str], - target_container_hostname: Optional[str], - inference_component_name: Optional[str], - ): - args = { - "EndpointName": endpoint_name, - "ContentType": _CONTENT_TYPE, - "Accept": _ACCEPT, - **{ - key: value - for key, value in { - "CustomAttributes": custom_attributes, - "TargetModel": target_model, - "TargetVariant": target_variant, - "TargetContainerHostname": target_container_hostname, - "InferenceComponentName": inference_component_name, - }.items() - if value is not None - }, - } - - return args - - def _update_request(self, prompt: str): - self._input_jp_expr.update(self._request_body, prompt) - self._args["Body"] = json.dumps(self._request_body) - - def _query_response(self, response_body: dict) -> str: - return self._output_jp_expr.find(response_body)[0].value - - def invoke(self, prompt: str) -> str: - self._update_request(prompt) - - response = self.boto3_client.invoke_endpoint(**self._args) - - response_body = json.loads(response.get("Body").read()) - - return TargetResponse(response=self._query_response(response_body)) diff --git a/stepfunctions/stepfunctions/agenteval/targets/target_factory.py b/stepfunctions/stepfunctions/agenteval/targets/target_factory.py deleted file mode 100644 index a8e7e9c..0000000 --- a/stepfunctions/stepfunctions/agenteval/targets/target_factory.py +++ /dev/null @@ -1,32 +0,0 @@ -from pydantic import BaseModel - -from agenteval.targets import BaseTarget -from agenteval.targets.bedrock_agent import BedrockAgentTarget -from agenteval.targets.bedrock_knowledge_base import BedrockKnowledgeBaseTarget -from agenteval.targets.q_business import QBusinessTarget -from agenteval.targets.sagemaker_endpoint import SageMakerEndpointTarget -from agenteval.utils import import_class - -_TARGET_MAP = { - "bedrock-agent": BedrockAgentTarget, - "q-business": QBusinessTarget, - "sagemaker-endpoint": SageMakerEndpointTarget, - "bedrock-knowledgebase": BedrockKnowledgeBaseTarget, -} - - -class TargetFactory(BaseModel): - config: dict - - def create(self) -> BaseTarget: - target_cls = self._get_target_class() - - return target_cls(**{k: v for k, v in self.config.items() if k != "type"}) - - def _get_target_class(self) -> type[BaseTarget]: - if self.config["type"] in _TARGET_MAP: - target_cls = _TARGET_MAP[self.config["type"]] - else: - target_cls = import_class(self.config["type"], parent_class=BaseTarget) - - return target_cls diff --git a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_evaluation.jinja b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_evaluation.jinja deleted file mode 100644 index 9cd9dd4..0000000 --- a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_evaluation.jinja +++ /dev/null @@ -1,13 +0,0 @@ -Here are the expected results and conversation: - - -{% for result in expected_results -%} -{{ loop.index }}. {{ result }} -{% endfor -%} - - - -{% for sender, message in conversation -%} -{{ sender }}: {{ message }} -{% endfor -%} - \ No newline at end of file diff --git a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_initial_prompt.jinja b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_initial_prompt.jinja deleted file mode 100644 index 832ba37..0000000 --- a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_initial_prompt.jinja +++ /dev/null @@ -1,5 +0,0 @@ -Here is the step: - - -{{ step }} - \ No newline at end of file diff --git a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_test_status.jinja b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_test_status.jinja deleted file mode 100644 index 79ad0df..0000000 --- a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_test_status.jinja +++ /dev/null @@ -1,13 +0,0 @@ -Here are the steps and conversation: - - -{% for step in steps -%} -{{ loop.index }}. {{ step }} -{% endfor -%} - - - -{% for sender, message in conversation -%} -{{ sender }}: {{ message }} -{% endfor -%} - \ No newline at end of file diff --git a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_user_response.jinja b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_user_response.jinja deleted file mode 100644 index 79ad0df..0000000 --- a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/generate_user_response.jinja +++ /dev/null @@ -1,13 +0,0 @@ -Here are the steps and conversation: - - -{% for step in steps -%} -{{ loop.index }}. {{ step }} -{% endfor -%} - - - -{% for sender, message in conversation -%} -{{ sender }}: {{ message }} -{% endfor -%} - \ No newline at end of file diff --git a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_evaluation.jinja b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_evaluation.jinja deleted file mode 100644 index 22cace3..0000000 --- a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_evaluation.jinja +++ /dev/null @@ -1,12 +0,0 @@ -You are a quality assurance engineer evaluating a conversation between an USER and an AGENT. - -Your job is to analyze the conversation in tags and a list of expected results -in tags. - -You will classify the the conversation into the following categories: - -- A: All of the expected results can be observed in the conversation. -- B: Not all of the expected results can be observed in the conversation. - -Please think hard about the response in tags before providing only the category letter -within tags. \ No newline at end of file diff --git a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_initial_prompt.jinja b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_initial_prompt.jinja deleted file mode 100644 index d0e8e23..0000000 --- a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_initial_prompt.jinja +++ /dev/null @@ -1,13 +0,0 @@ -You are role playing as an USER in a conversastion with an AGENT. - -You will be given a step that is wrapped in tags. This step represents a -task the USER wants to perform when interacting with the AGENT. - -Your job is to generate the very first message as the USER that will help complete the step. - -Make sure this message is concise and to the point. - -Do not provide any information if it is expected that the AGENT will eventually ask for it. - -Please think hard about the response in tags before providing the message -within tags. \ No newline at end of file diff --git a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_test_status.jinja b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_test_status.jinja deleted file mode 100644 index 7bb8e6b..0000000 --- a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_test_status.jinja +++ /dev/null @@ -1,13 +0,0 @@ -You are a quality assurance engineer evaluating a conversation between an USER and an AGENT. - -You will be given an ordered list of steps wrapped in tags. Each step represents a task -that the USER wants to perform when interacting with the AGENT. - -Your job is analyze the running conversation in tags and classify it into the following -categories: - -- A: The USER has attempted all the steps. -- B: The USER has not yet attempted all the steps. - -Please think hard about the response in tags before providing only the category letter -within tags. \ No newline at end of file diff --git a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_user_response.jinja b/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_user_response.jinja deleted file mode 100644 index e670420..0000000 --- a/stepfunctions/stepfunctions/agenteval/templates/evaluators/claude_3/system/generate_user_response.jinja +++ /dev/null @@ -1,15 +0,0 @@ -You are role playing as an USER in a conversastion with an AGENT. - -You will be given an ordered list of steps wrapped in tags. Each step represents -a task that the USER wants to perform when interacting with the AGENT. - -Using the list of steps, your job is analyze the running conversation in the - tags and generate the next appropriate response as the USER. - -Do not include any information from a step unless the AGENT asks for it. - -If the AGENT was unable to help or did not understand the last request, just move on to -the next step. Do not attempt to rephrase the request in the next response as the USER. - -Please think hard about the response in tags before providing the response -within tags. Do not include the string "USER:" in your response. \ No newline at end of file diff --git a/stepfunctions/stepfunctions/agenteval/templates/summary/agenteval_summary.md.jinja b/stepfunctions/stepfunctions/agenteval/templates/summary/agenteval_summary.md.jinja deleted file mode 100644 index a624303..0000000 --- a/stepfunctions/stepfunctions/agenteval/templates/summary/agenteval_summary.md.jinja +++ /dev/null @@ -1,49 +0,0 @@ -# Test Summary ---- -This document provides a summary of the tests executed by Agent Evaluation. - -> :warning: This tool utilizes generative AI to assess virtual agents and its evaluations may contain errors. **Please thoroughly examine the results below prior to deciding whether to implement an agent.** ---- -## Tests -{% for test, result in zip(tests, results) -%} -- [{% if result.success %}:green_circle:{% else %}:red_circle:{% endif %} {{ test.name }}](#{{ test.name | replace(' ', '-') }}) -{% endfor %} - ---- - - -{% for test, result in zip(tests, results) -%} -## {% if result.success %}:green_circle:{% else %}:red_circle:{% endif %} {{ test.name }} - -**Steps** -{% for step in test.steps -%} -{{ loop.index }}. {{ step }} -{% endfor %} - -**Expected results** -{% for result in test.expected_results -%} -{{ loop.index }}. {{ result }} -{% endfor %} - -**Conversation** -``` -{% for sender, message in result.conversation -%} -[{{ sender }}] {{ message }} -{% endfor -%} -``` - -**Result** -{{ result.result }} - -**Reasoning** -``` -{{ result.reasoning }} -``` - ---- -{% endfor %} - - - - - diff --git a/stepfunctions/stepfunctions/agenteval/test.py b/stepfunctions/stepfunctions/agenteval/test.py deleted file mode 100644 index 695f2fe..0000000 --- a/stepfunctions/stepfunctions/agenteval/test.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from typing import Optional - -from pydantic import BaseModel - - -class Test(BaseModel, validate_assignment=True): - """A test case for an agent. - - Attributes: - name: Name of the test. - steps: List of step to perform for the test. - expected_results: List of expected results for the test. - initial_prompt: Optional initial prompt. - max_turns: Maximum number of turns allowed for the test. - hook: The module path to an evaluation hook. - """ - - # do not collect as a test - __test__ = False - - name: str - steps: list[str] - expected_results: list[str] - initial_prompt: Optional[str] = None - max_turns: int - hook: Optional[str] = None diff --git a/stepfunctions/stepfunctions/agenteval/test_result.py b/stepfunctions/stepfunctions/agenteval/test_result.py deleted file mode 100644 index 5258aef..0000000 --- a/stepfunctions/stepfunctions/agenteval/test_result.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from pydantic import BaseModel - -from agenteval.conversation import Conversation - - -class TestResult(BaseModel, arbitrary_types_allowed=True): - """The result of a test. - - Attributes: - test_name: Name of the test. - result: Description of the test result. - reasoning: The rationale for the test result. - success: `True` if the test passed, otherwise `False`. - conversation: Captures the interaction between a user and an agent. - """ - - # do not collect as a test - __test__ = False - - test_name: str - result: str - reasoning: str - success: bool - conversation: Conversation diff --git a/stepfunctions/stepfunctions/agenteval/trace.py b/stepfunctions/stepfunctions/agenteval/trace.py deleted file mode 100644 index 25d477a..0000000 --- a/stepfunctions/stepfunctions/agenteval/trace.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -import inspect -import json -import os -from datetime import datetime, timezone -from typing import Optional - -_TRACE_DIR = "agenteval_traces" - - -class Trace: - """Captures steps during evaluation. - - Attributes: - test_name (str): Name of the test. - trace_dir (str): Directory to store the trace. - start_time (datetime): Start time of the trace. - end_time (datetime): End time of the trace. - steps (list): List of steps in the trace. - - """ - - def __init__(self, test_name: str, work_dir: str): - """ - Initialize the trace handler. - - Args: - test_name (str): Name of the trace - """ - self.test_name = test_name - self.trace_dir = os.path.join(work_dir, _TRACE_DIR) - self.start_time = None - self.end_time = None - self.steps = [] - - def __enter__(self): - self.start_time = datetime.now(timezone.utc) - return self - - def __exit__(self, *exc): - self.end_time = datetime.now(timezone.utc) - self._dump_trace() - - def _dump_trace(self): - """Dump the trace to a JSON file.""" - - os.makedirs(self.trace_dir, exist_ok=True) - - with open(os.path.join(self.trace_dir, f"{self.test_name}.json"), "w") as f: - json.dump(self._get_trace(), f, default=str) - - def _get_trace(self) -> str: - return { - "test_name": self.test_name, - "start_time": self.start_time, - "end_time": self.end_time, - "steps": self.steps, - } - - def add_step(self, step_name: Optional[str] = None, **kwargs): - """Add a step to the trace. - - Args: - step_name (str, optional): The name of the step. Defaults to - the name of the caller function - """ - step_name = step_name or inspect.stack()[1].function - step = {"timestamp": datetime.now(timezone.utc), "step_name": step_name} - step.update(kwargs) - self.steps.append(step) diff --git a/stepfunctions/stepfunctions/agenteval/utils/__init__.py b/stepfunctions/stepfunctions/agenteval/utils/__init__.py deleted file mode 100644 index 5f80a10..0000000 --- a/stepfunctions/stepfunctions/agenteval/utils/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from .aws import create_boto3_client -from .imports import import_class - -__all__ = ["import_class", "create_boto3_client"] diff --git a/stepfunctions/stepfunctions/agenteval/utils/aws.py b/stepfunctions/stepfunctions/agenteval/utils/aws.py deleted file mode 100644 index 4d5d4dd..0000000 --- a/stepfunctions/stepfunctions/agenteval/utils/aws.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from typing import Optional - -import boto3 -from botocore.client import BaseClient -from botocore.config import Config - -_RETRY_MODE = "adaptive" - - -def create_boto3_client( - boto3_service_name: str, - aws_profile: Optional[str], - aws_region: Optional[str], - endpoint_url: Optional[str], - max_retry: int, -) -> BaseClient: - """Create a `boto3` client. - - Args: - boto3_service_name (str): The `boto3` service name (e.g `"bedrock-runtime"`). - aws_profile (str, optional): The AWS profile name. - aws_region (str, optional): The AWS region. - endpoint_url (str, optional): The endpoint URL for the AWS service. - max_retry (int, optional): The maximum number of retry attempts. - - Returns: - BaseClient - """ - - config = Config(retries={"max_attempts": max_retry, "mode": _RETRY_MODE}) - - session = boto3.Session(profile_name=aws_profile, region_name=aws_region) - return session.client(boto3_service_name, endpoint_url=endpoint_url, config=config) diff --git a/stepfunctions/stepfunctions/agenteval/utils/imports.py b/stepfunctions/stepfunctions/agenteval/utils/imports.py deleted file mode 100644 index f0e2685..0000000 --- a/stepfunctions/stepfunctions/agenteval/utils/imports.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -from importlib import import_module -from typing import Optional - -_ALLOWED_MODULE_NAME_SUFFIX = ["_hook", "_target"] - - -def import_class(module_path: str, parent_class: Optional[type] = None) -> type: - name, class_name = module_path.rsplit(".", 1) - - # make sure module name starts with one of the allowed suffixes - _validate_module_name(name.split(".")[-1]) - - module = import_module(name) - cls = getattr(module, class_name) - - if parent_class: - # make sure the imported class is a subclass - _validate_subclass(cls, parent_class) - - return cls - - -def _validate_module_name(name: str) -> None: - if not any(name.endswith(suffix) for suffix in _ALLOWED_MODULE_NAME_SUFFIX): - raise ValueError(f"Invalid module name: {name}") - - -def _validate_subclass(child_class: type, parent_class: type) -> None: - if not issubclass(child_class, parent_class): - raise TypeError( - f"{child_class.__name__} is not a {parent_class.__name__} subclass" - ) diff --git a/stepfunctions/stepfunctions/functions/check_agent_status_1/.~c9_invoke_Zi2ZN1.py b/stepfunctions/stepfunctions/functions/check_agent_status_1/.~c9_invoke_Zi2ZN1.py deleted file mode 100644 index 8f677c7..0000000 --- a/stepfunctions/stepfunctions/functions/check_agent_status_1/.~c9_invoke_Zi2ZN1.py +++ /dev/null @@ -1,43 +0,0 @@ -import boto3 -import json -import os - -s3_client = boto3.client('s3') -bedrock_agent = boto3.client('bedrock-agent') - -# from aws_lambda_powertools import Logger, Tracer - -# tracer = Tracer() -# logger = Logger() -def handler(event, context) - - agent_id = event["agent_id"] - - response = bedrock_agent.get_agent( - agentId='string' - ) - - agent_status = response["Agent"]["AgentStatus"] - - return { - 'statusCode': 200, - 'agent_id': agent_id, - 'agent_status': agent_status, - 'agent_name': text["agent_name"], - 'body': scenarios - } - - - - - - - - - - - - - - - diff --git a/stepfunctions/stepfunctions/functions/check_agent_status_2/.~c9_invoke_Zi2ZN1.py b/stepfunctions/stepfunctions/functions/check_agent_status_2/.~c9_invoke_Zi2ZN1.py deleted file mode 100644 index 8f677c7..0000000 --- a/stepfunctions/stepfunctions/functions/check_agent_status_2/.~c9_invoke_Zi2ZN1.py +++ /dev/null @@ -1,43 +0,0 @@ -import boto3 -import json -import os - -s3_client = boto3.client('s3') -bedrock_agent = boto3.client('bedrock-agent') - -# from aws_lambda_powertools import Logger, Tracer - -# tracer = Tracer() -# logger = Logger() -def handler(event, context) - - agent_id = event["agent_id"] - - response = bedrock_agent.get_agent( - agentId='string' - ) - - agent_status = response["Agent"]["AgentStatus"] - - return { - 'statusCode': 200, - 'agent_id': agent_id, - 'agent_status': agent_status, - 'agent_name': text["agent_name"], - 'body': scenarios - } - - - - - - - - - - - - - - -