from agensight.eval.metrics import TaskCompletionMetric
from agensight.eval.test_case import ModelTestCase, ToolCall

# Define the metric
task_metric = TaskCompletionMetric(
    threshold=0.7,        # minimum score (0-1) the test case must reach to pass
    model="gpt-4o-mini",  # LLM used as the evaluation judge
    include_reason=True   # attach a natural-language explanation to the result
)

# Create a test case
test_case = ModelTestCase(
    input="Develop a Python script to automate data entry tasks.",
    actual_output="The script automates data entry using pandas and openpyxl.",
    tools_called=[
        ToolCall(
            name="DataEntryBot",
            description="Automates data entry tasks using Python libraries.",
            input_parameters={"library": "pandas", "task": "data entry"},
            output=["Data entry automated using pandas and openpyxl."]
        )
    ]
)

# Run the evaluation; measure() populates the metric's score and reason
task_metric.measure(test_case)
print(task_metric.score, task_metric.reason)