from agensight.eval.metrics import GEvalEvaluator
from agensight.eval.test_case import ModelTestCase
# Example: score one generated code snippet with a GEvalEvaluator metric.

# Metric configuration: a display name, the free-text criteria the evaluator
# is given, and a 0.8 threshold.
# NOTE(review): the threshold is presumably the minimum passing score --
# confirm against the agensight documentation.
correctness_metric = GEvalEvaluator(
    name="Code Correctness",
    criteria=(
        "Evaluate whether the generated code correctly implements "
        "the specified requirements."
    ),
    threshold=0.8,
)

# The sample under evaluation: the prompt, the output that was actually
# produced for it, and a description of the expected output.
test_case = ModelTestCase(
    input="Write a function to add two numbers.",
    actual_output="def add(a, b): return a + b",
    expected_output="A function that correctly adds two numbers.",
)

# Evaluate the test case; afterwards the `score` and `reason` attributes are
# read from the metric object itself and printed.
correctness_metric.measure(test_case)
print(
    correctness_metric.score,
    correctness_metric.reason,
)