
Exact Match Evaluation

The simplest type of evaluation is a direct string comparison between the model's output and the reference answer. LangChain has a prebuilt "exact_match" evaluator you can use, or you can implement the same check yourself with a custom evaluator.

You can check out the example results here.
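
Under the hood, an exact-match check amounts to a plain string equality test. A minimal sketch (not the library's implementation; the whitespace stripping here is an assumption):

def exact_match(prediction: str, reference: str) -> bool:
    # Strip surrounding whitespace so a trailing newline doesn't cause a false negative
    return prediction.strip() == reference.strip()


assert exact_match("1776\n", "1776")
assert not exact_match("It was 1776.", "1776")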

Setup

# %pip install -U --quiet langchain langchain_openai
import os

# Update with your API URL if using a hosted instance of LangSmith.
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
# Update with your API key
os.environ["LANGCHAIN_API_KEY"] = "YOUR API KEY"
os.environ["OPENAI_API_KEY"] = "YOUR OPENAI API KEY"

Create Dataset

First, create a simple dataset of input and expected output pairs.

import langsmith

client = langsmith.Client()
dataset_name = "Oracle of Exactness"
if not client.has_dataset(dataset_name=dataset_name):
    ds = client.create_dataset(dataset_name)
    client.create_examples(
        inputs=[
            {
                "prompt_template": "State the year of the declaration of independence."
                "Respond with just the year in digits, nothing else"
            },
            {"prompt_template": "What's the average speed of an unladen swallow?"},
        ],
        outputs=[{"output": "1776"}, {"output": "5"}],
        dataset_id=ds.id,
    )
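
If you want to sanity-check what was created, you can list the examples back out of the dataset (a quick optional check; it assumes the dataset above already exists in your workspace):

# Optional: print the examples stored in the dataset
for example in client.list_examples(dataset_name=dataset_name):
    print(example.inputs, "->", example.outputs)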

Evaluate

from langchain.smith import RunEvalConfig
from langchain_openai import ChatOpenAI
from langsmith.evaluation import EvaluationResult, run_evaluator

model = "gpt-3.5-turbo"


# This is the model/system you want to evaluate
def predict_result(input_: dict) -> dict:
    response = ChatOpenAI(model=model).invoke(input_["prompt_template"])
    return {"output": response.content}


@run_evaluator
def compare_label(run, example) -> EvaluationResult:
    # Custom evaluators let you define how "exact" the match ought to be.
    # They also let you flexibly pick which fields to compare.
    prediction = run.outputs.get("output") or ""
    target = example.outputs.get("output") or ""
    match = bool(prediction) and prediction == target
    return EvaluationResult(key="matches_label", score=match)


# This defines how metrics about the model's performance are generated
eval_config = RunEvalConfig(
    evaluators=["exact_match"],  # the equivalent prebuilt evaluator
    custom_evaluators=[compare_label],
)

client.run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=predict_result,
    evaluation=eval_config,
    verbose=True,
    project_metadata={"version": "1.0.0", "model": model},
)

View the evaluation results for project 'impressionable-crew-29' at:
https://smith.langchain.com/o/30239cd8-922f-4722-808d-897e1e722845/datasets/4f23ec54-3cf8-44fc-a729-ce08ad855bfd/compare?selectedSessions=a0672ba4-e513-4fef-84b8-bab439581721

View all tests for Dataset Oracle of Exactness at:
https://smith.langchain.com/o/30239cd8-922f-4722-808d-897e1e722845/datasets/4f23ec54-3cf8-44fc-a729-ce08ad855bfd
[------------------------------------------------->] 2/2
Experiment Results:
        feedback.exact_match  feedback.matches_label  error  execution_time                                run_id
count               2.000000                       2      0        2.000000                                     2
unique                   NaN                       2      0             NaN                                     2
top                      NaN                   False    NaN             NaN  2b4532af-445e-46aa-8170-d34c3af724a8
freq                     NaN                       1    NaN             NaN                                     1
mean                0.500000                     NaN    NaN        0.545045                                   NaN
std                 0.707107                     NaN    NaN        0.265404                                   NaN
min                 0.000000                     NaN    NaN        0.357376                                   NaN
25%                 0.250000                     NaN    NaN        0.451211                                   NaN
50%                 0.500000                     NaN    NaN        0.545045                                   NaN
75%                 0.750000                     NaN    NaN        0.638880                                   NaN
max                 1.000000                     NaN    NaN        0.732714                                   NaN
{'project_name': 'impressionable-crew-29',
'results': {'893730f0-393d-4c40-92f9-16ce24aaec1f': {'input': {'prompt_template': "What's the average speed of an unladen swallow?"},
'feedback': [EvaluationResult(key='exact_match', score=0, value=None, comment=None, correction=None, evaluator_info={'__run': RunInfo(run_id=UUID('089a016a-d847-4a26-850c-afc0e78879d5'))}, source_run_id=None, target_run_id=None),
EvaluationResult(key='matches_label', score=False, value=None, comment=None, correction=None, evaluator_info={}, source_run_id=None, target_run_id=None)],
'execution_time': 0.732714,
'run_id': '2b4532af-445e-46aa-8170-d34c3af724a8',
'output': {'output': 'The average speed of an unladen European swallow is approximately 20.1 miles per hour (32.4 km/h).'},
'reference': {'output': '5'}},
 'ec9d8754-d264-4cec-802e-0c33513843d8': {'input': {'prompt_template': 'State the year of the declaration of independence.Respond with just the year in digits, nothing else'},
'feedback': [EvaluationResult(key='exact_match', score=1, value=None, comment=None, correction=None, evaluator_info={'__run': RunInfo(run_id=UUID('cd4c7ede-f367-4d9c-b424-577bf054bf21'))}, source_run_id=None, target_run_id=None),
EvaluationResult(key='matches_label', score=True, value=None, comment=None, correction=None, evaluator_info={}, source_run_id=None, target_run_id=None)],
'execution_time': 0.357376,
'run_id': '82b65c5c-bfbf-4d2b-9c05-3bbd1cd4e711',
'output': {'output': '1776'},
'reference': {'output': '1776'}}}}
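
As the results show, the swallow question fails both checks: the model answers with a full sentence, so it can never equal the bare reference string "5". If you want a looser notion of "exact", a custom evaluator gives you room to normalize both sides before comparing. A sketch of one possible variant, reusing the imports from the cell above (the normalization rules here are an assumption, not part of the example above):

@run_evaluator
def compare_label_normalized(run, example) -> EvaluationResult:
    # A looser variant: ignore case and surrounding whitespace before comparing
    prediction = (run.outputs.get("output") or "").strip().lower()
    target = (example.outputs.get("output") or "").strip().lower()
    return EvaluationResult(key="matches_label_normalized", score=prediction == target)

You could add this to custom_evaluators alongside compare_label and re-run run_on_dataset to compare the two metrics side by side.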

