from llama_index.core.evaluation import (
    BatchEvalRunner,
    CorrectnessEvaluator,
    FaithfulnessEvaluator,
    PairwiseComparisonEvaluator,
    RelevancyEvaluator,
    SemanticSimilarityEvaluator,
)
from llama_index.core.evaluation.eval_utils import (
    get_responses,
    get_results_df,
)
from llama_index.core.query_engine import RetrieverQueryEngine
import numpy as np

# Set up the evaluators; the LLM-judged ones share the same judge model,
# while semantic similarity compares embeddings against the reference answer.
evaluator_c = CorrectnessEvaluator(llm=eval_llm)
evaluator_s = SemanticSimilarityEvaluator()
evaluator_r = RelevancyEvaluator(llm=eval_llm)
evaluator_f = FaithfulnessEvaluator(llm=eval_llm)
pairwise_evaluator = PairwiseComparisonEvaluator(llm=eval_llm)
# Evaluate on a small sample of questions to keep the run cheap.
max_samples = 5

eval_qs = eval_dataset.questions
qr_pairs = eval_dataset.qr_pairs
ref_response_strs = [r for (_, r) in qr_pairs]
# Baseline: plain top-k vector retrieval; candidate: the ensemble retriever
# with the reranker applied as a node postprocessor.
base_query_engine = vector_indices[-1].as_query_engine(similarity_top_k=2)
query_engine = RetrieverQueryEngine(retriever, node_postprocessors=[reranker])
base_pred_responses = get_responses(
    eval_qs[:max_samples], base_query_engine, show_progress=True
)
pred_responses = get_responses(
    eval_qs[:max_samples], query_engine, show_progress=True
)

pred_response_strs = [str(p) for p in pred_responses]
base_pred_response_strs = [str(p) for p in base_pred_responses]
# Batch-run correctness, faithfulness, and semantic similarity over the sample.
evaluator_dict = {
    "correctness": evaluator_c,
    "faithfulness": evaluator_f,
    "semantic_similarity": evaluator_s,
}
batch_runner = BatchEvalRunner(evaluator_dict, workers=1, show_progress=True)
eval_results = await batch_runner.aevaluate_responses(
    queries=eval_qs[:max_samples],
    responses=pred_responses[:max_samples],
    reference=ref_response_strs[:max_samples],
)
base_eval_results = await batch_runner.aevaluate_responses(
    queries=eval_qs[:max_samples],
    responses=base_pred_responses[:max_samples],
    reference=ref_response_strs[:max_samples],
)
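# Optional sanity check before aggregating (illustrative sketch, not part of
# the pipeline): aevaluate_responses returns a dict mapping each evaluator
# name to a list of EvaluationResult objects, so per-question scores and the
# judge's feedback can be inspected directly.
mean_correctness = np.mean([r.score for r in eval_results["correctness"]])
print(f"Mean correctness (ensemble): {mean_correctness:.2f}")
print(eval_results["correctness"][0].feedback)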
results_df = get_results_df(
    [eval_results, base_eval_results],
    ["Ensemble Retriever", "Base Retriever"],
    ["correctness", "faithfulness", "semantic_similarity"],
)
display(results_df)
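# Hedged sketch: pairwise_evaluator is instantiated above but not used in the
# batch run. One way to use it is a head-to-head comparison of the ensemble
# answer vs. the baseline answer on each question. This assumes the
# aevaluate(query=..., response=..., second_response=...) call signature,
# where a score of 1.0 favors the first response, 0.0 the second, 0.5 a tie.
pairwise_scores = []
for idx in range(max_samples):
    pairwise_result = await pairwise_evaluator.aevaluate(
        query=eval_qs[idx],
        response=pred_response_strs[idx],
        second_response=base_pred_response_strs[idx],
    )
    pairwise_scores.append(pairwise_result.score)
print(f"Ensemble preferred over base (avg score): {np.mean(pairwise_scores):.2f}")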