from llama_index.core.evaluation import (
    CorrectnessEvaluator,
    SemanticSimilarityEvaluator,
    RelevancyEvaluator,
    FaithfulnessEvaluator,
    PairwiseComparisonEvaluator,
    BatchEvalRunner,
)
from llama_index.core.evaluation.eval_utils import (
    get_responses,
    get_results_df,
)
from llama_index.core.query_engine import RetrieverQueryEngine
import numpy as np

# Instantiate the evaluators, sharing the same judge LLM where one is required.
evaluator_c = CorrectnessEvaluator(llm=eval_llm)
evaluator_s = SemanticSimilarityEvaluator()
evaluator_r = RelevancyEvaluator(llm=eval_llm)
evaluator_f = FaithfulnessEvaluator(llm=eval_llm)
pairwise_evaluator = PairwiseComparisonEvaluator(llm=eval_llm)

# Pull evaluation questions and reference answers from the dataset.
max_samples = 5
eval_qs = eval_dataset.questions
qr_pairs = eval_dataset.qr_pairs
ref_response_strs = [r for (_, r) in qr_pairs]

# Baseline: plain top-k retrieval. Comparison: ensemble retriever with a reranker.
base_query_engine = vector_indices[-1].as_query_engine(similarity_top_k=2)
query_engine = RetrieverQueryEngine(retriever, node_postprocessors=[reranker])

# Generate predictions from both query engines over the sampled questions.
base_pred_responses = get_responses(
    eval_qs[:max_samples], base_query_engine, show_progress=True
)
pred_responses = get_responses(
    eval_qs[:max_samples], query_engine, show_progress=True
)

pred_response_strs = [str(p) for p in pred_responses]
base_pred_response_strs = [str(p) for p in base_pred_responses]

# Run correctness, faithfulness, and semantic-similarity evaluations in batch.
evaluator_dict = {
    "correctness": evaluator_c,
    "faithfulness": evaluator_f,
    "semantic_similarity": evaluator_s,
}
batch_runner = BatchEvalRunner(evaluator_dict, workers=1, show_progress=True)

eval_results = await batch_runner.aevaluate_responses(
    queries=eval_qs[:max_samples],
    responses=pred_responses[:max_samples],
    reference=ref_response_strs[:max_samples],
)
base_eval_results = await batch_runner.aevaluate_responses(
    queries=eval_qs[:max_samples],
    responses=base_pred_responses[:max_samples],
    reference=ref_response_strs[:max_samples],
)

# Summarize both runs side by side in a single dataframe.
results_df = get_results_df(
    [eval_results, base_eval_results],
    ["Ensemble Retriever", "Base Retriever"],
    ["correctness", "faithfulness", "semantic_similarity"],
)
display(results_df)
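
# Optional follow-up (a minimal sketch, not part of the original script):
# BatchEvalRunner.aevaluate_responses returns a dict mapping each evaluator name
# to a list of EvaluationResult objects, so per-query scores can be inspected
# directly, assuming `eval_results` from the run above is in scope.
for name, results in eval_results.items():
    scores = [r.score for r in results if r.score is not None]
    print(name, float(np.mean(scores)) if scores else "n/a")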