You could try running some metrics. I use Ollama locally; with its OpenAI-compatible API support you save a lot of money running observability metrics. I just got it working yesterday, so I haven't had time to measure the above.
import phoenix as px
from phoenix.evals import (
    HallucinationEvaluator,
    OpenAIModel,
    QAEvaluator,
    RelevanceEvaluator,
    run_evals,
)
from phoenix.session.evaluation import get_qa_with_reference, get_retrieved_documents
from phoenix.trace import DocumentEvaluations, SpanEvaluations

# Pull the traced Q&A pairs and retrieved documents out of Phoenix
queries_df = get_qa_with_reference(px.Client())
retrieved_documents_df = get_retrieved_documents(px.Client())

# Point the eval model at the local OpenAI-compatible endpoint
eval_model = OpenAIModel(
    api_key="ollama",
    base_url="http://192.168.0.109:1234/v1/",
    model="<model>",
)

hallucination_evaluator = HallucinationEvaluator(eval_model)
qa_correctness_evaluator = QAEvaluator(eval_model)
relevance_evaluator = RelevanceEvaluator(eval_model)

# Hallucination and QA correctness run over the question/answer spans
hallucination_eval_df, qa_correctness_eval_df = run_evals(
    dataframe=queries_df,
    evaluators=[hallucination_evaluator, qa_correctness_evaluator],
    provide_explanation=True,
)

# Relevance runs over the retrieved documents
relevance_eval_df = run_evals(
    dataframe=retrieved_documents_df,
    evaluators=[relevance_evaluator],
    provide_explanation=True,
)[0]

# Log everything back to Phoenix so it shows up in the UI
px.Client().log_evaluations(
    SpanEvaluations(eval_name="Hallucination", dataframe=hallucination_eval_df),
    SpanEvaluations(eval_name="QA Correctness", dataframe=qa_correctness_eval_df),
    DocumentEvaluations(eval_name="Relevance", dataframe=relevance_eval_df),
)
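For context, the snippet above assumes Phoenix is already running and your RAG app is instrumented so its traces land there; here's a minimal sketch of that setup, assuming you run Phoenix in-process with launch_app (use whatever instrumentation fits your stack):

# Setup sketch (assumption: Phoenix runs in-process and the RAG app has already
# been instrumented, so spans exist before the evals above have anything to score)
import phoenix as px

session = px.launch_app()  # starts the local Phoenix UI and trace collector
print(session.url)         # open this to browse spans and the logged evaluations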