Hi, I want to use Chroma DB instead of the in-memory document store (InMemoryDocumentStore) to generate test data with RAGAS. My current setup with InMemoryDocumentStore is below; a sketch of how I picture the Chroma side follows the code.
from llama_index.core import SimpleDirectoryReader  # llama_index >= 0.10; older versions: from llama_index import SimpleDirectoryReader
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langchain.text_splitter import TokenTextSplitter
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.testset.extractor import KeyphraseExtractor
from ragas.testset.docstore import InMemoryDocumentStore
from ragas.testset import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context

# Load the source documents with LlamaIndex
documents = SimpleDirectoryReader(
    input_files=[
        "../data/Xceed Fraud Detection Reference Manual_01March2023_Track Changes.docx",
        "../data/FraudDESK Guide - ACH_v6.pdf",
        "../data/FraudDESK Guide_Online Banking_v6.pdf",
        "../data/FraudUseCases_13July2023.xlsx",
    ]
).load_data()
# Azure OpenAI chat model wrapped for RAGAS
azure_model = LangchainLLMWrapper(AzureChatOpenAI(
    model=config.chatgpt_model,
    azure_deployment=config.openai_deployment_id,
    api_key=config.openai_api_key,
    azure_endpoint=config.openai_api_base,
    api_version=config.openai_api_version,
))
# Azure OpenAI embeddings wrapped for RAGAS
embed_model = LangchainEmbeddingsWrapper(AzureOpenAIEmbeddings(
    model=config.embed_model,
    azure_deployment=config.embed_model_deployment_id,
    api_key=config.openai_api_key,
    azure_endpoint=config.openai_api_base,
    api_version=config.openai_api_version,
))
generator_llm = azure_model
critic_llm = azure_model
splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=100)
keyphrase_extractor = KeyphraseExtractor(llm=generator_llm)
# In-memory document store that I want to replace with Chroma
docstore = InMemoryDocumentStore(
    splitter=splitter,
    embeddings=embed_model,
    extractor=keyphrase_extractor,
)
# Test set generator wired to the Azure models and the in-memory docstore
test_generator = TestsetGenerator(
    generator_llm=generator_llm,
    critic_llm=critic_llm,
    embeddings=embed_model,
    docstore=docstore,
)
# Generate a small synthetic test set from a slice of the loaded documents
testset = test_generator.generate_with_llamaindex_docs(
    documents=documents[5:6],
    test_size=2,
    distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25},
)
print(testset)
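
For reference, this is roughly how I picture the Chroma side, reusing the same Azure embeddings. It's only a minimal sketch assuming the chromadb and langchain_chroma packages, with an example path and collection name; I don't see a Chroma-backed docstore in RAGAS, which is exactly the part I'm asking about.

import chromadb
from langchain_chroma import Chroma

# Persistent local Chroma client (example path)
chroma_client = chromadb.PersistentClient(path="../chroma_db")

# LangChain vector store over a Chroma collection, using the raw Azure embeddings
# (not the RAGAS LangchainEmbeddingsWrapper)
chroma_store = Chroma(
    client=chroma_client,
    collection_name="ragas_testset_docs",
    embedding_function=AzureOpenAIEmbeddings(
        model=config.embed_model,
        azure_deployment=config.embed_model_deployment_id,
        api_key=config.openai_api_key,
        azure_endpoint=config.openai_api_base,
        api_version=config.openai_api_version,
    ),
)

Is there a supported way to pass something like this (or a Chroma collection) to TestsetGenerator in place of InMemoryDocumentStore, or would I need a custom document store implementation?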