Hello, I am working on a test RAG system using LlamaParse, and I am having some issues with the output. I want to know whether the data I am parsing (test data I am using to create a template for my actual data) is the problem, or whether the errors I am getting come from something else.
Attached is a CSV file with a small sample of the data I am working with.
My code looks as follows:
import chromadb
from llama_index.core import Settings, SimpleDirectoryReader, StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_parse import LlamaParse

def indexing_function():
    Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-large")
    Settings.llm = OpenAI(model="gpt-4o")
    Settings.chunk_size = 250
    Settings.chunk_overlap = 50
    # LlamaParse converts the CSV to text/markdown before indexing
    parser = LlamaParse(verbose=True, premium_mode=True, show_progress=True)
    csv_file_extractor = {".csv": parser}
    db = chromadb.PersistentClient(path=db_path)  # db_path is defined elsewhere
    test_documentation = SimpleDirectoryReader(
        "./CSV Data", file_extractor=csv_file_extractor
    ).load_data()
    docs_collection = db.get_or_create_collection("docs")
    docs_vector_store = ChromaVectorStore(chroma_collection=docs_collection)
    docs_storage_context = StorageContext.from_defaults(vector_store=docs_vector_store)
    index = VectorStoreIndex.from_documents(
        test_documentation,
        storage_context=docs_storage_context,
        embed_model=OpenAIEmbedding(model="text-embedding-3-large"),
        # note: the overlap here (30) differs from Settings.chunk_overlap (50) above
        transformations=[SentenceSplitter(chunk_size=250, chunk_overlap=30)],
        show_progress=True,
    )
    return index
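A quick aside before the rest of the code: in case LlamaParse is the culprit, one alternative I was considering for the CSV step is building one Document per row with pandas, so that each control always stays inside a single chunk. This is only a sketch (load_csv_as_documents is a name I made up):

import pandas as pd
from llama_index.core import Document

def load_csv_as_documents(csv_path):
    # One Document per CSV row so a single control is never split across chunks
    df = pd.read_csv(csv_path)
    return [
        Document(text=", ".join(f"{col}: {row[col]}" for col in df.columns))
        for _, row in df.iterrows()
    ]

Back to my actual code: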
from llama_index.core.query_engine import CitationQueryEngine

def citation_engine(llm_4, index):
    # note: from_args has no ToolMetadata parameter; the tool name/description
    # belong on the QueryEngineTool below, so the metadata_mode argument is dropped
    data_engine = CitationQueryEngine.from_args(
        index,
        llm=llm_4,
        similarity_top_k=2,
    )
    return data_engine
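As a sanity check, I was also planning to query the citation engine directly, bypassing the agent, to see whether the index itself can answer:

# Sanity check: hit the citation engine directly, without the ReAct agent
index = indexing_function()
llm_4 = OpenAI(model="gpt-4o")
engine = citation_engine(llm_4, index)
direct = engine.query('Give me information on the Control "AC-1".')
print(direct)
for src in direct.source_nodes:
    print(src.node.get_content()[:200])  # what was actually retrieved

Continuing with my code: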
from llama_index.core.tools import QueryEngineTool, ToolMetadata

def query_engine(data_engine):
    # Wrap the citation engine as a tool so the ReAct agent can call it
    query_tool = QueryEngineTool(
        query_engine=data_engine,
        metadata=ToolMetadata(
            name="docs",
            description="Information regarding FedRAMP Controls",
        ),
    )
    query_engine_tools = [query_tool]
    return query_engine_tools
from llama_index.core.agent import ReActAgent
from llama_index.core.indices.query.query_transform import DecomposeQueryTransform
from llama_index.core.query_engine import MultiStepQueryEngine

def rag_model(prompt, query_engine_tools, llm_4):
    # LLMPredictor is deprecated in recent llama_index releases;
    # DecomposeQueryTransform takes the LLM directly
    decompose_transform = DecomposeQueryTransform(llm=llm_4, verbose=True)
    react_agent = ReActAgent.from_tools(
        tools=query_engine_tools,
        llm=llm_4,
        verbose=True,
    )
    # The ReAct agent is used as the underlying engine for multi-step querying
    query_engine = MultiStepQueryEngine(
        query_engine=react_agent,
        query_transform=decompose_transform,
    )
    response = query_engine.query(prompt)
    return response  # the original version never returned the response
def main(prompt, query_engine_tools, llm_4):
    response = rag_model(prompt, query_engine_tools, llm_4)
    print(response)  # actually surface the answer
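That is all of the code. While debugging, I also wanted to test retrieval on its own, independent of the engines and the agent, with something roughly like:

# Check whether "AC-1" is retrievable at all, separate from any query engine
retriever = index.as_retriever(similarity_top_k=5)
for node_with_score in retriever.retrieve("AC-1"):
    print(node_with_score.score, node_with_score.node.get_content()[:200])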
Let's say I ask a basic question such as prompt = '''Give me information on the Control "AC-1".'''
In the observation stage, all I get is: Observation: None of the provided sources contain information about AC-1. The agent then just suggests searching for the documentation online.
I've tried different chunk sizes, overlap values, and so on.
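I also wondered whether LlamaParse mangles the CSV before chunking ever happens, so my next step was going to be printing the raw parsed text (test_documentation is the list that load_data() returns inside indexing_function, so I would have to expose it first):

# Inspect what LlamaParse actually produced from the CSV
for doc in test_documentation:
    print(repr(doc.text[:300]))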
Thanks for your time.