Find answers from the community

Updated 10 months ago

Question:

Question:

I have a bunch of documents in a vector store, and when I do retrieval from the store, the score is the same for all chunks received from each document: 5 chunks from docA have the exact same score, then 5 chunks from docB have the exact same score. As I understand it, the embedding is on document level. But then how can I know which of the 5 chunks from docA really has the best match score? I could do reranking... but let's say I do top_k 200 and I have 3 documents with 100 chunks each: docA returns 100 chunks with the same score and docB returns 100 chunks with the same score, but in reality 1 chunk of the 100 chunks from docC is the "right" chunk... I don't understand.
L
h
9 comments
That sounds... suspicious? Each chunk should have a unique embedding? Unless you are populating your vector store in some custom way
I am doing it like this:

def RunPipe(Documents, Embed_model, Redis_host, Redis_port):
    """Run the Redis-backed ingestion pipeline over ``Documents``.

    The pipeline has a single transformation (``Embed_model``), a Redis
    document store using the ``UPSERTS`` docstore strategy, and a Redis-backed
    ingestion cache.

    Args:
        Documents: Documents passed to ``IngestionPipeline.run``.
        Embed_model: The only transformation placed in the pipeline.
        Redis_host: Redis host used for both the docstore and the cache.
        Redis_port: Redis port used for both the docstore and the cache.

    Returns:
        The nodes returned by ``pipeline.run``.

    Note:
        NOTE(review): no splitter is among the transformations, so the nodes
        returned here were observed to already carry an embedding. Callers
        that split these nodes into chunks afterwards should make sure the
        chunks get their own embeddings -- confirm against the caller.
    """
    # Imports are kept function-local, as in the original snippet.
    from llama_index.ingestion import (
        DocstoreStrategy,
        IngestionCache,
        IngestionPipeline,
    )
    from llama_index.ingestion.cache import RedisCache
    from llama_index.storage.docstore import RedisDocumentStore

    docstore = RedisDocumentStore.from_host_and_port(
        host=Redis_host,
        port=Redis_port,
        namespace="REDIS_CACHE_DOCSTORE",
    )
    cache = IngestionCache(
        cache=RedisCache.from_host_and_port(
            host=Redis_host,
            port=Redis_port,
        ),
        collection="REDIS_CACHE_INGESTION",
    )

    pipeline = IngestionPipeline(
        transformations=[Embed_model],
        docstore=docstore,
        docstore_strategy=DocstoreStrategy.UPSERTS,
        cache=cache,
    )

    return pipeline.run(documents=Documents, show_progress=True)


# Run the ingestion pipeline; it returns the nodes it processed this run.
nodes = RunPipe(
    Documents=documents,
    Redis_host=Settings['REDIS_HOST'],
    Redis_port=Settings['REDIS_PORT'],
    Embed_model=Settings['Embed_Model'],
)
print(f"Ingested {len(nodes)} Nodes")

# Check if we have new ingested nodes (new or updated).
# The id set is built once instead of rebuilding a list for every document.
ingested_ids = {node.doc_id for node in nodes}
filtered_documents = [doc for doc in documents if doc.doc_id in ingested_ids]
print(f"New Documents to process {len(filtered_documents)} Nodes")
from llama_index.node_parser import SentenceSplitter
from llama_index import VectorStoreIndex

if filtered_documents:

    print("new documents to process")

    # The splitter configuration does not depend on the document, so build it once.
    splitter = SentenceSplitter(
        chunk_size=256,
        chunk_overlap=0,
        include_metadata=True,
    )

    for doc in nodes:
        # If an earlier version of this document is stored, remove it first so
        # the re-added chunks do not coexist with stale ones.
        if Document_exist(
            RedisClient=Settings['Redis_Client'],
            collection='REDIS_DOCSTORE/ref_doc_info',
            key=doc.doc_id,
        ):
            print(f'document: {doc.doc_id} Already in store, Deleting')

            # Deletions are best-effort: report the failure and keep going.
            try:
                Delete_fromVectorStore(
                    RedisClient=Settings['Redis_Client'],
                    IndexName='REDIS_VECTOR_STORE',
                    docID=doc.doc_id,
                )
                print("* Document Deleted from vectorStore")
            except Exception as e:
                # `__name__` (not `.name`): exception classes have no `name`
                # attribute, so the old message itself raised AttributeError.
                print(f'An error occurred Deleteing From Vectorstore: {type(e).__name__} - {e}')

            try:
                Settings['Storage_context_DOC'].docstore.delete_ref_doc(doc.doc_id)
                print("* Document Deleted from DocStore")
            except Exception as e:
                print(f'An error occurred Deleteing From DocStore: {type(e).__name__} - {e}')

        # Always add documents to the docstore and the vector store.
        print(f'document: {doc.doc_id} Does not exist in Store, Adding...')

        parsed_nodes = splitter.get_nodes_from_documents(documents=[doc])

        # The nodes coming out of the pipeline already carry an embedding, and
        # the chunks derived from them were observed to keep it, which made
        # every chunk of a document score identically at retrieval time.
        # Clearing it lets each chunk be embedded on its own when indexed.
        for chunk in parsed_nodes:
            chunk.embedding = None

        Settings['Storage_context_DOC'].docstore.add_documents(parsed_nodes)

        # Vector store: building the index embeds and stores the chunks.
        VectorStoreIndex(
            parsed_nodes,
            storage_context=Settings['Storage_context_DOC'],
            service_context=Settings['Service_context'],
        )

else:
    print("no changes in index")
sorry for the ugly code and wall of text
I figured it out: the returned nodes from the cache run had embeddings, and when I ingested those nodes in my other function they didn't get replaced, but when I set `.embedding` to None it worked.
Could that also be my issue with the BM25 retriever, that the node has an embedding property (key)... even if it's empty?
It says BM25 does not work with embeddings
Yep, that's it:

def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
if query_bundle.custom_embedding_strs or query_bundle.embedding:
logger.warning("BM25Retriever does not support embeddings, skipping...")
Add a reply
Sign up and join the conversation on Discord