That sounds... suspicious? Each chunk should have a unique embedding? Unless you are populating your vector store in some custom way
I am doing it like this:
def RunPipe(Documents, Embed_model, Redis_host, Redis_port):
    """Embed *Documents* through a Redis-backed llama_index ingestion pipeline.

    The pipeline deduplicates against a Redis docstore (UPSERTS strategy)
    and caches transformation results in Redis, so only new or changed
    documents are re-embedded.  Returns the nodes produced by the run.
    """
    from llama_index.ingestion import (
        DocstoreStrategy,
        IngestionPipeline,
        IngestionCache,
    )
    from llama_index.ingestion.cache import RedisCache
    from llama_index.storage.docstore import RedisDocumentStore
    from llama_index.vector_stores import RedisVectorStore
    from llama_index.storage.index_store import RedisIndexStore

    # Docstore used by the UPSERTS strategy to detect new/updated docs.
    doc_store = RedisDocumentStore.from_host_and_port(
        host=Redis_host,
        port=Redis_port,
        namespace="REDIS_CACHE_DOCSTORE",
    )
    # Cache of transformation outputs keyed by input hash.
    ingest_cache = IngestionCache(
        cache=RedisCache.from_host_and_port(host=Redis_host, port=Redis_port),
        collection="REDIS_CACHE_INGESTION",
    )
    ingest_pipeline = IngestionPipeline(
        transformations=[Embed_model],
        docstore=doc_store,
        docstore_strategy=DocstoreStrategy.UPSERTS,
        cache=ingest_cache,
    )
    return ingest_pipeline.run(documents=Documents, show_progress=True)
# Run the ingestion pipeline; `nodes` holds whatever the pipeline produced.
nodes = RunPipe(
    Documents=documents,
    Embed_model=Settings['Embed_Model'],
    Redis_host=Settings['REDIS_HOST'],
    Redis_port=Settings['REDIS_PORT'],
)
print(f"Ingested {len(nodes)} Nodes")
Check whether we have newly ingested nodes (new or updated):
from llama_index.node_parser import SentenceSplitter
from llama_index import VectorStoreIndex

# Keep only the source documents whose ids appear in the pipeline output.
# Build the id set once: the original rebuilt the node-id list for every
# document, an accidental O(n*m) scan; a set gives O(1) membership tests.
ingested_ids = {node.doc_id for node in nodes}
filtered_documents = [doc for doc in documents if doc.doc_id in ingested_ids]
print(f"New Documents to process {len(filtered_documents)} Nodes")
if filtered_documents:
    print("new documents to process")
    # NOTE(review): this loops over `nodes` (all pipeline output), not
    # `filtered_documents` — confirm that is intentional.
    for doc in nodes:
        # If the doc is already in the docstore, delete the stale copy
        # from both stores before re-adding (manual upsert).
        if Document_exist(
                RedisClient=Settings['Redis_Client'],
                collection='REDIS_DOCSTORE/ref_doc_info',
                key=doc.doc_id):
            print(f'document: {doc.doc_id} Already in store, Deleting')
            try:
                Delete_fromVectorStore(
                    RedisClient=Settings['Redis_Client'],
                    IndexName='REDIS_VECTOR_STORE',
                    docID=doc.doc_id,
                )
                print("* Document Deleted from vectorStore")
            except Exception as e:
                # BUG FIX: `type(e).name` raises AttributeError — the
                # exception class name is exposed as `__name__`.
                print(f'An error occurred Deleteing From Vectorstore: {type(e).__name__} - {e}')
            try:
                Settings['Storage_context_DOC'].docstore.delete_ref_doc(doc.doc_id)
                print("* Document Deleted from DocStore")
            except Exception as e:
                # BUG FIX: same `__name__` fix as above.
                print(f'An error occurred Deleteing From DocStore: {type(e).__name__} - {e}')
        ## Always add documents to docstore and vector store (runs for every
        ## doc, including those just deleted above — i.e. an upsert).
        print(f'document: {doc.doc_id} Does not exist in Store, Adding...')
        parsed_nodes = SentenceSplitter(
            chunk_size=256,
            chunk_overlap=0,
            include_metadata=True).get_nodes_from_documents(documents=[doc])
        Settings['Storage_context_DOC'].docstore.add_documents(parsed_nodes)
        # vector Store: indexing the nodes writes their embeddings to Redis.
        VectorStoreIndex(
            parsed_nodes,
            storage_context=Settings['Storage_context_DOC'],
            service_context=Settings['Service_context']
        )
else:
    print("no changes in index")
sorry for the ugly code and wall of text
I figured it out: the returned nodes from the cache run had embeddings, and when I ingested those nodes in my other function they didn't get replaced — but when I set `.embedding` to `None`, it worked.
Could that be my issue with the BM25 retriever — that the node has an embedding property (key), even if it's empty?
It says BM25 does not work with embeddings.
Yep, that's it:
def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
if query_bundle.custom_embedding_strs or query_bundle.embedding:
logger.warning("BM25Retriever does not support embeddings, skipping...")