Find answers from the community

Updated 3 months ago

and it's so time-consuming to generate

and it's so time-consuming to generate nodes every time on the same data
B
a
W
10 comments
what vector store are you using? Most vectorstores have disk persistence options
I'm using BM25 + a vector retriever created from the index
Not sure how to create a vector store using a hybrid retriever
# Vector retriever over the existing index, returning the 2 most similar nodes.
vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=2)

# vector query engine
vector_query_engine = RetrieverQueryEngine(
    retriever=vector_retriever,
    response_synthesizer=response_synthesizer,
)


# Build the vector index from the already-generated nodes.
index = VectorStoreIndex(
    nodes=nodes,
    storage_context=storage_context,
    service_context=service_context,
)

# BM25 (keyword) retriever over the same nodes. Bound to `bm25_retriever`
# because that is the name passed to HybridRetriever further down.
bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=2)

Custom Retriever Implementation
from llama_index.retrievers import BaseRetriever


class HybridRetriever(BaseRetriever):
    """Retriever that merges the results of a BM25 and a vector retriever.

    Both wrapped retrievers are queried with the same query. Their results
    are concatenated (BM25 results first) and de-duplicated by node id, so
    each node appears at most once in the output.
    """

    def __init__(self, vector_retriever, bm25_retriever):
        """Store the two retrievers to combine and initialise the base class."""
        self.vector_retriever = vector_retriever
        self.bm25_retriever = bm25_retriever
        super().__init__()

    def _retrieve(self, query, **kwargs):
        """Return the de-duplicated union of BM25 and vector results.

        Extra keyword arguments are forwarded unchanged to both retrievers.
        """
        bm25_nodes = self.bm25_retriever.retrieve(query, **kwargs)
        vector_nodes = self.vector_retriever.retrieve(query, **kwargs)

        # combine the two lists of nodes, keeping the first occurrence of
        # every node id
        all_nodes = []
        node_ids = set()
        for n in bm25_nodes + vector_nodes:
            if n.node.node_id not in node_ids:
                all_nodes.append(n)
                node_ids.add(n.node.node_id)
        return all_nodes
# Retriever created directly from the index with similarity_top_k=5;
# note that the result is not assigned to a name here.
index.as_retriever(similarity_top_k=5)

# Combine the vector retriever and the BM25 retriever via HybridRetriever.
hybrid_retriever = HybridRetriever(vector_retriever, bm25_retriever)
You can persist these nodes through the index, and then load the same nodes again on the next run instead of regenerating them.

Plain Text
# persist the nodes (the index's docstore is written to ./storage)
index.storage_context.persist(persist_dir="./storage")

# ... later, e.g. on the next run ...

# load the persisted index; persist_dir must point at the saved data,
# otherwise an empty storage context is created and nothing is loaded
index = load_index_from_storage(
    StorageContext.from_defaults(persist_dir="./storage"),
    service_context=service_context,
)

# get nodes: docstore.docs is a dict mapping node_id -> node, so take the
# values; iterating the dict itself yields the string ids, not the nodes
nodes = list(index.docstore.docs.values())

# use them further in your retriever
let me try this
---> 10 bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=k)
11 from llama_index.response.notebook_utils import display_source_node
12 from llama_index.retrievers import BaseRetriever

File ~/anaconda3/lib/python3.9/site-packages/llama_index/retrievers/bm25_retriever.py:59, in BM25Retriever.from_defaults(cls, index, nodes, docstore, tokenizer, similarity_top_k)
54 assert (
55 nodes is not None
56 ), "Please pass exactly one of index, nodes, or docstore."
58 tokenizer = tokenizer or get_tokenizer()
---> 59 return cls(
60 nodes=nodes,
61 tokenizer=tokenizer,
62 similarity_top_k=similarity_top_k,
63 )

File ~/anaconda3/lib/python3.9/site-packages/llama_index/retrievers/bm25_retriever.py:31, in BM25Retriever.init(self, nodes, tokenizer, similarity_top_k, callback_manager)
29 self._tokenizer = tokenizer or (lambda x: x.split(" "))
30 self._similarity_top_k = similarity_top_k
---> 31 self._corpus = [self._tokenizer(node.get_content()) for node in self._nodes]
32 self.bm25 = BM25Okapi(self._corpus)
33 super().init(callback_manager)

File ~/anaconda3/lib/python3.9/site-packages/llama_index/retrievers/bm25_retriever.py:31, in <listcomp>(.0)
29 self._tokenizer = tokenizer or (lambda x: x.split(" "))
30 self._similarity_top_k = similarity_top_k
---> 31 self._corpus = [self._tokenizer(node.get_content()) for node in self._nodes]
32 self.bm25 = BM25Okapi(self._corpus)
33 super().init(callback_manager)
When I tried retrieving the nodes from the index and using them in the BM25 retriever, I got the above error:
AttributeError: 'str' object has no attribute 'get_content'
Add a reply
Sign up and join the conversation on Discord