Hey guys, I'm trying to figure out how to show realtime events (thought events, like "Thinking", "Retrieving reviews", etc.) alongside token streaming. How can I stream both the tokens and the realtime thought process of ReActAgent.from_tools?
I'm not sure how to pull this off, so any direction would be helpful! My current setup is below, plus a rough idea I'm considering after the endpoint code.
from fastapi.responses import StreamingResponse
from llama_index.core import Settings, VectorStoreIndex
from llama_index.core.agent import ReActAgent
from llama_index.core.tools import QueryEngineTool
from llama_index.llms.vertex import Vertex
from llama_index.vector_stores.pinecone import PineconeVectorStore

# embed_model, pinecone_index, DETAIL_NAMESPACE, app, and ChatRequest are defined elsewhere

vertex_gemini = Vertex(
    model="gemini-1.5-pro",
    context_window=100000,
    temperature=0,
    additional_kwargs={},
)
# use the Vertex Gemini model as the LLM for completions
Settings.llm = vertex_gemini
# use the embedding model configured elsewhere for the index/query process
Settings.embed_model = embed_model
# Create a Pinecone-based vector store for the detail (full review text) namespace
detail_vector_store = PineconeVectorStore(
    pinecone_index=pinecone_index,
    namespace=DETAIL_NAMESPACE,
)
# from_vector_store builds its own storage context from the vector store,
# so a separate StorageContext isn't needed here
vector_index = VectorStoreIndex.from_vector_store(vector_store=detail_vector_store)
print("VectorStoreIndex (detail) created.")
detail_query_engine = vector_index.as_query_engine()
detail_tool = QueryEngineTool.from_defaults(
    query_engine=detail_query_engine,
    name="detail_vector_index",
    description="Use this to answer questions about the full text of the reviews.",
)
agent = ReActAgent.from_tools(tools=[detail_tool], verbose=True)
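From what I understand, verbose=True only prints the Thought / Action / Observation steps to stdout, so my guess is that I need to attach a callback handler to actually capture them as events. Here's a rough sketch of what I had in mind, just for tool calls to start with. The QueueCallbackHandler class and the {"type": "thought", ...} event shape are my own invention, and I'm not 100% sure I have the FUNCTION_CALL payload keys right:

import queue

from llama_index.core.callbacks import CallbackManager
from llama_index.core.callbacks.base_handler import BaseCallbackHandler
from llama_index.core.callbacks.schema import CBEventType, EventPayload


class QueueCallbackHandler(BaseCallbackHandler):
    """Push agent/tool events onto a queue so the HTTP layer can stream them."""

    def __init__(self, event_queue: "queue.Queue[dict]") -> None:
        super().__init__(event_starts_to_ignore=[], event_ends_to_ignore=[])
        self._queue = event_queue

    def on_event_start(self, event_type, payload=None, event_id="", parent_id="", **kwargs):
        # Emit a "thought" event whenever the agent starts a tool call
        if event_type == CBEventType.FUNCTION_CALL and payload is not None:
            tool_metadata = payload.get(EventPayload.TOOL)
            tool_name = getattr(tool_metadata, "name", "a tool")
            self._queue.put({"type": "thought", "text": f"Calling {tool_name}..."})
        return event_id

    def on_event_end(self, event_type, payload=None, event_id="", **kwargs):
        pass

    def start_trace(self, trace_id=None):
        pass

    def end_trace(self, trace_id=None, trace_map=None):
        pass

I'm assuming I could listen for other event types (LLM calls, retrieval, etc.) the same way. My existing /chat-stream endpoint, which only streams the answer tokens, is below.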
@app.post("/chat-stream")
def chat_endpoint(chat_request: ChatRequest):
user_query = chat_request.query
def token_generator():
response = agent.stream_chat(user_query)
for delta in response.response_gen:
print(delta)
yield delta
return StreamingResponse(token_generator(), media_type="text/plain")
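The part I can't figure out is interleaving the thought events with the tokens. My rough idea, and I have no clue if this is the right direction, is to build a per-request agent with the queue-backed handler, run stream_chat in a background thread, forward the queued thought events as they arrive, and then stream the answer tokens once stream_chat returns. The /chat-stream-events route, the JSON event shape, and the "__done__" sentinel below are all placeholders I made up:

import json
import threading


@app.post("/chat-stream-events")
def chat_events_endpoint(chat_request: ChatRequest):
    user_query = chat_request.query

    def event_generator():
        event_queue: "queue.Queue[dict]" = queue.Queue()
        handler = QueueCallbackHandler(event_queue)
        # per-request agent so the callback queue isn't shared between requests
        request_agent = ReActAgent.from_tools(
            tools=[detail_tool],
            callback_manager=CallbackManager([handler]),
            verbose=True,
        )

        result = {}

        def run_agent():
            # stream_chat blocks while the agent reasons and calls tools,
            # then returns a streaming response for the final answer
            result["response"] = request_agent.stream_chat(user_query)
            event_queue.put({"type": "__done__"})

        threading.Thread(target=run_agent, daemon=True).start()

        # 1) forward thought events as the callback handler produces them
        while True:
            event = event_queue.get()
            if event.get("type") == "__done__":
                break
            yield f"data: {json.dumps(event)}\n\n"

        # 2) then stream the final answer token by token
        for delta in result["response"].response_gen:
            yield f"data: {json.dumps({'type': 'token', 'text': delta})}\n\n"

    return StreamingResponse(event_generator(), media_type="text/event-stream")

Does something like this seem sane, or is there a more built-in way to do it (maybe with the step-wise create_task / run_step API)?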