Hi guys, some time ago I was using llama_index with vLLM and a chat engine with streaming, and it worked. But when I recently tried it again, I am getting "stream_chat method not implemented".
Was I doing something wrong, or did it recently stop working after some update?
Here is the code that I am using:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.vllm import Vllm

# Load the documents and build an in-memory index with a local HF embedding model
data = SimpleDirectoryReader(input_dir="/workspace/data/").load_data()
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-mpnet-base-v2")
index = VectorStoreIndex.from_documents(data, embed_model=embed_model)

# vLLM running in-process, loading a local model path
llm = Vllm(
    model="/tmp/Qwen/1_5B",
    tensor_parallel_size=1,
    max_new_tokens=100,
    vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.25},
)
memory = ChatMemoryBuffer.from_defaults(token_limit=1500, llm=llm)
chat_engine = index.as_chat_engine(
    llm=llm,
    streaming=True,
    chat_mode="context",
    memory=memory,
    system_prompt=(
        "You are a chatbot, able to have normal interactions, as well as talk"
        " about an essay discussing Paul Graham's life."
    ),
)
# Non-streaming chat, for comparison:
# response = chat_engine.chat("Hello! Who are you?")
# print(response)

# This is the call that raises "stream_chat method not implemented":
response = chat_engine.stream_chat("What did Paul Graham do after YC?")
for token in response.response_gen:
    print(token, end="")
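If it helps narrow this down: as far as I understand, the context chat engine just delegates to the LLM's own stream_chat, so I would expect the same error when calling the Vllm object directly, with no index or chat engine involved. A minimal sketch of what I mean (untested, reusing the llm from above):

from llama_index.core.llms import ChatMessage

# Call stream_chat on the vLLM-backed LLM directly. If the Vllm class
# doesn't implement streaming, this should raise the same
# "stream_chat method not implemented" error as the chat engine call.
stream = llm.stream_chat([ChatMessage(role="user", content="Hello, who are you?")])
for chunk in stream:
    print(chunk.delta, end="")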
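Has anyone gotten streaming to work with vLLM here, for example by launching vLLM as a separate API server and using VllmServer instead of the in-process Vllm class? Something like the sketch below (I haven't verified this; the api_url is my assumption based on vLLM's default server port and endpoint):

from llama_index.llms.vllm import VllmServer

# Hypothetical alternative: point llama_index at a separately launched
# vLLM API server instead of running vLLM in-process. The URL below is
# an assumption, not something I've tested.
llm = VllmServer(api_url="http://localhost:8000/generate", max_new_tokens=100)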