    service_context = ServiceContext.from_defaults(
  File "/usr/local/lib/python3.10/dist-packages/llama_index/service_context.py", line 184, in from_defaults
    llm_metadata=llm_predictor.metadata,
  File "/usr/local/lib/python3.10/dist-packages/llama_index/llm_predictor/base.py", line 148, in metadata
    return self._llm.metadata
AttributeError: 'OpenAI' object has no attribute 'metadata'
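For context: ServiceContext.from_defaults expects a LlamaIndex LLM object and reads its .metadata attribute, which the raw openai.OpenAI client does not have. A minimal sketch of the kind of object it expects (assuming llama_index 0.9.x and the same endpoint as above):

# Sketch only: wrap the endpoint in a LlamaIndex LLM class instead of passing the
# raw openai.OpenAI client, since ServiceContext reads llm.metadata internally.
from llama_index.llms import OpenAILike

llm = OpenAILike(
    model="mistralai/Mistral-7B-Instruct-v0.1",
    api_base="http://172.20.0.3:8000/v1",  # the vLLM OpenAI-compatible endpoint used above
    api_key="EMPTY",
)
print(llm.metadata)  # the attribute the raw client is missing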
from llama_index.llms.vllm import Vllm

vllm = Vllm(api_url=openai_api_base, model=model)

service_context = ServiceContext.from_defaults(
    # llm = client,
    llm=vllm,
    embed_model=embed_model,
)
INFO 12-17 17:57:43 llm_engine.py:73] Initializing an LLM engine with config: model='mistralai/Mistral-7B-Instruct-v0.1', tokenizer='mistralai/Mistral-7B-Instruct-v0.1', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, quantization=None, seed=0)
INFO 12-17 17:58:17 llm_engine.py:222] # GPU blocks: 0, # CPU blocks: 2048
Traceback (most recent call last):
  File "/home/Josh-ee_Llama_RAG/vllm-openai.py", line 35, in <module>
    vllm =Vllm(api_url=openai_api_base, model=model)
  File "/usr/local/lib/python3.10/dist-packages/llama_index/llms/vllm.py", line 158, in __init__
    self._client = VLLModel(
  File "/usr/local/lib/python3.10/dist-packages/vllm/entrypoints/llm.py", line 93, in __init__
    self.llm_engine = LLMEngine.from_engine_args(engine_args)
  File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 246, in from_engine_args
    engine = cls(*engine_configs,
  File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 112, in __init__
    self._init_cache()
  File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 226, in _init_cache
    raise ValueError("No available memory for the cache blocks. "
ValueError: No available memory for the cache blocks. Try increasing `gpu_memory_utilization` when initializing the engine.
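As the traceback shows, the Vllm class spins up an in-process vLLM engine rather than calling the api_url, so it competes for GPU memory with the API server that is already running (hence "# GPU blocks: 0"). Only if an in-process engine were actually wanted, with no server holding the GPU, would raising gpu_memory_utilization help; a rough sketch with illustrative values:

# Illustrative sketch of an in-process vLLM engine (no separate API server running);
# gpu_memory_utilization is the knob the error message refers to.
from vllm import LLM, SamplingParams

local_llm = LLM(
    model="mistralai/Mistral-7B-Instruct-v0.1",
    trust_remote_code=True,
    gpu_memory_utilization=0.90,  # fraction of GPU memory vLLM may claim for weights + KV cache
)
outputs = local_llm.generate(["Hello"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)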
    response = chat_engine.chat("What did Paul Graham do growing up")
  File "/usr/local/lib/python3.10/dist-packages/llama_index/callbacks/utils.py", line 39, in wrapper
    return func(self, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/llama_index/chat_engine/condense_plus_context.py", line 283, in chat
    chat_response = self._llm.chat(chat_messages)
  File "/usr/local/lib/python3.10/dist-packages/llama_index/llms/base.py", line 97, in wrapped_llm_chat
    f_return_val = f(_self, messages, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/llama_index/llms/vllm.py", line 225, in chat
    completion_response = self.complete(prompt, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/llama_index/llms/base.py", line 223, in wrapped_llm_predict
    f_return_val = f(_self, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/llama_index/llms/vllm.py", line 350, in complete
    output = get_response(response)
  File "/usr/local/lib/python3.10/dist-packages/llama_index/llms/vllm_utils.py", line 9, in get_response
    return data["text"]
KeyError: 'text'
from openai import OpenAI

openai_api_key = "EMPTY"
openai_api_base = "http://172.20.0.3:8000/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

models = client.models.list()
model = models.data[0].id
print(model)

from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model_name = 'BAAI/bge-small-en-v1.5'
embed_model = HuggingFaceEmbedding(
    model_name=embed_model_name,
    device='cuda',
    normalize=True,
)

from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms.vllm import VllmServer

vllm = VllmServer(api_url=openai_api_base + "/chat/completions", model="mistralai/Mistral-7B-Instruct-v0.1")

service_context = ServiceContext.from_defaults(
    llm=vllm,
    embed_model=embed_model,
)

path = '/RAG_VectorDB/test/'
data = SimpleDirectoryReader(input_dir=path).load_data()
index = VectorStoreIndex.from_documents(data, service_context=service_context)

from llama_index.memory import ChatMemoryBuffer

memory = ChatMemoryBuffer.from_defaults(token_limit=3900)

chat_engine = index.as_chat_engine(
    chat_mode="condense_plus_context",
    memory=memory,
    context_prompt=(
        "You are a chatbot, able to have normal interactions, as well as talk"
        " about an essay discussing Paul Graham's life."
        " Here are the relevant documents for the context:\n"
        "{context_str}"
        "\nInstruction: Use the previous chat history, or the context above, to interact and help the user."
    ),
    verbose=False,
)

response = chat_engine.chat("What did Paul Graham do growing up")
print(response)
Traceback (most recent call last):
  File "/home/Josh-ee_Llama_RAG/vllm-openai.py", line 54, in <module>
    response = chat_engine.chat("What did Paul Graham do growing up")
  File "/usr/local/lib/python3.10/dist-packages/llama_index/callbacks/utils.py", line 39, in wrapper
    return func(self, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/llama_index/chat_engine/condense_plus_context.py", line 283, in chat
    chat_response = self._llm.chat(chat_messages)
  File "/usr/local/lib/python3.10/dist-packages/llama_index/llms/base.py", line 97, in wrapped_llm_chat
    f_return_val = f(_self, messages, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/llama_index/llms/vllm.py", line 225, in chat
    completion_response = self.complete(prompt, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/llama_index/llms/base.py", line 223, in wrapped_llm_predict
    f_return_val = f(_self, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/llama_index/llms/vllm.py", line 350, in complete
    output = get_response(response)
  File "/usr/local/lib/python3.10/dist-packages/llama_index/llms/vllm_utils.py", line 9, in get_response
    return data["text"]
KeyError: 'text'
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/chainlit/utils.py", line 39, in wrapper
    return await user_function(**params_values)
  File "/home/Josh-ee_Llama_RAG/test-gpu.py", line 170, in main
    for token in response.response_gen:
  File "/usr/local/lib/python3.10/dist-packages/llama_index/llms/llm.py", line 46, in gen
    for response in completion_response_gen:
  File "/usr/local/lib/python3.10/dist-packages/llama_index/llms/base.py", line 228, in wrapped_gen
    for x in f_return_val:
  File "/usr/local/lib/python3.10/dist-packages/llama_index/llms/vllm.py", line 373, in gen
    yield CompletionResponse(text=data["text"][0])
KeyError: 'text'
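The KeyError comes from a response-format mismatch: VllmServer parses the JSON produced by vLLM's native api_server (a top-level "text" field), while the OpenAI-compatible server returns the usual choices-based schema, so get_response finds no "text" key. A quick way to confirm what the endpoint actually returns (a sketch reusing the base URL from the script above):

# Sketch: inspect the raw JSON from the OpenAI-compatible endpoint to see why
# llama_index.llms.vllm_utils.get_response's data["text"] lookup fails.
import requests

resp = requests.post(
    "http://172.20.0.3:8000/v1/chat/completions",
    json={
        "model": "mistralai/Mistral-7B-Instruct-v0.1",
        "messages": [{"role": "user", "content": "Hello"}],
        "max_tokens": 16,
    },
)
print(resp.json().keys())  # expect 'choices', 'usage', ... but no 'text' key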
Try the OpenAILike class instead, which works much better with the OpenAI-compatible server started as:

python -m vllm.entrypoints.openai.api_server --model "mistralai/Mistral-7B-Instruct-v0.1" --trust-remote-code
from llama_index.llms import OpenAILike
from llama_index.prompts import PromptTemplate

llm = OpenAILike(
    model="mistralai/Mistral-7B-Instruct-v0.1",
    api_base="http://localhost:8000/v1",
    api_key="fake",
    api_type="fake",
    max_tokens=256,
    temperature=0.5,
    query_wrapper_prompt=PromptTemplate("<s>[INST] {query_str} [/INST] </s>\n"),
)
You can also customize the prompt formatting through the messages_to_prompt function callback hook. There are some examples in the notebooks here for these settings with different LLMs.
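For illustration, a messages_to_prompt callback for Mistral-style [INST] formatting might look roughly like the sketch below; the template details are an assumption and should be adjusted to the model's chat format, and it assumes this llama_index version accepts messages_to_prompt on the LLM constructor, as noted above:

from llama_index.llms import OpenAILike

# Hedged sketch: flatten the chat messages into a Mistral [INST]-style prompt.
# The exact template here is illustrative, not canonical.
def messages_to_prompt(messages):
    prompt = ""
    for message in messages:
        if message.role == "assistant":
            prompt += f"{message.content}</s>\n"
        else:  # system and user turns
            prompt += f"<s>[INST] {message.content} [/INST]\n"
    return prompt

llm = OpenAILike(
    model="mistralai/Mistral-7B-Instruct-v0.1",
    api_base="http://localhost:8000/v1",
    api_key="fake",
    max_tokens=256,
    messages_to_prompt=messages_to_prompt,  # assumed to be accepted by the constructor in this version
)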