Hi everyone, I'm having some issues implementing RAG with LlamaIndex on top of my custom Llama 2 LLM (a llama.cpp model deployed on GCP in Docker behind a Flask API).
The main problem is that LlamaIndex makes multiple requests to my API for a single query, which increases the response time a lot compared to a single direct request.
Can anyone help with some guidance?
I'm following this example in the documentation (Example: Using a Custom LLM Model - Advanced):
https://docs.llamaindex.ai/en/stable/module_guides/models/llms/usage_custom/#example-using-a-custom-llm-model---advanced

from typing import Any

import requests
from llama_index.core.llms import (
    CompletionResponse,
    CompletionResponseGen,
    CustomLLM,
    LLMMetadata,
)
from llama_index.core.llms.callbacks import llm_completion_callback


class OurLLM(CustomLLM):
    context_window: int = 1200
    num_output: int = 256
    model_name: str = "custom"
    dummy_response: str = "My response"

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    def llama2_13b_gcp(self, role_user):
        """Send one chat request to the Llama 2 13B endpoint deployed on GCP."""
        url = "http://00.000.000.000:8080/api/chat"
        headers = {
            "accept": "application/json",
            "Content-Type": "application/json",
        }
        data = {
            "messages": [
                {"role": "system", "content": "Answer the question"},
                {"role": "user", "content": role_user},
            ]
        }
        response = requests.post(url, headers=headers, json=data)
        resp = response.json()['choices'][0]['message']['content']
        return resp

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        self.dummy_response = self.llama2_13b_gcp(prompt)
        return CompletionResponse(text=self.dummy_response)

    @llm_completion_callback()
    def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen:
        # Not real streaming: the full response is fetched in one call and yielded once.
        full_response = self.llama2_13b_gcp(prompt)
        yield CompletionResponse(text=full_response)
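
For reference, this is roughly how I wire the custom LLM into the RAG pipeline (a minimal sketch: the data folder, the embedding setup, and the question are placeholders, not my exact code):

from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex

# Use the custom LLM for all completions (the embedding model is configured separately).
Settings.llm = OurLLM()

# Placeholder corpus -- my real documents live elsewhere.
documents = SimpleDirectoryReader("data").load_data()
index = VectorStoreIndex.from_documents(documents)

query_engine = index.as_query_engine()
response = query_engine.query("A question about my documents")
print(response)

Calling llama2_13b_gcp directly is a single POST to the Flask API, but one query_engine.query call ends up sending several of them, and that's where the extra response time comes from.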