from typing import Any

import requests

# Import paths for llama_index 0.9.x; newer releases moved these under
# llama_index.core.llms / llama_index.core.llms.callbacks.
from llama_index.llms import (
    CompletionResponse,
    CompletionResponseGen,
    CustomLLM,
    LLMMetadata,
)
from llama_index.llms.base import llm_completion_callback


class OurLLM(CustomLLM):
    context_window: int = 1200
    num_output: int = 256
    model_name: str = "custom"
    dummy_response: str = "My response"

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    def llama2_13b_gcp(self, role_user: str) -> str:
        """Send a chat request to the Llama-2-13B server and return its reply."""
        url = "http://00.000.000.000:8080/api/chat"
        headers = {
            "accept": "application/json",
            "Content-Type": "application/json",
        }
        data = {
            "messages": [
                {"role": "system", "content": "Answer the question"},
                {"role": "user", "content": role_user},
            ]
        }
        response = requests.post(url, headers=headers, json=data)
        response.raise_for_status()  # surface HTTP errors instead of failing on .json()
        return response.json()["choices"][0]["message"]["content"]

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        return CompletionResponse(text=self.llama2_13b_gcp(prompt))

    @llm_completion_callback()
    def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen:
        # The backend call is not streamed, so the full reply is yielded as a
        # single chunk rather than token by token.
        full_response = self.llama2_13b_gcp(prompt)
        yield CompletionResponse(text=full_response, delta=full_response)
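A minimal sketch of wiring OurLLM into a query pipeline, assuming the legacy ServiceContext API from llama_index 0.9.x and a hypothetical ./data directory of documents:

    from llama_index import ServiceContext, SimpleDirectoryReader, SummaryIndex

    llm = OurLLM()

    # embed_model="local" avoids the default OpenAI embeddings; it requires
    # the sentence-transformers package to be installed.
    service_context = ServiceContext.from_defaults(llm=llm, embed_model="local")

    documents = SimpleDirectoryReader("./data").load_data()  # hypothetical data dir
    index = SummaryIndex.from_documents(documents, service_context=service_context)

    query_engine = index.as_query_engine()
    print(query_engine.query("What does the document say?"))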
When the endpoint is called, the ggml-based backend fails while allocating its compute buffer and logs:

    ggml_allocr_alloc: not enough space in the buffer (needed 289444000, largest block available 27545600)

That is, the ggml allocator needs a contiguous block of about 276 MB, but the largest free block in its buffer is only about 26 MB.
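To check whether the failure is triggered by the request itself rather than by anything in the LlamaIndex layer, a direct call to the same endpoint can be used as a smoke test; this sketch reuses the redacted host and payload from the class above, and the prompt string is arbitrary:

    import requests

    url = "http://00.000.000.000:8080/api/chat"  # redacted host, as in the post
    payload = {
        "messages": [
            {"role": "system", "content": "Answer the question"},
            {"role": "user", "content": "ping"},  # any short prompt
        ]
    }

    response = requests.post(url, json=payload, timeout=60)
    response.raise_for_status()
    print(response.json()["choices"][0]["message"]["content"])

If this standalone request reproduces the allocator error in the server logs, the problem lies with the backend's buffer sizing rather than with the OurLLM wrapper.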