I changed my function locally to test how supplying `max_tokens=NUMBER` per call would work, and it now behaves as I'd expect:
```python
@llm_completion_callback()
def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
    self.generate_kwargs.update({"stream": False})

    is_formatted = kwargs.pop("formatted", False)
    if not is_formatted:
        prompt = self.completion_to_prompt(prompt)

    # I want to decide at inference time how many tokens to generate,
    # so create a local copy by merging self.generate_kwargs and kwargs
    local_generate_kwargs = {**self.generate_kwargs, **kwargs}
    print(local_generate_kwargs)

    response = self._model(prompt=prompt, **local_generate_kwargs)
    return CompletionResponse(text=response["choices"][0]["text"], raw=response)
```
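For reference, this is roughly how I exercise the per-call override (a minimal sketch; `llm` is assumed to be an instance of my patched class and the prompt is just an example):

```python
# Hypothetical usage: max_tokens passed here ends up in local_generate_kwargs
# and overrides whatever is set in self.generate_kwargs for this call only.
response = llm.complete("Summarize the plot of Hamlet.", max_tokens=64)
print(response.text)
```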
I'm just wondering if this is a bug or if I'm missing something obvious.