Could the problem be that I'm not using a pipeline? This is my custom LLM wrapper (a pipeline-based sketch follows the code below):
from typing import List, Optional

import torch
from langchain.llms.base import LLM
from transformers import GenerationConfig


class CustomLLM(LLM):
    model_name: str = "bertin"

    @property
    def _llm_type(self) -> str:
        # Required by LangChain's LLM interface
        return self.model_name

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        # Generation settings
        input = ""
        temperature = 0.1
        device = "cuda"
        top_p = 0.75
        top_k = 40
        num_beams = 4
        max_new_tokens = num_output  # num_output is defined elsewhere in the script

        # Build the full prompt and tokenize it (prompter and tokenizer are globals)
        prompt = prompter.generate_prompt(prompt, input)
        inputs = tokenizer(prompt, return_tensors="pt")
        input_ids = inputs["input_ids"].to(device)

        generation_config = GenerationConfig(
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            num_beams=num_beams,
        )

        # Generate without tracking gradients
        with torch.no_grad():
            generation_output = model.generate(
                input_ids=input_ids,
                generation_config=generation_config,
                return_dict_in_generate=True,
                output_scores=True,
                max_new_tokens=max_new_tokens,
            )

        # Decode the first returned sequence and extract the model's answer
        s = generation_output.sequences[0]
        output = tokenizer.decode(s)
        response = prompter.get_response(output)
        print(prompt)
        print(response)

        prompt_length = len(prompt)
        # response = self.pipeline(prompt, max_new_tokens=num_output)[0]["generated_text"]
        # only return the newly generated tokens
        return response[prompt_length:]
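For reference, this is a rough, untested sketch of how I think the pipeline-based route would look, reusing the same model, tokenizer, and num_output objects already loaded in my script (the device handling is an assumption on my part and depends on how the model was loaded):

from transformers import pipeline
from langchain.llms import HuggingFacePipeline

# `model`, `tokenizer` and `num_output` are the same objects used by the
# custom class above; they are assumed to be loaded elsewhere in the script.
# Depending on how `model` was loaded (.to("cuda") vs. device_map), a
# `device=0` argument may also be needed here.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=num_output,
    do_sample=True,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4,
)

llm = HuggingFacePipeline(pipeline=pipe)

One thing I'd still have to handle with this version: the Alpaca-style template from prompter.generate_prompt isn't applied by the pipeline itself, so it would need to go into a LangChain PromptTemplate (or be prepended manually) before calling llm(...).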