You can try this; I tested it on a few indexes and it seemed to work decently.
The entire setup runs locally... I suggest having a pretty beefy GPU lol
from typing import Any, List, Mapping, Optional

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms.base import LLM
from llama_index import LangchainEmbedding, LLMPredictor, PromptHelper, ServiceContext

# define prompt helper
# set maximum input size
max_input_size = 2048
# set number of output tokens
num_output = 256
# set maximum chunk overlap
max_chunk_overlap = 20

# load the model and tokenizer locally (fp16, spread across available GPUs)
model_name = "Writer/camel-5b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)
PROMPT_TEMPLATE = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Response:"
)
class CustomLLM(LLM):
    model_name = "Writer/camel-5b-hf"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        # wrap the incoming prompt in the instruction template
        prompt = prompt.strip()
        text = PROMPT_TEMPLATE.format(instruction=prompt)
        model_inputs = tokenizer(
            text, return_tensors="pt", max_length=max_input_size, truncation=True
        ).to("cuda")
        output_ids = model.generate(**model_inputs, max_new_tokens=num_output)  # , temperature=0, do_sample=True)
        output_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
        # keep only the text generated after the "### Response:" marker
        clean_output = output_text.split("### Response:")[1].strip()
        return clean_output

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        return {"name_of_model": self.model_name}

    @property
    def _llm_type(self) -> str:
        return "custom"
llm_predictor = LLMPredictor(llm=CustomLLM())
embed_model = LangchainEmbedding(HuggingFaceEmbeddings())
prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap, chunk_size_limit=512)
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor,
    embed_model=embed_model,
    prompt_helper=prompt_helper,
    chunk_size_limit=512,
)
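From there you just pass the service_context into whatever index you're building. Here's a rough usage sketch; the ./data directory and the query string are placeholders, and the exact index class and query call depend on which llama_index version you're on (this assumes the GPTVectorStoreIndex-style API):

# hypothetical usage: build a local index and query it with the custom LLM above
from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader

documents = SimpleDirectoryReader("./data").load_data()  # "./data" is a placeholder path
index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context)

query_engine = index.as_query_engine()
response = query_engine.query("Your question about the documents here")  # placeholder query
print(response)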