But just in case, this is how I did it last time, using a very old version of LlamaIndex:
import os

import torch
import transformers
from transformers import (
    StoppingCriteria,
    StoppingCriteriaList,
    TextStreamer,
)
from llama_index.llms.huggingface import HuggingFaceLLM
from langchain_community.llms import HuggingFacePipeline
from dotenv import load_dotenv, find_dotenv

# Load the Hugging Face token from a .env file
_ = load_dotenv(find_dotenv())
hf_token = os.getenv("HUGGINGFACE_API_TOKEN")
# model_id is your model's Hugging Face Hub repo id (or a local path)
model_config = transformers.AutoConfig.from_pretrained(model_id, use_auth_token=hf_token)
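# Not part of the original answer: a minimal BitsAndBytesConfig sketch, assuming you want
# 4-bit NF4 quantization; pass it as quantization_config below, or substitute your own.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)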
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=<your bnb config>,  # your BitsAndBytesConfig
    device_map=device,                      # e.g. "auto" or "cuda:0"
    use_auth_token=hf_token,
    cache_dir="path to downloaded model",
)
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_token,
    cache_dir="path to downloaded model",
)
llm = HuggingFaceLLM(context_window=..., max_new_tokens=..., system_prompt=..., model=model, tokenizer=tokenizer)
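# For illustration only (not from the original answer), typical values might look like:
#   llm = HuggingFaceLLM(context_window=4096, max_new_tokens=256,
#                        system_prompt="You are a helpful assistant.",
#                        model=model, tokenizer=tokenizer)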
## Wrap into a LangChain pipeline
stop_list = ['\n`\n']
stop_token_ids = [tokenizer(x)['input_ids'] for x in stop_list]
stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # Stop generation as soon as the latest tokens match any stop sequence
        for stop_ids in stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
                return True
        return False
stopping_criteria = StoppingCriteriaList([StopOnTokens()])
streamer = TextStreamer(tokenizer, skip_prompt=True)
pipeline = transformers.pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    stopping_criteria=stopping_criteria,
    streamer=streamer,
    temperature=temperature,  # your sampling temperature
    repetition_penalty=1.1,
)
langchain_pipeline = HuggingFacePipeline(pipeline=pipeline)
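# Hedged usage sketch, not in the original answer: querying the wrapped pipeline.
# langchain_community's HuggingFacePipeline supports the Runnable interface, so .invoke() should work.
response = langchain_pipeline.invoke("Summarize what this model setup does in one sentence.")
print(response)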