`pad_token`
`(tokenizer.pad_token = tokenizer.eos_token e.g.)`
or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`
# Build a Chroma-backed vector index over the files in "data", using a single
# local checkpoint for both text generation and embeddings.
# NOTE(review): the imports for HuggingFaceLLM, HuggingFaceEmbedding, chromadb,
# ChromaVectorStore, StorageContext, ServiceContext, SimpleDirectoryReader and
# VectorStoreIndex are not visible in this excerpt.
# Filesystem path of the checkpoint; reused below as the LLM, the embedding
# model and the embedding tokenizer.
local_model = '/ai/Mistral-7B-v0.1'
llm = HuggingFaceLLM(model_name=local_model)
# NOTE(review): the same checkpoint is passed as both embedding model and
# tokenizer. Presumably this tokenizer ships without a pad token, which would
# make padded batches fail -- confirm, and configure one if so.
embed_model = HuggingFaceEmbedding(model_name=local_model, tokenizer_name=local_model)
# Chroma client created with default arguments, and the "quickstart" collection
# that will hold the vectors.
# NOTE(review): create_collection presumably raises when "quickstart" already
# exists from an earlier run of a persistent client -- confirm whether
# get_or_create_collection is what is wanted here.
chroma_client = chromadb.PersistentClient()
chroma_collection = chroma_client.create_collection("quickstart")
# Wrap the collection so it can be used as the index's vector store.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# Bundle the LLM and embedding model that the index build is given.
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
# Load everything SimpleDirectoryReader finds under the relative "data" path.
documents = SimpleDirectoryReader("data").load_data()
# Build the index from the loaded documents. The returned index is not bound
# to a name, so only what this call writes through storage_context is kept.
VectorStoreIndex.from_documents(documents, storage_context=storage_context, service_context=service_context)
tokenizer = <your tokenizer>
<configure pad token>
llm = HuggingFaceLLM(tokenizer=tokenizer, ...)
embed_model = HuggingFaceEmbedding(tokenizer=tokenizer, ...)
File "...site-packages/torch/nn/functional.py", line 2233, in embedding
return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
IndexError: index out of range in self
embed_model="local"
or embed_model="BAAI/bge-small-en-v1.5"
works
model.save_pretrained("./path/to/save")
?