Hi all, I'm trying to use the new Command R 4-bit model (CohereForAI/c4ai-command-r-v01-4bit) with LlamaIndex. The model runs fine on my machine using plain Transformers code from HF, but when I wrap it in LlamaIndex it fails with what looks like an out-of-memory issue.
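For reference, the plain Transformers code I mean is roughly this (a sketch along the lines of the HF model card example, not my exact script; the chat-template and generate calls are standard HF API):

from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "CohereForAI/c4ai-command-r-v01-4bit"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# the checkpoint is already quantized with bitsandbytes, so no extra quantization config here
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

messages = [{"role": "user", "content": "Hello, how are you?"}]
input_ids = tokenizer.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

gen_tokens = model.generate(input_ids, max_new_tokens=100, do_sample=True, temperature=0.3)
print(tokenizer.decode(gen_tokens[0]))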
Here is my LlamaIndex code:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import Settings
from llama_index.core import PromptTemplate
import torch
# This will wrap the default prompts that are internal to llama-index
query_wrapper_prompt = PromptTemplate("<BOS_TOKEN><|START_OF_TURN_TOKEN|><|USER_TOKEN|>{query_str}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>")

llm = HuggingFaceLLM(
    context_window=16384,
    max_new_tokens=4096,
    generate_kwargs={"temperature": 0.7, "do_sample": True},
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="CohereForAI/c4ai-command-r-v01-4bit",
    model_name="CohereForAI/c4ai-command-r-v01-4bit",
    device_map="auto",
    # tokenizer_kwargs={"max_length": 4096},
    # uncomment this if using CUDA to reduce memory usage
    # model_kwargs={"torch_dtype": torch.float16},
)
Settings.llm = llm
Settings.chunk_size = 1024
documents = SimpleDirectoryReader('data').load_data()
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()
print(query_engine.query("Could you summarize the given context in 3 paragraphs? Return your response which covers the key points of the text and does not miss anything important, please."))
And this is the error message I get:
ValueError:
Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the
quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules
in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom `device_map` to
`from_pretrained`.
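From reading that, I think I need to pass the fp32 CPU offload option and possibly a custom device_map through to from_pretrained. Below is a minimal sketch of how I'd try to forward that via model_kwargs, assuming HuggingFaceLLM hands model_kwargs straight to AutoModelForCausalLM.from_pretrained. Note the flag I can find in BitsAndBytesConfig is llm_int8_enable_fp32_cpu_offload rather than the load_in_8bit_fp32_cpu_offload the error names, and I'm not sure any of this is valid for an already-quantized 4-bit checkpoint. Is this the right approach, or do I simply need to free up more GPU RAM (e.g. by putting the embedding model on CPU)?

import torch
from transformers import BitsAndBytesConfig

# Sketch only: allow modules that don't fit on the GPU to be kept on CPU in fp32.
# The flag names are my assumption from the BitsAndBytesConfig docs; I haven't
# verified this combination against this checkpoint.
offload_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    llm_int8_enable_fp32_cpu_offload=True,
)

llm = HuggingFaceLLM(
    context_window=16384,
    max_new_tokens=4096,
    generate_kwargs={"temperature": 0.7, "do_sample": True},
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="CohereForAI/c4ai-command-r-v01-4bit",
    model_name="CohereForAI/c4ai-command-r-v01-4bit",
    # the error suggests a custom device_map may be needed instead of "auto";
    # keeping "auto" here since I don't know the right per-layer map for this model
    device_map="auto",
    model_kwargs={"quantization_config": offload_config},
)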