I do this:
# https://github.com/abetlen/llama-cpp-python
# GPU llama-cpp-python
# note: llama-cpp-python 0.1.78 predates GGUF support; loading a .gguf model like the one below generally needs >= 0.1.79
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python==0.1.78 --force-reinstall --upgrade --no-cache-dir --verbose
# https://github.com/run-llama/llama_index
!pip install llama-index
import logging
import sys
from llama_index.callbacks import CallbackManager, LlamaDebugHandler
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt
from llama_index.llms import LlamaCPP
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)  # use logging.INFO for less verbose output
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
llama_debug = LlamaDebugHandler(print_trace_on_end=True)
callback_manager = CallbackManager([llama_debug])
# https://gpt-index.readthedocs.io/en/stable/examples/llm/llama_2_llama_cpp.html
llm = LlamaCPP(
    model_url="https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_K_M.gguf",
    # optionally, set the path to a pre-downloaded model instead of model_url
    # model_path="mistral-7b-v0.1.Q4_K_M.gguf",
    temperature=0.0,
    max_new_tokens=1024,
    # Mistral-7B supports a longer context, but the llama_index example keeps this
    # below 4096 tokens to leave some wiggle room
    context_window=3900,  # this sets n_ctx in model_kwargs, so you don't need to pass it there
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # set n_gpu_layers to at least 1 to use the GPU
    model_kwargs={"n_gpu_layers": 1},
    # transform inputs into the Llama 2 chat prompt format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)
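Then, to check that the model actually loads and generates, I run a quick completion as in the llama_index docs (the prompt text is just an example):

# quick smoke test; any prompt will do
response = llm.complete("Hello! Can you tell me a poem about cats and dogs?")
print(response.text)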