I do this:
# https://github.com/abetlen/llama-cpp-python
# GPU llama-cpp-python
# note: llama-cpp-python 0.1.78 predates GGUF support; loading a .gguf model like the one below generally needs >= 0.1.79
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python==0.1.78 --force-reinstall --upgrade --no-cache-dir --verbose
# https://github.com/run-llama/llama_index
!pip install llama-index
import logging
import sys
from llama_index.callbacks import CallbackManager, LlamaDebugHandler
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt
from llama_index.llms import LlamaCPP
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)  # use logging.INFO for less verbose output
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
llama_debug = LlamaDebugHandler(print_trace_on_end=True)
callback_manager = CallbackManager([llama_debug])
# https://gpt-index.readthedocs.io/en/stable/examples/llm/llama_2_llama_cpp.html
llm = LlamaCPP(
    model_url="https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_K_M.gguf",
    # optionally, set the path to a pre-downloaded model instead of model_url
    # model_path="mistral-7b-v0.1.Q4_K_M.gguf",
    temperature=0.0,
    max_new_tokens=1024,
    # Mistral-7B supports a longer context, but the llama_index example keeps this
    # below 4096 tokens to leave some wiggle room
    context_window=3900,  # this sets n_ctx in model_kwargs, so you don't need to pass it there
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # set n_gpu_layers to at least 1 to use the GPU
    model_kwargs={"n_gpu_layers": 1},
    # transform inputs into the Llama 2 chat prompt format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)
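Then, to check that the model actually loads and generates, I run a quick completion as in the llama_index docs (the prompt text is just an example):

# quick smoke test; any prompt will do
response = llm.complete("Hello! Can you tell me a poem about cats and dogs?")
print(response.text)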