If you already have a model downloaded locally, you can point LlamaCPP at it via model_path instead of model_url:
from llama_index import ServiceContext
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt

llm = LlamaCPP(
    # you can pass in the URL to a GGML model to download it automatically;
    # leave it as None when loading a local file
    model_url=None,
    # optionally, set the path to a pre-downloaded model instead of model_url
    model_path="path/to/my/model",
    temperature=0.1,
    max_new_tokens=256,
    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__(); set n_gpu_layers to at least 1 to use the GPU
    model_kwargs={"n_gpu_layers": 1},
    # transform inputs into the Llama 2 prompt format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)
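Before wiring the model into an index, it is worth a quick sanity check that the weights load and generate. A minimal sketch; the prompt here is just a placeholder:

# quick smoke test: run a single completion through the local model
resp = llm.complete("Hello! Can you tell me a joke?")
print(resp.text)

If that prints a completion, the model is working and can be wired into a service context: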
service_context = ServiceContext.from_defaults(llm=llm)
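Note that ServiceContext.from_defaults still uses OpenAI embeddings by default, so queries against the index will call the OpenAI API unless you also pick a local embedding model. A minimal sketch, assuming a legacy llama_index release that supports the "local" embed_model shorthand (it downloads a small HuggingFace embedding model on first use):

# fully local alternative: local LLM plus a local embedding model
# (assumes the legacy "local" embed_model shorthand is available)
service_context = ServiceContext.from_defaults(llm=llm, embed_model="local")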
To reload a previously persisted index so that it runs against this local LLM, pass the service context when loading:

from llama_index import StorageContext, load_index_from_storage
storage_context = StorageContext.from_defaults(persist_dir=r"C:\Users\erraballiv\PycharmProjects\Ll-index-ex1")
# NOTE: pass in the service context so the loaded index uses the local LLM
loaded_index = load_index_from_storage(storage_context, service_context=service_context)
query_engine = loaded_index.as_query_engine()
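From here you can query the index as usual; the question below is only a placeholder for your own data:

# run a query through the reloaded index; the local LlamaCPP model generates the answer
response = query_engine.query("Summarize the indexed documents.")
print(response)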