model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_K_M.gguf"
llm = LlamaCPP(
    # You can pass in the URL to a GGUF model to download it automatically
    model_url=model_url,
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0,
    max_new_tokens=512,
    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={"top_k": 50, "top_p": 0.95},
    # kwargs to pass to __init__()
    # set n_gpu_layers to at least 1 to use the GPU
    model_kwargs={"n_gpu_layers": 40},
    # transform inputs into Llama 2 chat format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
node_parser = SimpleNodeParser.from_defaults(chunk_size=512)
# create a service context
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
    node_parser=node_parser,
)
# set up query engine
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
query_engine = index.as_query_engine(
    streaming=False,
    text_qa_template=text_qa_template,
    similarity_top_k=3,
    response_mode="compact",
)
response = query_engine.query(question)
display_response(response)
Maybe try lowering the context_window a bit, to maybe 3700? The token counting might be a little inaccurate.
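For example, a minimal sketch of that tweak, assuming the same setup and helpers as above, only changes the context_window argument:

# Sketch: same LlamaCPP setup as above, with a lowered context_window
# to leave extra headroom for the prompt template and token-count drift.
llm = LlamaCPP(
    model_url=model_url,
    temperature=0,
    max_new_tokens=512,
    context_window=3700,  # was 3900; a bit more safety margin
    generate_kwargs={"top_k": 50, "top_p": 0.95},
    model_kwargs={"n_gpu_layers": 40},
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)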