from llama_index import ListIndex, ServiceContext
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import completion_to_prompt, messages_to_prompt


def model(documents):
    llm = LlamaCPP(
        # optionally, you can set the path to a pre-downloaded model instead of model_url
        model_path="models/mistral7b",
        temperature=0.1,
        max_new_tokens=256,
        # llama2 has a context window of 4096 tokens; we set it lower to allow for some wiggle room
        context_window=3900,
        # kwargs to pass to __call__()
        generate_kwargs={},
        # kwargs to pass to __init__()
        # set to at least 1 to use the GPU; -1 offloads all layers
        model_kwargs={"n_gpu_layers": -1},
        # transform inputs into Llama2 format
        messages_to_prompt=messages_to_prompt,
        completion_to_prompt=completion_to_prompt,
        verbose=True,
    )
    service_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model="local:BAAI/bge-base-en-v1.5",
        context_window=3700,
        chunk_size=2048,
    )
    list_index = ListIndex.from_documents(documents, service_context=service_context)
    query_engine = list_index.as_query_engine(response_mode="tree_summarize")
    response = query_engine.query("Summarize in detail.")
    return response
An error occurred: "Requested tokens (3974) exceed context window of 3900". Does anyone know a fix? 🤯
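The numbers make me think the prompt itself is being packed right up against the limit: with chunk_size=2048, a tree_summarize prompt holding two chunks plus the template can already pass 3900 tokens before the 256 generation tokens are reserved. A minimal sketch of what I'm planning to try next (assuming the same legacy ServiceContext API; the num_output value and the smaller chunk_size are my guesses, not a confirmed fix):

# assumption: keeping both context_window values identical, telling the prompt
# helper how many tokens to reserve for generation via num_output, and using a
# smaller chunk_size should keep each packed prompt under the llama.cpp limit
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model="local:BAAI/bge-base-en-v1.5",
    context_window=3900,  # match the LlamaCPP context_window
    num_output=256,       # match max_new_tokens so room is left for the answer
    chunk_size=1024,      # smaller chunks so packed prompts stay under the limit
)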