I'm sorry, I wasn't clear: by max_tokens I meant the output-token limit, the parameter that PromptHelper calls num_output.
from llama_index import (
    StorageContext,
    load_index_from_storage,
    Prompt,
    LLMPredictor,
    ServiceContext,
    SimpleDirectoryReader,
    VectorStoreIndex,
    LangchainEmbedding,
    Document,
    ListIndex,
    PromptHelper
)
from os import listdir
from os.path import isfile, join
import json
from llama_index.optimization.optimizer import SentenceEmbeddingOptimizer
from langchain.llms import LlamaCpp
from langchain.embeddings import HuggingFaceEmbeddings
import os
import sys
import logging
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
MODEL = LlamaCpp(
    model_path="models/wizardlm-30B-uncensored.ggmlv3.q4_0.bin",
    verbose=False,
    max_tokens=512,
    n_ctx=2048
)
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
EMBEDDINGS_MODEL = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
    cache_folder="models/transformers/"
)
template = (
    "Context: \n"
    "---------------------\n"
    "{context_str}"
    "\n---------------------\n"
    " ### Human: {query_str}\n"
    "### Assistant: "
)
QA_TEMPLATE = Prompt(template)
service_context = ServiceContext.from_defaults(
    llm_predictor=LLMPredictor(llm=MODEL),
    embed_model=LangchainEmbedding(EMBEDDINGS_MODEL),
    # With these values the prompt occasionally overflowed the 2048-token context;
    # context_window=1536 worked reliably.
    prompt_helper=PromptHelper(context_window=2048, num_output=512),
    chunk_size=512
)
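# Why context_window=1536 helps (rough budget sketch, not measured): PromptHelper reserves
# num_output tokens for the answer, so the packed prompt may use up to
# context_window - num_output tokens (2048 - 512 = 1536 here), which fills n_ctx=2048 with
# no slack. llama_index's token estimates can differ slightly from llama.cpp's tokenizer,
# so context_window=1536 (prompt budget 1536 - 512 = 1024) leaves headroom against overflow.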
storage_context = StorageContext.from_defaults()
topic_indexes = []
topic_index_summaries = []
mypath = "data"
files = [...]
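For reference, one way the loop over the files could continue, reusing the objects defined above. This is a minimal sketch: the per-file loop, the summary strings, and the query_engine example are illustrative, not the exact original code.

# Illustrative continuation: one vector index per file, plus a summary label for each topic.
for filename in files:  # assuming `files` holds the file names found under mypath
    documents = SimpleDirectoryReader(input_files=[join(mypath, filename)]).load_data()
    index = VectorStoreIndex.from_documents(
        documents,
        service_context=service_context,
        storage_context=storage_context,
    )
    topic_indexes.append(index)
    topic_index_summaries.append(f"Content of the file {filename}")

# Querying one of the indexes with the custom Human/Assistant prompt defined above:
query_engine = topic_indexes[0].as_query_engine(text_qa_template=QA_TEMPLATE)
print(query_engine.query("What is this document about?"))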