`Settings.chunk_size` by half, as well as `similarity_top_k` in the query engine, to get what had worked fairly well in v0.9 running again.

```python
import torch
from langchain_community.embeddings import HuggingFaceEmbeddings
from llama_index.core import Settings, StorageContext, load_index_from_storage
# Legacy prompt class; the exact import path may vary across 0.10.x releases.
from llama_index.core.prompts.prompts import SimpleInputPrompt
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from transformers import BitsAndBytesConfig

# Build vector embeddings.
embed_model = "sentence-transformers/all-mpnet-base-v2"
kwargs = {"device": "cuda"}
embeddings = LangchainEmbedding(
    HuggingFaceEmbeddings(model_kwargs=kwargs, model_name=embed_model)
)
```
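One lever the snippet above doesn't pull: the embedding model itself also competes for VRAM. A minimal variation (my assumption, not part of the original setup) is to pin it to the CPU so the whole GPU is left for the 13B checkpoint:

```python
# Variation (assumption, not the original setup): run the embedding model on
# the CPU so that all GPU memory is left for the quantized LLaMA-2 weights.
# Embedding a single query on CPU is usually fast enough for interactive use.
cpu_embeddings = LangchainEmbedding(
    HuggingFaceEmbeddings(
        model_kwargs={"device": "cpu"},
        model_name="sentence-transformers/all-mpnet-base-v2",
    )
)
```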
```python
# Configure prompts.
query_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")
sys_prompt = (
    "You are a Q&A assistant. Your job is to answer questions as accurately "
    "as possible based ONLY on context based on document knowledge. "
    "Never provide answers not found in the context!"
)

# Configure quantization settings.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)
```
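Given that the OOM below is only a few MiB short, a more aggressive quantization config is another option. A sketch (an assumption on my side, not what the original run used) of a 4-bit NF4 setup, which roughly halves the weight footprint again relative to 8-bit:

```python
# Alternative (assumption, not the original config): 4-bit NF4 quantization.
# This roughly halves the memory needed for the weights compared to 8-bit.
bnb_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
)
```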
```python
# Initialize the LLaMA-2 model.
llama = HuggingFaceLLM(
    context_window=4096,
    device_map="auto",
    generate_kwargs={"do_sample": True, "temperature": 0.1},
    max_new_tokens=256,
    model_kwargs={"quantization_config": bnb_config, "torch_dtype": torch.float16},
    model_name="meta-llama/Llama-2-13b-chat-hf",
    query_wrapper_prompt=query_prompt,
    system_prompt=sys_prompt,
    tokenizer_name="meta-llama/Llama-2-13b-chat-hf",
    tokenizer_kwargs={"max_length": 4096},
)

# Initialize LlamaIndex global settings.
Settings.chunk_overlap = 20
Settings.chunk_size = 512
Settings.embed_model = embeddings
Settings.llm = llama

# Initialize storage context.
storage = StorageContext.from_defaults(persist_dir="./datasets/data")

# Load index from local storage.
index = load_index_from_storage(show_progress=True, storage_context=storage)

# Query the index with response streaming.
query_engine = index.as_query_engine(similarity_top_k=3, streaming=True)

answer = query_engine.query("Who are the Teenage Mutant Ninja Turtles?")
answer.print_response_stream()
```
```
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 36.00 MiB. GPU 0 has a total capacity of 21.99 GiB of which 31.00 MiB is free. Process 10182 has 21.95 GiB memory in use. Of the allocated memory 18.37 GiB is allocated by PyTorch, and 3.26 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.
```
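The allocator's own hint is worth trying first: with 3.26 GiB reserved but unallocated, fragmentation is a plausible culprit. A minimal sketch of the suggested fix; the variable has to be set before CUDA is initialized:

```python
import os

# Must be set before the first CUDA allocation, i.e. at the very top of the
# script, before torch ever touches the GPU.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
```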
The `context_window` size seems to have a much more significant impact on the memory footprint than `chunk_size` and `similarity_top_k`.
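To test that, a sketch of the same model with a halved window (the 2048 value is my assumption for the experiment, not something from the run above); the tokenizer's `max_length` should be kept in sync:

```python
# Experiment (values are assumptions): halve the context window, which caps
# how much prompt (and hence attention/KV memory) a single query can consume.
llama_small_ctx = HuggingFaceLLM(
    context_window=2048,
    device_map="auto",
    generate_kwargs={"do_sample": True, "temperature": 0.1},
    max_new_tokens=256,
    model_kwargs={"quantization_config": bnb_config, "torch_dtype": torch.float16},
    model_name="meta-llama/Llama-2-13b-chat-hf",
    query_wrapper_prompt=query_prompt,
    system_prompt=sys_prompt,
    tokenizer_name="meta-llama/Llama-2-13b-chat-hf",
    tokenizer_kwargs={"max_length": 2048},
)
Settings.llm = llama_small_ctx
```

Note that three 512-token chunks plus prompt overhead come close to filling a 2048-token window, so the retriever settings may need to shrink along with it.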