tonytonfisk
Is it possible to clear the GPU VRAM somehow after doing a query? I'm using LlamaCPP currently, and the VRAM is just stuck at 60% even after running torch.cuda.empty_cache() and deleting the model. If anyone knows anything, please let me know, thanks!

Plain Text
try:
    documents = load_data(file_path)
    llm = LlamaCPP(
        # optionally, you can set the path to a pre-downloaded model instead of model_url
        model_path="models/mistral7b",
        temperature=0.1,
        max_new_tokens=256,
        # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
        context_window=3900,
        # kwargs to pass to __call__()
        generate_kwargs={},
        # kwargs to pass to __init__()
        # set to at least 1 to use GPU
        model_kwargs={"n_gpu_layers": 18},  # a max of 35 layers can be offloaded to the GPU with Mistral 7B
        # transform inputs into Llama2 format
        messages_to_prompt=messages_to_prompt,
        completion_to_prompt=completion_to_prompt,
        verbose=True,
    )
    service_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model="local:BAAI/bge-base-en-v1.5",
        context_window=3700,
    )
    list_index = ListIndex.from_documents(documents, service_context=service_context)

    query_engine = list_index.as_query_engine(response_mode="tree_summarize")
    response = query_engine.query("Summarize in detail.")
    return response
except Exception as e:
    print(e)
finally:
    del llm
    del service_context
    del list_index
    del query_engine
    del response
    torch.cuda.empty_cache()
    gc.collect()


GPU memory usage is still stuck and won't clear.
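In case it's useful, here is a sketch of one thing I'd try (the function name and the last three lines are mine, not from the post above, and it assumes the same imports and helpers — load_data, messages_to_prompt, completion_to_prompt — as the snippet). As far as I know, torch.cuda.empty_cache() only releases blocks held by PyTorch's caching allocator, so it can help with the bge embedding model but not with llama.cpp's weights; those are only freed once the underlying llama_cpp.Llama object is actually destroyed. Two things in the snippet above can get in the way of that: the bare del statements in the finally block raise NameError if an earlier line failed before the name was bound (which aborts the remaining cleanup), and returning the full Response object may keep references alive in the caller. Keeping every heavy object local to one function and returning a plain string sidesteps both.

Plain Text
import gc

import torch

def summarize(file_path):
    # Sketch only: keep every heavy object local and return a plain string,
    # so nothing outside this function keeps the LlamaCPP instance alive.
    documents = load_data(file_path)
    llm = LlamaCPP(
        model_path="models/mistral7b",
        temperature=0.1,
        max_new_tokens=256,
        context_window=3900,
        model_kwargs={"n_gpu_layers": 18},
        messages_to_prompt=messages_to_prompt,
        completion_to_prompt=completion_to_prompt,
        verbose=True,
    )
    service_context = ServiceContext.from_defaults(
        llm=llm, embed_model="local:BAAI/bge-base-en-v1.5", context_window=3700
    )
    list_index = ListIndex.from_documents(documents, service_context=service_context)
    query_engine = list_index.as_query_engine(response_mode="tree_summarize")
    # str() copies the answer text out; the Response object, the index and the
    # model all go out of scope when the function returns.
    return str(query_engine.query("Summarize in detail."))

text = summarize(file_path)
gc.collect()                # force collection so the Llama object's destructor can free its buffers
torch.cuda.empty_cache()    # clears PyTorch's cache (the embedding model side)

Even then, nvidia-smi will usually still show a few hundred MB in use: the CUDA context itself stays allocated for the lifetime of the Python process and is only returned when the process exits.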
2 comments
Plain Text
def model(documents):
    llm = LlamaCPP(
        # optionally, you can set the path to a pre-downloaded model instead of model_url
        model_path="models/mistral7b",
        temperature=0.1,
        max_new_tokens=256,
        # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
        context_window=3900,
        # kwargs to pass to __call__()
        generate_kwargs={},
        # kwargs to pass to __init__()
        # set to at least 1 to use GPU
        model_kwargs={"n_gpu_layers": -1},
        # transform inputs into Llama2 format
        messages_to_prompt=messages_to_prompt,
        completion_to_prompt=completion_to_prompt,
        verbose=True,
    )

    service_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model="local:BAAI/bge-base-en-v1.5",
        context_window=3700,
        chunk_size=2048,
    )
    list_index = ListIndex.from_documents(documents, service_context=service_context)

    query_engine = list_index.as_query_engine(response_mode="tree_summarize")
    response = query_engine.query("Summarize in detail.")
    return response


An error occurred: "Requested tokens (3974) exceed context window of 3900". Does anyone know any fixes? 🤯
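One thing that might be worth trying (the margins below are guesses, not a verified fix): tree_summarize packs chunk text up to the context_window configured on the ServiceContext, but llama.cpp also has to fit the prompt template and the generated tokens inside its 3900-token window, and I believe llama_index's token counting doesn't always match llama.cpp's own tokenizer exactly. Telling the service context the window is smaller than it really is, and using a smaller chunk_size, leaves headroom for all of that. A sketch, reusing the llm and documents from the function above:

Plain Text
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model="local:BAAI/bge-base-en-v1.5",
    # stay well under llama.cpp's n_ctx of 3900: leave room for the prompt
    # template, the generated tokens, and tokenizer count mismatches
    context_window=3000,
    num_output=256,     # reserve space for max_new_tokens in the prompt budget
    chunk_size=1024,
    chunk_overlap=64,
)
list_index = ListIndex.from_documents(documents, service_context=service_context)
query_engine = list_index.as_query_engine(response_mode="tree_summarize")
response = query_engine.query("Summarize in detail.")

If it still overflows, dropping context_window further (or chunk_size to 512) should make the packed prompts smaller, at the cost of more summarize passes.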
3 comments