Is it possible to clear the GPU VRAM somehow after doing a query? I'm using LlamaCPP currently, and VRAM is just stuck at 60% even after running torch.cuda.empty_cache() and deleting the model. If anyone knows anything, please let me know, thanks!
import gc

import torch
from llama_index import ListIndex, ServiceContext
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import completion_to_prompt, messages_to_prompt


def summarize(file_path):
    # initialize everything up front so the del calls in finally can't raise NameError
    llm = service_context = list_index = query_engine = response = None
    try:
        documents = load_data(file_path)
        llm = LlamaCPP(
            # optionally, set the path to a pre-downloaded model instead of model_url
            model_path="models/mistral7b",
            temperature=0.1,
            max_new_tokens=256,
            # Llama 2 has a 4096-token context window, but we set it lower for some wiggle room
            context_window=3900,
            # kwargs to pass to __call__()
            generate_kwargs={},
            # kwargs to pass to __init__(); set n_gpu_layers to at least 1 to use the GPU
            # (at most 35 layers can be offloaded to the GPU with Mistral 7B)
            model_kwargs={"n_gpu_layers": 18},
            # transform inputs into Llama 2 format
            messages_to_prompt=messages_to_prompt,
            completion_to_prompt=completion_to_prompt,
            verbose=True,
        )
        service_context = ServiceContext.from_defaults(
            llm=llm,
            embed_model="local:BAAI/bge-base-en-v1.5",
            context_window=3700,
        )
        list_index = ListIndex.from_documents(documents, service_context=service_context)
        query_engine = list_index.as_query_engine(response_mode="tree_summarize")
        response = query_engine.query("Summarize in detail.")
        return response
    except Exception as e:
        print(e)
    finally:
        # drop every reference so the model can be garbage-collected,
        # then collect before asking PyTorch to release its cached blocks
        del llm, service_context, list_index, query_engine, response
        gc.collect()
        torch.cuda.empty_cache()
GPU memory usage is still stuck and won't clear.
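For reference, here is roughly how I'm checking the usage (report_vram is just a little helper I wrote for this post, and it assumes nvidia-smi is on the PATH). From what I understand, torch.cuda.empty_cache() only releases blocks held by PyTorch's own caching allocator, and the layers llama.cpp offloads are allocated through its own CUDA buffers, so they never show up in torch's numbers, only in nvidia-smi:

import subprocess

import torch

def report_vram(tag):
    # What PyTorch's caching allocator sees; llama.cpp's offloaded layers
    # live outside this allocator, so they won't appear in these figures.
    allocated_mb = torch.cuda.memory_allocated() / 1024**2
    reserved_mb = torch.cuda.memory_reserved() / 1024**2
    # What the driver sees -- this is where the "stuck at 60%" number comes from.
    used = subprocess.run(
        ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader"],
        capture_output=True, text=True,
    ).stdout.strip()
    print(f"[{tag}] torch allocated={allocated_mb:.0f} MiB, "
          f"reserved={reserved_mb:.0f} MiB, nvidia-smi used={used}")

report_vram("before cleanup")
# ... del llm, gc.collect(), torch.cuda.empty_cache() ...
report_vram("after cleanup")

With the cleanup in the finally block above, torch's allocated/reserved numbers drop, but the nvidia-smi figure barely moves.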