# imports assumed from the pre-0.10 llama_index / LangChain APIs used below
import weaviate
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index import ServiceContext, SimpleDirectoryReader, StorageContext, VectorStoreIndex
from llama_index.node_parser import SimpleNodeParser
from llama_index.vector_stores import WeaviateVectorStore

import my_llm

client = weaviate.Client(
    url="http://localhost:8080",  # Replace with your endpoint
)

class_obj = {
    "class": "testing",
    # If set to "none" you must always provide vectors yourself. Could be any other "text2vec-*" also.
    "vectorizer": "none",
}
# client.schema.create_class(class_obj)

llm = my_llm.get_llm()
embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
)

# load the CSV in using the reader
doc = SimpleDirectoryReader(input_files=['./docs/Control.csv']).load_data()

# chunk the documents into nodes
parser = SimpleNodeParser.from_defaults(chunk_size=1024, chunk_overlap=20)
nodes = parser.get_nodes_from_documents(doc)
print("-----LOGGING----- generated nodes")

# construct the vector store
vector_store = WeaviateVectorStore(
    weaviate_client=client,
    index_name="Control",
    text_key="Control.csv",
    service_context=service_context,
)
print("-----LOGGING----- generated vector_store")

# set up the storage for the embeddings
storage_context = StorageContext.from_defaults(vector_store=vector_store)
print("-----LOGGING----- generated storage_context")

# set up the index
index = VectorStoreIndex(nodes, storage_context=storage_context)
print("-----LOGGING----- generated index")

query_engine = index.as_query_engine()
response = query_engine.query("What is control?")
print(response)
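Not part of the original script, but a quick way to check that the index build above actually wrote anything into Weaviate is to inspect the auto-created class and count the stored objects. This is only a debugging sketch, assuming the v3 weaviate-client API that weaviate.Client implies and the same "Control" class name used as index_name above.

# Debugging sketch only (assumes weaviate-client v3 and a "Control" class created by the index build).
print(client.schema.get("Control"))                              # inspect the class and its properties
print(client.query.aggregate("Control").with_meta_count().do())  # number of objects actually stored

If the count comes back as zero, the problem is on the indexing side rather than in the query engine.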
index = VectorStoreIndex(nodes, storage_context=storage_context, service_context=service_context)
query_engine = index.as_query_engine()
response = query_engine.query("What is control?")
print(response)
-----LOGGING----- generated nodes
-----LOGGING----- generated vector_store
-----LOGGING----- generated storage_context
-----LOGGING----- generated index
C:\dev\projects\OSS\criticAI\Django-server\llm\venv\lib\site-packages\pydantic\_internal\_config.py:267: PydanticDeprecatedSince20: Support for class-based `config` is deprecated, use ConfigDict instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/
  warnings.warn(DEPRECATION_MESSAGE, DeprecationWarning)

llama_print_timings:        load time =  26354.76 ms
llama_print_timings:      sample time =     69.35 ms /   265 runs   (    0.26 ms per token,  3821.09 tokens per second)
llama_print_timings: prompt eval time = 110783.42 ms /  2085 tokens (   53.13 ms per token,    18.82 tokens per second)
llama_print_timings:        eval time = 118748.08 ms /   264 runs   (  449.80 ms per token,     2.22 tokens per second)
llama_print_timings:       total time = 230292.76 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =  26354.76 ms
llama_print_timings:      sample time =      0.18 ms /     1 runs   (    0.18 ms per token,  5405.41 tokens per second)
llama_print_timings: prompt eval time =  39948.83 ms /   761 tokens (   52.50 ms per token,    19.05 tokens per second)
llama_print_timings:        eval time =      0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =  39954.59 ms
Empty Response
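One way to narrow down where the Empty Response above comes from is to run the retrieval step on its own, without the LLM. This is only a debugging sketch against the same index built above; as_retriever() / retrieve() are the standard LlamaIndex calls for this, and similarity_top_k=2 is an arbitrary choice.

# Debugging sketch only: check whether anything is retrieved for the query.
retriever = index.as_retriever(similarity_top_k=2)
retrieved = retriever.retrieve("What is control?")
print(len(retrieved))  # zero retrieved nodes would be consistent with an "Empty Response"
for node_with_score in retrieved:
    print(node_with_score.score, node_with_score.node.get_content()[:200])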
from llama_index.llms import LlamaCPP
# assumed source of the Llama2 prompt helpers used below
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt


def get_llm():
    llm = LlamaCPP(
        # You can pass in the URL to a GGML model to download it automatically
        model_url=None,
        # optionally, you can set the path to a pre-downloaded model instead of model_url
        model_path="llama-2-13b-chat.Q4_0.gguf",
        temperature=0.1,
        max_new_tokens=2048,
        # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
        context_window=3900,
        # kwargs to pass to __call__()
        generate_kwargs={},
        # kwargs to pass to __init__()
        # set to at least 1 to use GPU
        model_kwargs={"n_gpu_layers": 20},  # 28, 29, 30 layers works best on my setup.
        # transform inputs into Llama2 format
        messages_to_prompt=messages_to_prompt,
        completion_to_prompt=completion_to_prompt,
        verbose=True,
    )
    return llm
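To rule the model itself in or out, it can also be prompted directly, bypassing retrieval entirely. A minimal sketch, assuming the complete() method that LlamaCPP exposes in this llama_index version:

# Debugging sketch only: call the model directly, without the query engine.
llm = get_llm()
print(llm.complete("What is control?"))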
llama_print_timings:        load time =  27512.06 ms
llama_print_timings:      sample time =      0.19 ms /     1 runs   (    0.19 ms per token,  5291.01 tokens per second)
llama_print_timings: prompt eval time =  64437.86 ms /  1165 tokens (   55.31 ms per token,    18.08 tokens per second)
llama_print_timings:        eval time =      0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =  64445.55 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =  27512.06 ms
llama_print_timings:      sample time =      0.20 ms /     1 runs   (    0.20 ms per token,  5102.04 tokens per second)
llama_print_timings: prompt eval time =  19347.85 ms /   342 tokens (   56.57 ms per token,    17.68 tokens per second)
llama_print_timings:        eval time =      0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =  19350.54 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =  27512.06 ms
llama_print_timings:      sample time =      0.20 ms /     1 runs   (    0.20 ms per token,  4926.11 tokens per second)
llama_print_timings: prompt eval time =  18448.74 ms /   357 tokens (   51.68 ms per token,    19.35 tokens per second)
llama_print_timings:        eval time =      0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =  18451.77 ms
Empty Response