I'm getting an `'LLMPredictor' object has no attribute '_llm'` error when attempting to perform RAG inference on an index within a demo web app I'm building with Plotly Dash.

```
  File "D:\LLM_Work\llm-server-webapp\bring-your-own-documents\app-bring-your-own-docs3.py", line 319, in response_stream
    yield from (line for line in query_engine.query(user_question).response_gen)
  File "D:\LLM_Work\llm-server-webapp\.venv\lib\site-packages\llama_index\core\base_query_engine.py", line 30, in query
    return self._query(str_or_query_bundle)
  File "D:\LLM_Work\llm-server-webapp\.venv\lib\site-packages\llama_index\query_engine\retriever_query_engine.py", line 171, in _query
    response = self._response_synthesizer.synthesize(
  File "D:\LLM_Work\llm-server-webapp\.venv\lib\site-packages\llama_index\response_synthesizers\base.py", line 146, in synthesize
    response_str = self.get_response(
  File "D:\LLM_Work\llm-server-webapp\.venv\lib\site-packages\llama_index\response_synthesizers\compact_and_refine.py", line 38, in get_response
    return super().get_response(
  File "D:\LLM_Work\llm-server-webapp\.venv\lib\site-packages\llama_index\response_synthesizers\refine.py", line 146, in get_response
    response = self._give_response_single(
  File "D:\LLM_Work\llm-server-webapp\.venv\lib\site-packages\llama_index\response_synthesizers\refine.py", line 194, in _give_response_single
    program = self._program_factory(text_qa_template)
  File "D:\LLM_Work\llm-server-webapp\.venv\lib\site-packages\llama_index\response_synthesizers\refine.py", line 177, in _default_program_factory
    llm=self._service_context.llm,
  File "D:\LLM_Work\llm-server-webapp\.venv\lib\site-packages\llama_index\service_context.py", line 322, in llm
    return self.llm_predictor.llm
  File "D:\LLM_Work\llm-server-webapp\.venv\lib\site-packages\llama_index\llm_predictor\base.py", line 143, in llm
    return self._llm
AttributeError: 'LLMPredictor' object has no attribute '_llm'
```
```python
@app.server.route("/esic-rag/streaming-chat", methods=["POST"])
def streaming_chat():
    sys_prompt = request.json["sys_prompt"]
    user_prompt = request.json["prompt"]
    user_question = request.json["question"]
    sim_top_k = request.json["sim_top_k"]
    session_id = request.json["session_id"]

    llm = OpenAILike(
        model="local:llama-2-13b-chat.Q4_K_S",
        api_base="http://localhost:8000/v1",
        api_key="fake",
        api_type="fake",
        max_tokens=3900,
        is_chat_model=True,
    )
    service_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model="local:BAAI/bge-small-en-v1.5",
        chunk_size=256,
        num_output=256,
    )
    set_global_service_context(service_context)

    index = pickle.loads(cache.get(session_id))
    cache.set(session_id, pickle.dumps(index))

    # Create a system message
    system_message = ChatMessage(role=MessageRole.SYSTEM, content=sys_prompt)
    user_prompt = ChatMessage(role=MessageRole.USER, content=user_prompt)
    text_qa_template = ChatPromptTemplate(message_templates=[system_message, user_prompt])

    ### QUERY ENGINE SECTION
    query_engine = index.as_query_engine(
        streaming=True,
        text_qa_template=text_qa_template,
        similarity_top_k=sim_top_k,
    )

    def response_stream():
        yield from (line for line in query_engine.query(user_question).response_gen)

    return Response(response_stream(), mimetype="text/response-stream")
```
The difference is that I'm now caching the `index` in the filesystem, whereas before I was creating the index more globally without the need to cache different indices. `self._llm` is nonexistent all of a sudden 🤔

```python
@app.callback(
    [Output('context-results', 'children'),
     Output('search-result-header', 'children')],
    Input("submit-prompt", "n_clicks"),
    [State('text-question', 'value'),
     State("num-excerpts", "value"),
     State('session-id', 'data')]
)
def query_vector_db(clicks, question, sim_top_k, session_id):
    if clicks is None:
        raise PreventUpdate

    # get previously created cached index from filesystem
    index = pickle.loads(cache.get(session_id))
    retriever = index.as_retriever(similarity_top_k=sim_top_k)
    nodes = retriever.retrieve(question)

    search_results = []
    for node in nodes:
        doc = node.metadata['file_name']
        page = node.metadata['page_label']
        url = 'https://www.hello.com/00000000' + doc.split('_')[0]
        excerpt_text = node.text
        search_results.append(
            dbc.Card(
                [
                    dbc.CardBody(
                        [
                            html.H4(doc, className="card-title"),
                            html.H6('Page ' + page, className="card-title"),
                            html.P(excerpt_text, className="card-text"),
                            dbc.Button("Get the Report", color="primary", href=url, target='_blank'),
                        ]
                    ),
                ],
            )
        )

    return [search_results, "Top %d Search Results (input context for LLM):" % sim_top_k]
```
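The traceback resolves `ServiceContext.llm` through `LLMPredictor._llm`, and that attribute seems to be missing only on the unpickled copy of the index. Below is a minimal sketch (assuming the legacy `ServiceContext`/`LLMPredictor` API visible in the traceback, and an `index` built the same way as above) that isolates just the pickle round trip the filesystem cache performs:

```python
# Minimal sketch (not the app code): isolate the pickle round trip used by the cache.
import pickle

restored = pickle.loads(pickle.dumps(index))

# The predictor object itself survives the round trip...
print(type(restored.service_context.llm_predictor))

# ...but ServiceContext.llm resolves through LLMPredictor._llm, which is the
# attribute the traceback reports as missing on the restored copy.
print(restored.service_context.llm)  # expected to raise the same AttributeError if the LLM was dropped
```

If the second print raises the same AttributeError, it's the cache round trip, not the query engine, that is dropping the LLM.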
```python
import json

# "save"
storage_string = json.dumps(index.storage_context.to_dict())

# "load"
from llama_index import load_index_from_storage, StorageContext

storage_dict = json.loads(storage_string)
storage_context = StorageContext.from_dict(storage_dict)

# optional service context
index = load_index_from_storage(storage_context, service_context=service_context)
```
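A sketch of how that serialization could slot into the cache-based flow above, reusing the `cache` object and the `service_context` from `streaming_chat` (names taken from the question; this is an illustration, not a drop-in patch):

```python
import json
from llama_index import StorageContext, load_index_from_storage

# when the index is first built: cache a JSON string instead of a pickle
cache.set(session_id, json.dumps(index.storage_context.to_dict()))

# inside each callback: rebuild the index from the cached JSON and pass in the
# freshly constructed service_context so the LLM is re-attached
storage_dict = json.loads(cache.get(session_id))
storage_context = StorageContext.from_dict(storage_dict)
index = load_index_from_storage(storage_context, service_context=service_context)
```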
`VectorStoreIndex.from_vector_store(vector_store)`
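`from_vector_store` avoids serializing the docstore at all by rebuilding the index from wherever the embeddings already live. A sketch assuming a persistent Chroma collection keyed by the session id (the Chroma client, path, and collection naming are illustrative, not part of the original app):

```python
# Sketch under assumptions: a persistent Chroma store holds the embeddings, so
# nothing large needs to be pickled or JSON-encoded per session.
import chromadb
from llama_index import VectorStoreIndex
from llama_index.vector_stores import ChromaVectorStore

chroma_client = chromadb.PersistentClient(path="./chroma-cache")               # hypothetical path
collection = chroma_client.get_or_create_collection(f"session-{session_id}")   # hypothetical naming

vector_store = ChromaVectorStore(chroma_collection=collection)
index = VectorStoreIndex.from_vector_store(vector_store, service_context=service_context)
```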
The `storage_context = StorageContext.from_dict(storage_dict)` step is taking many seconds.
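If the per-request rebuild cost is the issue, one hedged workaround is to memoize the reconstructed index per worker process so `from_dict` only runs once per session (the `get_cached_index` helper and `_index_cache` dict below are illustrative, not from the original app):

```python
import json
from llama_index import StorageContext, load_index_from_storage

# Hypothetical per-process memo so each session pays StorageContext.from_dict()
# only once instead of on every callback invocation.
_index_cache = {}

def get_cached_index(session_id, service_context):
    if session_id not in _index_cache:
        storage_dict = json.loads(cache.get(session_id))
        storage_context = StorageContext.from_dict(storage_dict)
        _index_cache[session_id] = load_index_from_storage(
            storage_context, service_context=service_context
        )
    return _index_cache[session_id]
```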