Got a larger chunk overlap (-3) than chunk size (-39), should be smaller.
# Imports assume the llama_index 0.6.x-era API used in this post; paths differ in newer releases.
import os

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import (GPTListIndex, LangchainEmbedding, PromptHelper,
                         ServiceContext, SimpleDirectoryReader, StorageContext,
                         load_index_from_storage)
from llama_index.llm_predictor import HuggingFaceLLMPredictor


def initialize_index():
    global index

    llm_predictor = HuggingFaceLLMPredictor(
        max_input_size=512,
        max_new_tokens=512,
        tokenizer_name="facebook/opt-iml-max-1.3b",
        model_name="facebook/opt-iml-max-1.3b",
        model_kwargs={"load_in_8bit": True},
        generate_kwargs={
            "do_sample": True,
            "top_k": 4,
            "penalty_alpha": 0.6,
        },
    )
    # num_output equals context_window here, which leaves no token budget for
    # text chunks and produces the negative chunk size in the error above.
    prompt_helper = PromptHelper(context_window=512, chunk_size_limit=256, num_output=512)
    embed_model = LangchainEmbedding(HuggingFaceEmbeddings())
    service_context = ServiceContext.from_defaults(
        llm_predictor=llm_predictor,
        embed_model=embed_model,
        prompt_helper=prompt_helper,
    )

    if os.path.exists("../indices"):
        # Reload the persisted index if it already exists on disk.
        storage_context = StorageContext.from_defaults(persist_dir="../indices")
        index = load_index_from_storage(
            storage_context=storage_context, service_context=service_context
        )
    else:
        # Otherwise build a new list index from the documents and persist it.
        storage_context = StorageContext.from_defaults()
        documents = SimpleDirectoryReader("../data").load_data()
        index = GPTListIndex.from_documents(
            documents=documents,
            service_context=service_context,
            storage_context=storage_context,
        )
        index.set_index_id("paul_graham_essay")
        index.storage_context.persist("../indices")

    return index, service_context
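A likely cause of the chunk-overlap error above is that num_output (512) is as large as context_window (512), so the prompt helper is left with a negative token budget when it tries to fit retrieved text into the prompt. A minimal sketch of a configuration that leaves room for chunks (the specific numbers below are illustrative, not taken from the original code):

from llama_index import PromptHelper

# Illustrative values only: keep num_output well below context_window so the
# prompt helper still has tokens available for the text chunks it packs in.
prompt_helper = PromptHelper(
    context_window=512,
    num_output=128,        # must stay smaller than context_window
    chunk_size_limit=256,
)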
from typing import Any, List, Mapping, Optional

from langchain.llms.base import LLM
from transformers import pipeline


class LocalOptModel(LLM):
    """Custom LangChain LLM wrapping a local OPT text-generation pipeline."""

    model_name = "facebook/opt-iml-max-1.3b"
    generation_pipeline = pipeline(
        "text-generation",
        model=model_name,
        model_kwargs={"load_in_8bit": True, "device_map": "auto"},
    )

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        # The pipeline echoes the prompt, so strip it off and return only the
        # newly generated continuation.
        prompt_len = len(prompt)
        response = self.generation_pipeline(
            prompt,
            do_sample=True,
            max_new_tokens=256,
            top_k=4,
            penalty_alpha=0.6,
        )[0]["generated_text"]
        return response[prompt_len:]

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        return {"name_of_model": self.model_name}

    @property
    def _llm_type(self) -> str:
        return "custom"
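Before wiring the wrapper into an index, it can be exercised on its own; this is a hypothetical smoke test (not part of the original code), and it assumes a machine with enough GPU memory for the 8-bit model:

# Hypothetical smoke test for the custom LLM wrapper defined above.
if __name__ == "__main__":
    llm = LocalOptModel()
    # LangChain LLMs are callable directly with a prompt string.
    print(llm("Q: What is the capital of France?\nA:"))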
from llama_index import GPTVectorStoreIndex, LLMPredictor


def initialize_index():
    global index, service_context

    # Same setup as before, but the custom LangChain LLM is wrapped in an
    # LLMPredictor and a vector store index replaces the list index.
    llm_predictor = LLMPredictor(llm=LocalOptModel(verbose=True))
    prompt_helper = PromptHelper(context_window=512, chunk_size_limit=200, num_output=100)
    embed_model = LangchainEmbedding(HuggingFaceEmbeddings())
    service_context = ServiceContext.from_defaults(
        llm_predictor=llm_predictor,
        embed_model=embed_model,
        prompt_helper=prompt_helper,
    )

    if os.path.exists("../indices"):
        storage_context = StorageContext.from_defaults(persist_dir="../indices")
        index = load_index_from_storage(
            storage_context=storage_context, service_context=service_context
        )
    else:
        storage_context = StorageContext.from_defaults()
        documents = SimpleDirectoryReader("../data").load_data()
        index = GPTVectorStoreIndex.from_documents(
            documents=documents,
            service_context=service_context,
            storage_context=storage_context,
        )
        index.set_index_id("paul_graham_essay")
        index.storage_context.persist(persist_dir="../indices")

    return index, service_context
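For reference, the index can also be queried directly, without the chat engine, which helps separate retrieval/LLM problems from agent problems. This snippet is a sketch and not part of the original code; the question text is just an example against the Paul Graham essay data:

# Sketch: query the persisted vector index directly, bypassing the chat engine.
index, service_context = initialize_index()
query_engine = index.as_query_engine()
print(query_engine.query("What did the author do growing up?"))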
import traceback

from flask import request


def generateResponse():
    # Flask view (the route decorator is omitted in the original post); the
    # user query is expected in the "q" request argument.
    global index, service_context
    try:
        chat_engine = index.as_chat_engine(
            chat_mode="react",
            verbose=True,
            service_context=service_context,
        )
        response = chat_engine.chat(request.args.get("q", None))
        print(response)
        return {"message": "Success", "status": 200, "data": response}
    except Exception as e:
        print(traceback.format_exc())
        return {"message": "Request could not be processed", "status": 503}
The chat request then fails with:

langchain.schema.OutputParserException: Could not parse LLM output: Previous conversation history:
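The "react" chat mode drives a LangChain agent, which expects the model to emit its reasoning in a strict Thought/Action format; a 1.3B instruction model often drifts from that format, and LangChain's output parser then raises the OutputParserException shown above. One possible workaround is a chat mode that does not rely on agent-formatted output. This is a hedged sketch, assuming the installed llama_index version supports the "condense_question" chat mode; the question text is again just an example:

# Sketch: avoid the ReAct agent by using a simpler chat mode.
chat_engine = index.as_chat_engine(
    chat_mode="condense_question",
    verbose=True,
    service_context=service_context,
)
response = chat_engine.chat("What did the author do growing up?")
print(response)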