--------------------------------------------------------------------------- KeyError Traceback (most recent call last) Cell In[20], line 1 ----> 1 response_header = header_query_engine.query(new_header_query) File <hidden_path>\.venv\lib\site-packages\llama_index\core\instrumentation\dispatcher.py:260, in Dispatcher.span.<locals>.wrapper(func, instance, args, kwargs) 252 self.span_enter( 253 id_=id_, 254 bound_args=bound_args, (...) 257 tags=tags, 258 ) 259 try: --> 260 result = func(*args, **kwargs) 261 except BaseException as e: 262 self.event(SpanDropEvent(span_id=id_, err_str=str(e))) File <hidden_path>\.venv\lib\site-packages\llama_index\core\base\base_query_engine.py:52, in BaseQueryEngine.query(self, str_or_query_bundle) 50 if isinstance(str_or_query_bundle, str): 51 str_or_query_bundle = QueryBundle(str_or_query_bundle) ---> 52 query_result = self._query(str_or_query_bundle) 53 dispatcher.event( 54 QueryEndEvent(query=str_or_query_bundle, response=query_result) 55 ) 56 return query_result ... --> 302 content = content_template.format(**relevant_kwargs) 304 message: ChatMessage = message_template.copy() 305 message.content = content KeyError: "' Item No"
# Schema for the structured response. The class docstring and the Field
# descriptions are part of the schema handed to the LLM, so their wording is
# kept as-is; only the stray backslash/whitespace left by the old
# line-continuation inside the description strings has been removed.
class HeaderOutput(BaseModel):
    """Outputs header information containing the following categories: BillTo, BillToAddress, BillToNumber."""

    # Name of the customer being billed.
    BillTo: str = Field(..., description="This is the name of the customer.")
    # Billing address, with the customer's company address as the fallback.
    BillToAddress: str = Field(
        ...,
        description=(
            "This is the billing address of the customer. If cannot be found, "
            "extract from the company address of the customer."
        ),
    )
    # Customer's Tax Identification Number (TIN).
    BillToNumber: str = Field(
        ...,
        description="This is the Tax Identification Number (TIN) of the customer.",
    )


# Wrap the LLM so that its responses are parsed into HeaderOutput.
struct_llm = llm.as_structured_llm(output_cls=HeaderOutput)

# The keyword is `node_postprocessors` (plural). The previous spelling,
# `node_postprocessor`, raised no error, but presumably the reranker was never
# applied -- verify against the installed llama_index version.
header_query_engine = index.as_query_engine(
    similarity_top_k=top_k,
    node_postprocessors=[reranker],
    llm=struct_llm,
    verbose=False,
)

# NOTE(review): the KeyError in the traceback is raised by str.format() on the
# prompt content; presumably the retrieved context contains literal "{" / "}"
# characters that get treated as template placeholders -- confirm.
new_header_query = """my query"""
response_header = header_query_engine.query(new_header_query)
# Extraction prompt for the invoice header fields. It is assembled from
# adjacent string literals (one sentence or bullet per piece, each starting
# with its separating space) so the text is reviewable line by line; the
# resulting value is a single line with no newlines.
new_header_query = (
    " You are an intelligent structured data extraction assistant."
    " Your task is to extract header information from the given context."
    " The context are part of a digitalized product invoice document."
    " You must ALWAYS extract all header information in your response"
    " unless the information is not available in the context."
    " If there are any information which cannot be found in the context,"
    " you need not response with that information."
    " The context also contains table however you DO NOT need any"
    " information from tables for your extraction."
    " You must not invent any information and perform any calculation."
    " You must only use the information provided in the context."
    " The header information to be extracted are:"
    " BillTo, BillToAddress, BillToNumber."
    " Following are the requirements for the extraction:"
    " - Do not stop extraction until all categories have been extracted."
    ' - For categories not explicitly stated or given as "NA" in the context,'
    " they do NOT require extraction."
    " Omit them from the structured response."
    " - Do NOT invent your own categories."
    " - Only use information provided in the context in your response."
)
LLMTextCompletionProgram
but not mentioned in structured_outputs.ipynb
# DOCUMENT TYPE - Invoice # LAYOUT TYPE - Product # DOCUMENT PROCESSING COUNTRY - United Kingdom # DOCUMENT NUMBER - MK 1759226 # DOCUMENT DATE - 10/07/24 # CUSTOMER PO NUMBER - D71280 # SUPPLIER DETAILS - COMPANY NAME: Essentra Components - ADDRESS: 2nd Floor Hawthorne House Viking Business Park, Jarrow, Tyne & Wear, NE32 3DP - TAX IDENTIFICATION NUMBER: GB 243 2909 68 # CUSTOMER DETAILS - COMPANY NAME: Argus Powerbend - ADDRESS: Pennywell Industrial Estate, Sunderland, SR4 9EN - TAX IDENTIFICATION NUMBER: GB 938950575 # DELIVERY/SHIPPING LOCATION NAME - ARGUS # DELIVERY/SHIPPING LOCATION ADDRESS - Pennywell Ind Est, Sunderland, SR4 9EN # BILLING NAME & ADDRESS - Argus Powerbend, Pennywell Industrial Estate, Sunderland, SR4 9EN # CURRENCY CODE - GBP # CURRENCY EXCHANGE RATE - NA # TABLE OF LINE ITEMS | Item No. | Customer Item No. | Item Description | Delivery Note Number | Quantity Ordered | UoM | Quantity Shipped | Unit Price (GBP) | Total Price (GBP) | |----------|-------------------|---------------------------|----------------------|------------------|-----|------------------|------------------|-------------------| | 16247 | CAP78 | CP QUICK RLSE UNF1X14 | 1803843508 | 1.920 | TH | 1.920 | 48.7800 | 93.66 | | | | CP QUICK RLSE UNF1X14 | | | | | | | | | | Standard Delivery | | | | | | |
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import MarkdownElementNodeParser

# Build the parser that splits the markdown document into nodes.
node_parser = MarkdownElementNodeParser(
    llm=llm,
    num_workers=8,
    show_progress=False,
)

# Parse the single document into nodes.
nodes = node_parser.get_nodes_from_documents([document])

# Separate the parsed nodes into the two groups the parser returns
# (text nodes and table objects, per the original comment).
base_nodes, objects = node_parser.get_nodes_and_objects(nodes)

# Index both groups together in one vector store index.
index = VectorStoreIndex(nodes=base_nodes + objects)
LangchainOutputParser
to increase my chances of getting structured output. Originally I was relying on prompts to get structured output, which sometimes fails. Optional
# because sometimes the document simply does not have that information.
from typing import Optional

from llama_index.core.bridge.pydantic import Field, BaseModel


# Schema for the structured response. The class docstring and the Field
# descriptions are part of the schema handed to the LLM, so their wording is
# kept as-is; only the stray backslash/whitespace left by the old
# line-continuation inside the description strings has been removed.
class HeaderOutput(BaseModel):
    """Outputs header information containing the following categories: BillTo, BillToAddress, BillToNumber."""

    # Name of the customer being billed.
    BillTo: str = Field(..., description="This is the name of the customer.")
    # Billing address, with the customer's company address as the fallback.
    BillToAddress: str = Field(
        ...,
        description=(
            "This is the billing address of the customer. If cannot be found, "
            "extract from the company address of the customer."
        ),
    )
    # Customer's TIN. `Optional[str]` combined with `Field(...)` still marks
    # the field as required (it merely allows None as a value); default=None
    # is what actually lets the field be left out of the response.
    BillToNumber: Optional[str] = Field(
        default=None,
        description=(
            "This is the Tax Identification Number (TIN) of the customer. "
            'Response blank if not found or "NA"'
        ),
    )
# Short query that refers to the HeaderOutput schema by name instead of
# listing the individual fields.
new_header_query = "What is the HeaderOutput? Information with NA can be omitted."
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[37], line 3 1 new_header_query = """What is the HeaderOutput? Information with NA can be omitted.""" ----> 3 response_header = header_query_engine.query(new_header_query) File \[...]\.venv\lib\site-packages\llama_index\core\instrumentation\dispatcher.py:260, in Dispatcher.span.<locals>.wrapper(func, instance, args, kwargs) 252 self.span_enter( 253 id_=id_, 254 bound_args=bound_args, (...) 257 tags=tags, 258 ) 259 try: --> 260 result = func(*args, **kwargs) 261 except BaseException as e: 262 self.event(SpanDropEvent(span_id=id_, err_str=str(e))) File \[...]\.venv\lib\site-packages\llama_index\core\base\base_query_engine.py:52, in BaseQueryEngine.query(self, str_or_query_bundle) 50 if isinstance(str_or_query_bundle, str): 51 str_or_query_bundle = QueryBundle(str_or_query_bundle) ---> 52 query_result = self._query(str_or_query_bundle) 53 dispatcher.event( 54 QueryEndEvent(query=str_or_query_bundle, response=query_result) 55 ) ... 903 ) 904 else: 905 return [] ValueError: Expected at least one tool call, but got 0 tool calls.