```python
summary_extractor = SummaryExtractor(summaries=["prev", "self", "next"], llm=llm)
questions_answered_extractor = QuestionsAnsweredExtractor(
    questions=3, llm=llm, metadata_mode=MetadataMode.EMBED
)
title_extractor = TitleExtractor(llm=llm, nodes=5)
keyword_extractor = KeywordExtractor(llm=llm, keywords=10)

transformations = [
    node_parser,
    summary_extractor,
    questions_answered_extractor,
    title_extractor,
    keyword_extractor,
]

llama_logger = LlamaLogger()
service_context = ServiceContext.from_defaults(
    callback_manager=callback_manager,
    llm=llm,
    embed_model=embedding_model,
    node_parser=node_parser,
    llama_logger=llama_logger,
    transformations=transformations,
)
```
{ "page_label":"1", "file_name":"....pdf", "db_document_id":"...", "patient_id":"...", "conversation_id":"...", "section_summary":"The key topics and entities in this section are:\n\n1. Patient Information:\n- Name: ...", "questions_this_excerpt_can_answer":"1. What is the primary insurance provider for ...", "document_title":"Insurance Data for Susan Ardmore Underwood", "excerpt_keywords":"SS, Date of Birth, Phone, Address, Zip, City, Employer, ...", "_node_content":"...", "_node_type":"TextNode", "document_id":"...", "doc_id":"...", "ref_doc_id":"..." }
How do I use `excluded_llm_metadata_keys` to keep these keys out of the LLM prompt? Should I set it in `from_documents()`, or something else? I use `fetch_and_read_document` to read the documents and then build an index like this:

```python
llama_index_docs = []
index = VectorStoreIndex.from_documents(
    [],
    storage_context=storage_context,
    service_context=service_context,
    show_progress=True,
)

for doc in conversation.documents:
    try:
        llama_index_doc = fetch_and_read_document(doc)
        logger.info(f"Adding doc {conversation.documents.index(doc)+1} of {len(conversation.documents)} to index")
        for d in llama_index_doc:
            d.metadata['patient_id'] = str(doc.patient_id)
            d.metadata['conversation_id'] = str(doc.conversation_id)
            d.excluded_llm_metadata_keys = [
                "page_label",
                "file_name",
                "db_document_id",
                "patient_id",
                "conversation_id",
            ]
            logger.info(f"Inserting document {doc.id} into index")
            index.insert(d)
            llama_index_docs.append(d)
    except Exception as e:  # handler assumed; truncated in the original snippet
        logger.error(f"Failed to add document {doc.id}: {e}")
```
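(`fetch_and_read_document` itself isn't shown in this thread; here is a hypothetical sketch of a helper like it, assuming each `doc` exposes a local file path and a database id:)

```python
# Hypothetical sketch of a fetch_and_read_document-style helper; the real
# implementation isn't shown in this thread. `doc.local_path` and `doc.id`
# are assumed attribute names.
from typing import List

from llama_index import SimpleDirectoryReader
from llama_index.schema import Document as LlamaIndexDocument

def fetch_and_read_document(doc) -> List[LlamaIndexDocument]:
    # The default PDF reader yields one Document per page, with the
    # page_label/file_name metadata shown in the JSON above.
    documents = SimpleDirectoryReader(input_files=[doc.local_path]).load_data()
    for d in documents:
        d.metadata["db_document_id"] = str(doc.id)
        # The thread mentions the real helper also set the exclusion list.
        d.excluded_llm_metadata_keys = ["page_label", "file_name", "db_document_id"]
    return documents
```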
I also tried moving the `excluded_llm_metadata_keys` assignment from `fetch_and_read_document` to this section where I'm building the index, but this is also not working. Same result. Note that I set `metadata_mode=MetadataMode.EMBED` up in my service_context above.

To see what's being sent to the model, I log event payloads (skipping embeddings and agent steps), including each `CBEventType.LLM` event:

```python
if event_type != CBEventType.EMBEDDING and event_type != CBEventType.AGENT_STEP:
    logger.info(f"\n\nEvent type {event_type}")
    if payload is not None:
        logger.info(f"\nHas the following payload:\n\n{json.dumps(payload, default=custom_serializer)}")
```
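For completeness, that snippet references `custom_serializer` and runs inside a callback handler, neither of which is shown above; a minimal sketch of both, assuming llama_index 0.9.x import paths:

```python
# Sketch only: a minimal callback handler the filtering snippet above could
# live in, plus a stand-in custom_serializer. Import paths assume
# llama_index 0.9.x; the real handler in this thread isn't shown.
import json
import logging
from typing import Any, Dict, List, Optional

from llama_index.callbacks.base_handler import BaseCallbackHandler
from llama_index.callbacks.schema import CBEventType

logger = logging.getLogger(__name__)

def custom_serializer(obj: Any) -> str:
    # Fallback for payload objects json.dumps can't serialize natively.
    return str(obj)

class PayloadLoggingHandler(BaseCallbackHandler):
    def __init__(self) -> None:
        super().__init__(event_starts_to_ignore=[], event_ends_to_ignore=[])

    def on_event_start(
        self,
        event_type: CBEventType,
        payload: Optional[Dict[str, Any]] = None,
        event_id: str = "",
        parent_id: str = "",
        **kwargs: Any,
    ) -> str:
        if event_type != CBEventType.EMBEDDING and event_type != CBEventType.AGENT_STEP:
            logger.info(f"\n\nEvent type {event_type}")
            if payload is not None:
                logger.info(f"\nHas the following payload:\n\n{json.dumps(payload, default=custom_serializer)}")
        return event_id

    def on_event_end(
        self,
        event_type: CBEventType,
        payload: Optional[Dict[str, Any]] = None,
        event_id: str = "",
        **kwargs: Any,
    ) -> None:
        pass

    def start_trace(self, trace_id: Optional[str] = None) -> None:
        pass

    def end_trace(
        self,
        trace_id: Optional[str] = None,
        trace_map: Optional[Dict[str, List[str]]] = None,
    ) -> None:
        pass
```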
Here's a trace from one chat call; the annotated events are the ones whose payloads include the excluded metadata:

```
**********
Trace: chat
    |_CBEventType.AGENT_STEP -> 6.944333 seconds
      |_CBEventType.LLM -> 2.012103 seconds
      |_CBEventType.FUNCTION_CALL -> 4.133827 seconds
        |_CBEventType.QUERY -> 4.13283 seconds
          |_CBEventType.LLM -> 2.221637 seconds
          |_CBEventType.SUB_QUESTION -> 1.215887 seconds
            |_CBEventType.QUERY -> 1.214849 seconds
              |_CBEventType.RETRIEVE -> 0.350428 seconds      // [1] Includes it (attached)
                |_CBEventType.EMBEDDING -> 0.217844 seconds
              |_CBEventType.SYNTHESIZE -> 0.86362 seconds
                |_CBEventType.TEMPLATING -> 0.000141 seconds  // [2] Includes it (attached)
                |_CBEventType.LLM -> 0.842278 seconds         // [3] Includes it (attached)
          |_CBEventType.SYNTHESIZE -> 0.660717 seconds
            |_CBEventType.TEMPLATING -> 0.000109 seconds
            |_CBEventType.LLM -> 0.657789 seconds
      |_CBEventType.LLM -> 0.0 seconds
**********
```
The debug handler that prints these traces is set up like this:

```python
llama_debug = LlamaDebugHandler(print_trace_on_end=True)
callback_handlers.append(llama_debug)
callback_manager = CallbackManager(callback_handlers)

...

llama_logger = LlamaLogger()
service_context = ServiceContext.from_defaults(
    callback_manager=callback_manager,
    llm=llm,
    embed_model=embedding_model,
    node_parser=node_parser,
    llama_logger=llama_logger,
    transformations=transformations,
)
```
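As a sketch, the same `llama_debug` handler can also dump the payloads of the LLM events annotated in the trace above:

```python
# Sketch: LlamaDebugHandler records the events it traces, so the payloads
# for the annotated LLM events can be inspected after the fact.
from llama_index.callbacks import CBEventType

for start_event, end_event in llama_debug.get_event_pairs(CBEventType.LLM):
    print(start_event.payload)  # exactly what was sent to the LLM
```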
Another way to see exactly what the LLM receives is the `simple` global handler, which prints every LLM prompt and response:

```python
import llama_index

llama_index.set_global_handler("simple")
```
Here's a minimal REPL session with that handler; `file_name` is excluded and the prompt only contains the node text:

```
>>> import llama_index
>>> llama_index.set_global_handler("simple")
>>> from llama_index import Document, VectorStoreIndex
>>> document = Document(text='test', metadata={'file_name': 'fake.txt'})
>>> document.excluded_llm_metadata_keys = ['file_name']
>>> index = VectorStoreIndex.from_documents([document])
>>> query_engine = index.as_query_engine()
>>> response = query_engine.query("What is the file name?")
** Messages: **
system: You are an expert Q&A system that is trusted around the world.
Always answer the query using the provided context information, and not prior knowledge.
Some rules to follow:
1. Never directly reference the given context in your answer.
2. Avoid statements like 'Based on the context, ...' or 'The context information ...' or anything along those lines.
user: Context information is below.
---------------------
test
---------------------
Given the context information and not prior knowledge, answer the query.
Query: What is the file name?
Answer:
**************************************************
** Response: **
assistant: The file name is "test".
**************************************************
>>>
```
Note that the document here is created with `document = Document(text='test', metadata={'file_name': 'fake.txt'})`, like you / the Docs do.

My `fetch_and_read_document` function returns `List[LlamaIndexDocument]`, which is a list of Document objects (`from llama_index.schema import Document as LlamaIndexDocument`). When I call `fetch_and_read_document` and iterate over each Document, VS Code lets me access the `excluded_llm_metadata_keys` property, so I'd expect it to work? The only other major difference is that I'm creating the index first, and then inserting nodes afterwards:

```python
llama_index_docs = []
index = VectorStoreIndex.from_documents(
    [],
    storage_context=storage_context,
    service_context=service_context,
    show_progress=True,
)

for doc in conversation.documents:
    try:
        llama_index_doc = fetch_and_read_document(doc)
        logger.info(f"Adding doc {conversation.documents.index(doc)+1} of {len(conversation.documents)} to index")
        for d in llama_index_doc:
            d.metadata['patient_id'] = str(doc.patient_id)
            d.metadata['conversation_id'] = str(doc.conversation_id)
            d.excluded_llm_metadata_keys = [
                "page_label",
                "file_name",
                "db_document_id",
                "patient_id",
                "conversation_id",
            ]
            logger.info(f"Inserting document {doc.id} into index")
            index.insert(d)
            llama_index_docs.append(d)
    except Exception as e:  # handler assumed; truncated in the original snippet
        logger.error(f"Failed to add document {doc.id}: {e}")
```
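One way to check where the exclusion list is getting lost is to pull nodes back out of the docstore after inserting (a sketch; assumes the index keeps node content in its docstore, and vector stores that store nodes internally may leave it empty):

```python
# Debugging sketch: inspect the exclusion list and LLM-visible text each
# stored node actually carries after index.insert().
from llama_index.schema import MetadataMode

for node_id, node in index.docstore.docs.items():
    print(node_id, node.excluded_llm_metadata_keys)
    print(node.get_content(metadata_mode=MetadataMode.LLM)[:200])
```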
The same works with a real file, excluding every metadata key:

```
>>> import llama_index
>>> llama_index.set_global_handler("simple")
>>> from llama_index import SimpleDirectoryReader, VectorStoreIndex
>>> document = SimpleDirectoryReader("./docs/examples/data/paul_graham").load_data()[0]
>>> document.metadata
{'file_path': 'docs/examples/data/paul_graham/paul_graham_essay.txt', 'file_name': 'paul_graham_essay.txt', 'file_type': 'text/plain', 'file_size': 75042, 'creation_date': '2023-10-04', 'last_modified_date': '2023-10-04', 'last_accessed_date': '2023-12-13'}
>>> document.excluded_llm_metadata_keys = list(document.metadata.keys())
>>> index = VectorStoreIndex.from_documents([document])
>>> query_engine = index.as_query_engine()
>>> response = query_engine.query("What is the file name?")
```
The query there goes through `query_engine.query()` -- just like my sample code. I stepped through `fetch_and_read_document` and it successfully added metadata fields to the `excluded_llm_metadata_keys` property. I also deleted all of my nodes in my local DB, so there are only 6 documents and 6 nodes in total.

One suggested workaround: run the transformations through an `IngestionPipeline`, set the excluded keys on the resulting nodes, and only then build the index:

```python
from llama_index.ingestion import IngestionPipeline

pipeline = IngestionPipeline(transformations=[...])
nodes = pipeline.run(documents=documents)

for node in nodes:
    node.excluded_llm_metadata_keys = [...]

index = VectorStoreIndex(nodes, service_context=service_context, storage_context=storage_context)
```
Adapted to the document loop, that looks like:

```python
transformations = [
    node_parser,
    summary_extractor,
    ...
]

pipeline = IngestionPipeline(transformations=transformations)

for doc in conversation.documents:
    try:
        llama_index_doc = fetch_and_read_document(doc)
        logger.info(f"Adding doc {conversation.documents.index(doc)+1} of {len(conversation.documents)} to index")
        nodes = pipeline.run(
            documents=llama_index_doc,
            in_place=True,
            show_progress=True,
        )
        for node in nodes:
            node.metadata['patient_id'] = str(doc.patient_id)
            node.metadata['conversation_id'] = str(doc.conversation_id)
            node.excluded_llm_metadata_keys = ["page_label", ...]
            llama_index_docs.append(node)
        index.insert_nodes(nodes)
    except Exception as e:  # handler assumed; truncated in the original snippet
        logger.error(f"Failed to add document {doc.id}: {e}")
```
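A quick sanity check (sketch) before `insert_nodes`: confirm the LLM-visible rendering of each node really drops the excluded keys:

```python
# Sanity-check sketch: the LLM-mode rendering of each node should no
# longer contain the excluded metadata keys.
from llama_index.schema import MetadataMode

for node in nodes:
    llm_text = node.get_content(metadata_mode=MetadataMode.LLM)
    assert "patient_id:" not in llm_text, node.node_id
```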