node.extra_info returns None even when there's source text.

>>> from llama_index import Document, GPTVectorStoreIndex
>>> doc = Document("this is some text", extra_info={'test_key': 'test_val'})
>>> index = GPTVectorStoreIndex.from_documents([doc])
>>> response = index.as_query_engine().query('hello world')
>>> response.source_nodes[0].node.extra_info
{'test_key': 'test_val'}
>>>
from llama_index import SimpleDirectoryReader

filename_fn = lambda filename: {'file_name': filename}

# automatically sets the extra_info of each document according to filename_fn
documents = SimpleDirectoryReader('./data', file_metadata=filename_fn).load_data()
>>> from llama_index import SimpleDirectoryReader
>>> filename_fn = lambda filename: {'file_name': filename}
>>> documents = SimpleDirectoryReader('./paul_graham', file_metadata=filename_fn).load_data()
>>> documents[0].extra_info
{'file_name': 'paul_graham/paul_graham_essay.txt'}
>>>
# Read in Documents
filename_fn = lambda filename: {'file_name': filename}
documents = []
print("Reading documents.")
for file_path in file_dirs:
    documents.extend(SimpleDirectoryReader(
        input_dir=file_path,
        file_metadata=filename_fn,
        recursive=True).load_data()
    )

print("Building index.")
index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context)
evaluator = ResponseEvaluator(service_context=service_context)
response = query_engine.query(query)
return {
    "query": query,
    "response": str(response),
    "source_documents": [x.node.extra_info for x in response.source_nodes],
    "source_text": self._source_text(response.source_nodes),
    "evaluation": evaluator.evaluate_source_nodes(response)
}
[None, None, ...]
from llama_index import SimpleDirectoryReader, GPTVectorStoreIndex

filename_fn = lambda filename: {'file_name': filename}
documents = SimpleDirectoryReader(
    input_dir="./paul_graham",
    file_metadata=filename_fn,
    recursive=True).load_data()

index = GPTVectorStoreIndex.from_documents(documents)

response = index.as_query_engine().query("what did the author do growing up?")
print(str(response))
print([x.node.extra_info for x in response.source_nodes])
Growing up, the author wrote short stories, programmed on an IBM 1401, built a microcomputer with a Heathkit, wrote simple games and a word processor on a TRS-80, and studied philosophy in college.
[{'file_name': 'paul_graham/paul_graham_essay.txt'}, {'file_name': 'paul_graham/paul_graham_essay.txt'}]
index.storage_context.persist(persist_dir='./nodes_index')

from llama_index import StorageContext, load_index_from_storage

index = load_index_from_storage(StorageContext.from_defaults(persist_dir="./nodes_index"))
python -m venv venv
source venv/bin/activate
pip install llama-index
index_dir = os.path.join(self.indexes_dir, index_id)

# Load index from requested docs
storage_context = StorageContext.from_defaults(persist_dir=index_dir)
service_context = self.create_service_context(**kwargs)
index = load_index_from_storage(
    storage_context=storage_context,
    service_context=service_context,
)
query_engine = index.as_query_engine()

responses = [self._query(x, query_engine, service_context) for x in queries]
nodes = index.docstore.docs

This will get you every node in the index (it's a dict of node_id to Node).

None
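To print the metadata each stored node is actually carrying (a minimal sketch, assuming the index variable from the snippets above):

# index.docstore.docs maps node_id -> Node; extra_info holds the metadata set at load time
for node_id, node in index.docstore.docs.items():
    print(node_id, node.extra_info)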
Is there a method to look at the files to see if the data is there but not being ingested properly (vs it not being stored in the first place)?

When you call from_documents(), are you 100% sure each document has an extra_info field filled in?

print("Printing documents...")
pprint(documents)

print("Building index.")
index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context)

print("Printing Nodes")
pprint(index.docstore.docs)
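On inspecting the persisted files themselves, one option (a sketch, assuming the ./nodes_index persist directory used earlier) is to load only the storage layer and print what was actually written to disk, without rebuilding the index:

from llama_index import StorageContext

# Loads the persisted docstore/index/vector store from disk; no LLM or embedding calls
storage_context = StorageContext.from_defaults(persist_dir="./nodes_index")
for node_id, node in storage_context.docstore.docs.items():
    # If extra_info is None here, the metadata is already missing from the stored nodes
    print(node_id, node.extra_info)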
pip show llama-index
You can check that:

>>> from llama_index import Document, GPTVectorStoreIndex
>>> documents = [Document('text', extra_info={'test': 'val'})]
>>> documents[0]
Document(text='text', doc_id='03b4c6e9-2bd2-4687-8980-f388eeebd6d7', embedding=None, doc_hash='1d3f05b1647ad55d6c09b356fe5d1fe670be262d5c3ea0ccda070e365a94809b', extra_info={'test': 'val'})
>>> index = GPTVectorStoreIndex.from_documents(documents)
>>> print(index.docstore.docs)
{'faf195d4-1295-425b-acb9-4289dcbc1c33': Node(text='text', doc_id='faf195d4-1295-425b-acb9-4289dcbc1c33', embedding=None, doc_hash='1d3f05b1647ad55d6c09b356fe5d1fe670be262d5c3ea0ccda070e365a94809b', extra_info={'test': 'val'}, node_info={'start': 0, 'end': 4, '_node_type': <NodeType.TEXT: '1'>}, relationships={<DocumentRelationship.SOURCE: '1'>: '03b4c6e9-2bd2-4687-8980-f388eeebd6d7'})}
>>>
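If a check like the pprint(documents) above shows documents coming back with extra_info=None, one hedged workaround (a sketch; the 'source' key and its value are placeholders, not anything the library sets) is to attach metadata manually before building the index:

# Hypothetical fallback: fill in metadata for any document that is missing it,
# before calling GPTVectorStoreIndex.from_documents(documents)
for doc in documents:
    if not doc.extra_info:
        doc.extra_info = {'source': 'unknown'}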