Find answers from the community

Updated last year

I am able to get the knowledge graph

I am able to get the knowledge graph page citations, but I can't get the document name to print:

# Build one Document per (chunk, page_number) pair, tracking a running line
# number so each chunk records where it starts in the source text.
line_number = 1
documents = []
for doc_idx, (chunk, page_number) in enumerate(chunks):
    # A chunk with no newline still occupies one line.
    line_count_in_chunk = chunk.count('\n') + 1
    metadata = {
        "source_doc_idx": doc_idx,
        "filename": "1.2.2.2 Customer Contract - Stockwood Dr - Woodstock - GA.pdf",
        "page_number": page_number,
        "document_title": "Customer Contract - Stockwood Dr - Woodstock - GA",
        "line_count": line_count_in_chunk,
        "starting_line_number": line_number
    }
    documents.append(Document(text=chunk, metadata=metadata))
    line_number += line_count_in_chunk

# Dump the metadata of every document for inspection.
for doc in documents:
    print(f"Document Metadata: {doc.metadata}")

service_context = ServiceContext.from_defaults(
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0)
)

file_path = "./1.2.2.2 Customer Contract - Stockwood Dr - Woodstock - GA.pdf"
file_name = os.path.basename(file_path)

try:
    # NOTE(review): `nodes` is not defined in this snippet (the list built
    # above is `documents`) -- confirm it exists in the surrounding session.
    index = VectorStoreIndex.from_documents(nodes, service_context=service_context)
except Exception as e:
    # The error is only printed; if indexing failed, `index` is not bound by
    # this snippet and the next statement depends on it.
    print(f"Error: {e}")

query_engine = CitationQueryEngine.from_args(
    index,
    similarity_top_k=3,
    citation_chunk_size=512,
)

response = query_engine.query("what is the purchase commitment?")
print("Query Response:", response)

# One graph node per retrieved source node, keyed by (file name, page).
G = nx.Graph()
for i, source_node in enumerate(response.source_nodes):
    node_content = source_node.node.get_text()
    # NOTE(review): `i` is the position in the retrieval results, yet it is
    # used to index `documents`; the page comes from the i-th ingested chunk,
    # not from the retrieved node's own metadata.
    citation_page = documents[i].metadata['page_number']
    G.add_node((file_name, citation_page), content=node_content)
L
t
8 comments
the file name should be in the metadata no?

Plain Text
# Build the graph using the metadata carried by each retrieved source node.
G = nx.Graph()
for source_node in response.source_nodes:
    node_content = source_node.node.get_text()
    # Page number and filename are read from the retrieved node's own metadata.
    citation_page = source_node.node.metadata['page_number']
    file_name = source_node.node.metadata['filename']
    # The graph node key is the (filename, page) pair; the node text is stored
    # as the `content` attribute.
    G.add_node((file_name, citation_page), content=node_content)
I'm having another go at this:
First, do I need to define this class?

class Document:
    """Minimal container pairing a text chunk with its metadata dict.

    NOTE(review): this hand-written class takes the name `Document`, and its
    `get_content()` accepts no `metadata_mode` keyword -- which matches the
    "unexpected keyword argument 'metadata_mode'" error reported further down
    in this thread. Presumably the library's own `Document` should be used
    instead; confirm before relying on this class.
    """

    def __init__(self, text, metadata):
        """Store the chunk text and its metadata mapping."""
        self.text = text
        self.metadata = metadata

    def get_content(self):
        """Return the raw text of the document."""
        return self.text

    def get_metadata(self):
        """Return the metadata mapping as stored (not a copy)."""
        return self.metadata

    def get_metadata_str(self):
        """Return the metadata serialised as a JSON string."""
        return json.dumps(self.metadata)

    def get_doc_id(self):
        """Return the metadata's 'doc_id' entry, or None when absent."""
        return self.metadata.get('doc_id', None)

    def hash(self):
        """Return the SHA-256 hex digest of the text (default str encoding)."""
        return hashlib.sha256(self.text.encode()).hexdigest()
and then once I do that I run this:
# Create Document objects with extended metadata
documents = []
for doc_idx, (chunk, page_number) in enumerate(chunks):
    metadata = {
        "source_doc_idx": doc_idx,
        "filename": "1.2.2.2 Customer Contract - Stockwood Dr - Woodstock - GA.pdf",
        "page_number": page_number,
        "document_title": "Customer Contract - Stockwood Dr - Woodstock - GA",
        "line_count": chunk.count('\n') + 1,
        "starting_line_number": doc_idx * 10 + 1
    }
    documents.append(Document(text=chunk, metadata=metadata))

# Initialize VectorStoreIndex
try:
    index = VectorStoreIndex.from_documents(documents, service_context=service_context)
except Exception as e:
    # The failure is only reported; the statements below still run and rely
    # on whatever `index` is already bound to.
    print(f"Error: {e}")

# Initialize the CitationQueryEngine
query_engine = CitationQueryEngine.from_args(
    index,
    similarity_top_k=3,
    citation_chunk_size=512,
)

# Query and Retrieve Information
response = query_engine.query("who is party to the agreement")
print("Query Response:", response)

# Create Knowledge Graph Nodes
G = nx.Graph()

# Add nodes to the graph
for i, source_node in enumerate(response.source_nodes):
    node_content = source_node.node.get_content()  # Remove the metadata_mode keyword argument
    metadata = source_node.node.metadata
    citation = metadata.get('page_number', 'Unknown')
    file_name = metadata.get('filename', 'Unknown')
    title = metadata.get('document_title', 'Unknown')
    # NOTE(review): the graph node is keyed by page number alone, so two
    # source nodes from the same page share one graph node.
    G.add_node(citation, content=node_content, title=title)

    # Only apply the offset when a numeric starting line is present; adding 2
    # to the 'Unknown' fallback string would raise TypeError.
    start_line = metadata.get('starting_line_number')
    if isinstance(start_line, int):
        display_line = start_line + 2  # Assuming line offset is 2
    else:
        display_line = 'Unknown'

    # Nicely formatted metadata output
    print(f"--- Citation for Source Node {i + 1} ---")
    print(f"Filename: {file_name}")
    print(f"Document Title: {title}")
    print(f"Page Number: {citation}")
    print(f"Line Number: {display_line}")
I get the correct answer:

but I get this annoying line at the very top:




Error: Document.get_content() got an unexpected keyword argument 'metadata_mode'
Query Response: The parties to the agreement are Redaptive Services XIV, LLC and AT&T Corp [3].
--- Citation for Source Node 1 ---
Filename: 1.2.2.2 Customer Contract - Stockwood Dr - Woodstock - GA.pdf
Document Title: Order for Saved Utility Service and Site Information between Redaptive Services XIV, LLC and AT&T Corp.
Page Number: 3
Line Number: 68
--- Citation for Source Node 2 ---
Filename: 1.2.2.2 Customer Contract - Stockwood Dr - Woodstock - GA.pdf
Document Title: Order for Saved Utility Service and Site Information between Redaptive Services XIV, LLC and AT&T Corp.
Page Number: 3
Line Number: 58
--- Citation for Source Node 3 ---
Filename: 1.2.2.2 Customer Contract - Stockwood Dr - Woodstock - GA.pdf
Document Title: Order for Saved Utility Service and Site Information between Redaptive Services XIV, LLC and AT&T Corp.
Page Number: 1
Line Number: 3
Hi @Logan M 🙂

any idea why I get this error?

Error: Document.get_content() got an unexpected keyword argument 'metadata_mode'
Shouldn't that be lowercase, i.e. document.get_content()?

That argument definitely exists on that method though
https://github.com/run-llama/llama_index/blob/06127ec09966e8df2fcd4f03a1b53ec566b4a43d/llama_index/schema.py#L157
I think I squashed that bug.
Add a reply
Sign up and join the conversation on Discord