# The running line-number counter is initialized here:
# Running 1-based counter: the line number at which the next chunk starts,
# accumulated across all chunks in iteration order.
line_number = 1

# Build one Document per (chunk text, page number) pair found in `chunks`,
# attaching positional metadata to each.
# NOTE(review): `chunks` and `Document` come from outside this section --
# confirm their definitions against the rest of the file.
documents = []
for ordinal, (chunk_text, page_no) in enumerate(chunks, start=1):
    # `ordinal` is the 1-based position used in some messages; `chunk_index`
    # is the 0-based position stored in the metadata.
    chunk_index = ordinal - 1
    print(f"Debug: Creating Document Object {ordinal} for Page {page_no}")

    # A chunk's line count is its number of newline characters plus one.
    n_lines = chunk_text.count('\n') + 1
    print(f"Debug: Chunk {chunk_index}, Page {page_no}, Line Count {n_lines}")
    # Only the first 50 characters are shown to keep the output short.
    print("Debug: Chunk Content:", chunk_text[:50])

    # Filename and title are hard-coded for this one contract PDF.
    doc_metadata = dict(
        source_doc_idx=chunk_index,
        filename="1.2.2.2 Customer Contract - Stockwood Dr - Woodstock - GA.pdf",
        page_number=page_no,
        document_title="Customer Contract - Stockwood Dr - Woodstock - GA",
        line_count=n_lines,
        starting_line_number=line_number,
    )
    print(f"Debug: Adding Metadata for Document Object {ordinal} for Page {page_no}")
    print(f"Debug: Metadata: {doc_metadata}")

    new_document = Document(text=chunk_text, metadata=doc_metadata)
    documents.append(new_document)

    # Advance the counter so the next chunk starts where this one ended.
    line_number = line_number + n_lines

# Final pass: dump the metadata of every Document that was built.
for built_doc in documents:
    print(f"Document Metadata: {built_doc.metadata}")