The problem is that SimpleDirectoryReader and all of the PDF data loaders on LlamaHub ignore non-text elements such as hyperlinks. To work around this, I had to implement my own PDFReader, as follows:
def load_data(self, file, extra_info=None):
    """Extract the text of a PDF with hyperlink targets written inline.

    Plain text extraction drops the URI behind every hyperlink. To keep
    them, each URI link is written onto its page as ``[anchor text]: uri``
    next to the link rectangle *before* the page text is extracted, so the
    target shows up in the returned text. The edits are made on the
    in-memory document only; the file on disk is never saved.

    Args:
        file: Path of the PDF, passed straight to ``fitz.open``.
        extra_info: Optional metadata forwarded unchanged to the
            resulting ``Document``.

    Returns:
        A single-element list holding one ``Document`` whose text is the
        concatenation of every page's text.
    """
    # Margin added on every side of the link rectangle when reading the
    # anchor text, so glyphs slightly outside the clickable area are kept.
    padding = 15
    page_texts = []

    # The context manager closes the document even if a page fails.
    with fitz.open(file) as doc:
        for page in doc:
            links = page.get_links()
            logger.debug("Links: %s", links)

            for link in links:
                uri = link.get("uri")
                if not uri:
                    # Internal jumps (goto / named links) have no "uri"
                    # entry; there is no external target to record.
                    continue

                link_rect = link["from"]
                link_text = page.get_textbox(
                    link_rect + (-padding, -padding, padding, padding)
                )
                # Fall back to a generic label when no anchor text is found.
                label = link_text if link_text.strip() else "Link"
                annotation_and_link = f"[{label}]: {uri}"

                # insert_text expects the bottom-left point of the first
                # character: (x0, y1) of the link rectangle. The previous
                # (x0, x1) pair used an x coordinate as the y position.
                page.insert_text(
                    (link_rect[0], link_rect[3]),
                    annotation_and_link,
                )
                logger.debug("hyperlink found: %s", annotation_and_link)

            page_texts.append(page.get_text())

    return [
        Document(
            text="".join(page_texts),
            extra_info=extra_info,
        )
    ]