# Natural-language guidance passed to LlamaParse describing which parts of a
# scientific-article PDF should be kept and which should be dropped.
_PARSING_INSTRUCTIONS = """
The provided pdf document is a multi page scientific article.
The following instructions should be followed:
- Include the journal title, journal name, and authors ONLY at the beginning of the document as they appear on the first page.
- Exclude repeated occurrences of the journal title, journal name, and authors on subsequent pages (e.g., in headers or footers).
- Preserve the logical flow of the document's main content without splitting paragraphs: focus on maintaining text continuity and readability.
- Exclude non-essential elements such as: page titles, page number, headers and footers.
- Do not return figures, tables, acknowledgments, funding information and references
- Do not return any non-ASCII or control characters, publisher details, download information, copyright indications.
- I repeat, do not return References or Bibliography sections, as they are not part of the main content."""


def parse_pdf(pdf_file, output_file):
    """Parse one PDF to markdown with LlamaParse and pickle the result.

    The PDF is loaded through ``SimpleDirectoryReader`` with a ``LlamaParse``
    instance registered as the extractor for ``.pdf`` files. The parser is
    configured with the module's parsing instructions, verbose output and
    English as the document language.

    Args:
        pdf_file: Path of the PDF to parse.
        output_file: Path of the pickle file that receives the parsed data.
            It is only written when parsing returned something.

    Returns:
        The value returned by ``SimpleDirectoryReader.load_data()``, or
        ``None`` when that value is empty/falsy (an error line is printed and
        nothing is written to ``output_file`` in that case).
    """
    # Build a single parser. Previously a second LlamaParse(...) call replaced
    # the first one, which silently discarded the parsing instructions.
    parser = LlamaParse(
        result_type="markdown",
        parsing_instruction=_PARSING_INSTRUCTIONS,
        verbose=True,
        language="en",
    )

    md_data = SimpleDirectoryReader(
        input_files=[pdf_file],
        required_exts=[".pdf"],
        encoding="utf-8",
        file_extractor={".pdf": parser},
    ).load_data()

    # Nothing came back from the reader: report and skip without touching
    # output_file, so no empty pickle is left behind.
    if not md_data:
        print(f"Error: No data returned when parsing the file '{pdf_file}'. Skipping this file.")
        return None

    # Persist the parsed data only after a successful parse.
    with open(output_file, 'wb') as f:
        pickle.dump(md_data, f)
    return md_data