I implemented LlamaParse like this, but for some reason it always re-parses the document. I would have expected each document to be parsed only once. @Logan M, can you tell me what I am doing wrong here? It tries to re-parse even before the 48-hour cutoff.
def get_file_documents(config: FileLoaderConfig):
    """Download the listed files and pass each one to the LlamaParse parser.

    The file list comes from ``fetch_file_list()``. When
    ``config.use_llama_parse`` is truthy, every listed file that is not yet
    present in ``config.data_dir`` is downloaded, and each local file is then
    handed to the parser together with per-file metadata.

    Args:
        config: Loader settings; ``use_llama_parse`` and ``data_dir`` are
            read here.
    """
    parser = llama_parse_parser()
    files_info = fetch_file_list()
    logger.info(
        f"List of files ready for download. Number of files to download: {len(files_info)}"
    )
    if config.use_llama_parse:
        # Step 1: make sure every listed file exists locally.
        file_paths = []
        for file_info in files_info:
            resource_url = file_info["resourceURL"]
            file_name = file_info["fileName"]
            file_path = os.path.join(config.data_dir, file_name)
            # Only download files that are not already on disk.
            if not os.path.exists(file_path):
                download_file(resource_url, file_path)
                logger.info(
                    f"Successfully downloaded file: {file_name} and saved it on the server."
                )
            # Files that were already present are parsed as well.
            file_paths.append(file_path)

        # Step 2: parse every local file.
        documents = []
        for file_number, file_path in enumerate(file_paths, 1):
            file_name = os.path.basename(file_path)
            # NOTE(review): the same file is handed to the parser twice per
            # iteration (get_json_result here, load_data below). Presumably
            # each call submits its own parse job -- confirm against the
            # LlamaParse documentation; if so, this doubles the parsing work
            # for every file on every run.
            json_representation = parser.get_json_result(file_path)
            document = parser.load_data(
                file_path=file_path,
                extra_info={
                    "file_name": file_name,
                    "file_number": file_number,
                    "pages": json_representation[0]["pages"],
                },
            )
            # NOTE(review): the load_data result is appended as-is; if it
            # returns a list, `documents` becomes a list of lists -- confirm
            # whether extend() is intended.
            documents.append(document)
        # NOTE(review): the visible snippet ends here without returning
        # `documents`; confirm against the rest of the function.