My code is as follows. The document that triggers the error has a text of length 63618.
# Ingestion script: load JSON documents from ./cleaned_json/, chunk them,
# enrich each chunk with LLM-generated metadata, embed, and store the result
# in `vector_store`. `llm` and `vector_store` are defined elsewhere.

# Metadata extractors; each one calls the shared `llm` with 8 workers.
title_extractor = TitleExtractor(llm=llm, num_workers=8)
qa_extractor = QuestionsAnsweredExtractor(llm=llm, questions=3, num_workers=8)
summary_extractor = SummaryExtractor(
    summaries=["prev", "self", "next"], llm=llm, num_workers=8
)
keyword_extractor = KeywordExtractor(llm=llm, num_workers=8)

sentence_splitter = SentenceSplitter(chunk_size=2048, chunk_overlap=512)
huggingface_embedding = HuggingFaceEmbedding(
    model_name="../../huggingface_models/bge-large-en-v1.5/"
)

# Collect documents from every file under ./cleaned_json/, including
# subdirectories. The reader is built once and reused for all files.
json_reader = JSONReader(levels_back=0)
documents = []
for root, _folders, files in os.walk("./cleaned_json/"):
    for filename in files:
        filepath = os.path.join(root, filename)
        documents.extend(json_reader.load_data(input_file=filepath))

pipeline = IngestionPipeline(
    transformations=[
        # Split FIRST. Transformations are applied in list order, so with the
        # splitter at the front the LLM extractors receive chunk-sized nodes
        # rather than entire (possibly very long) documents, and the splitter
        # does not have to fit all the extracted metadata into each chunk.
        sentence_splitter,
        title_extractor,
        qa_extractor,
        summary_extractor,
        keyword_extractor,
        # Embed last so the vectors are computed from the final chunk nodes.
        huggingface_embedding,
    ],
    vector_store=vector_store,
)

# NOTE(review): `cache_collection` looks like it is meant to be a collection
# name inside the pipeline cache, not a filesystem path -- confirm that the
# "./pipeline_storage" value is intentional.
pipeline.run(
    documents=documents,
    show_progress=True,
    cache_collection="./pipeline_storage",
)
pipeline.persist("./pipeline_storage_persist")