# Load the documents found under ./data; `metadatas` (defined elsewhere) is
# handed to the reader as its per-file metadata hook.
documents = SimpleDirectoryReader(
    "./data",
    file_metadata=metadatas,
).load_data()

# Use each file's base name as the document id so the id does not depend on
# the directory part of the "file_name" metadata entry.
# NOTE(review): this requires `metadatas` to produce a "file_name" key, and
# documents sharing a file name will share an id -- confirm that is intended.
for doc in documents:
    doc.doc_id = os.path.basename(doc.metadata["file_name"])

print(documents)

# Pipeline: split into token chunks, embed them, and track documents in a
# docstore alongside an in-memory vector store.
# NOTE(review): chunk_size=5 is very small; metadata-aware splitting may
# reject it when the metadata string is longer than the chunk -- confirm.
new_pipeline = IngestionPipeline(
    transformations=[
        TokenTextSplitter(chunk_size=5, chunk_overlap=0),
        HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5"),
    ],
    docstore=SimpleDocumentStore(),
    vector_store=SimpleVectorStore(),
)

# `documents` must be passed by keyword: the first positional parameter of
# IngestionPipeline.run() is `show_progress`, so a positional list would be
# treated as that flag and the pipeline would run with no input documents.
nodes = new_pipeline.run(documents=documents)

# Save the pipeline state to disk so a later run can reload it.
new_pipeline.persist("./persistence/pipeline")