def get_text_splits_from_document(
document: BaseNode,
text_splitter: TextSplitter,
include_metadata: bool = True,
) -> List[TextSplit]:
"""Break the document into chunks with additional info."""
# TODO: clean up since this only exists due to the diff w LangChain's TextSplitter
if isinstance(text_splitter, TokenTextSplitter):
# use this to extract extra information about the chunks
text_splits = text_splitter.split_text_with_overlaps(
document.get_content(metadata_mode=MetadataMode.NONE),
metadata_str=document.get_metadata_str() if include_metadata else None,
)
else:
text_chunks = text_splitter.split_text(
document.get_content(),
)
text_splits = [TextSplit(text_chunk=text_chunk) for text_chunk in text_chunks]
return text_splits
this logic is no good, if you actually implement it where you return a [str], it will work, but if you implement your textSplitter to return [TextSplit]'s so that each split can potentially contain metadata, like what the header stack was for this chunk of text, you can't.
Id' probably modify
text_chunks = text_splitter.split_text(
document.get_content(),
)
text_splits = [TextSplit(text_chunk=text_chunk) for text_chunk in text_chunks]
so that it doesn't assume it's a string, but actually attempts to handle strings, documents or TextSplits being returned