@Logan M Thank you for the help. Overriding `_parse_nodes(...)` works for me.
Here is the code:
class DirectusSplitter(MetadataAwareTextSplitter):
    """Text splitter that attaches extra, per-chunk metadata to each node.

    ``split_text_metadata_aware`` returns a ``(chunks, additional_metadatas)``
    pair instead of a plain list of strings, and ``_parse_nodes`` is
    overridden to merge each metadata dict into the node built from the
    corresponding chunk.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @classmethod
    def class_name(cls) -> str:
        """Return the identifier of this splitter class."""
        return "DirectusSplitter"

    # Implement the abstract method to make python runtime happy
    def split_text(self, text: str) -> List[str]:
        """Placeholder implementation; always returns an empty list."""
        return []

    def split_text_metadata_aware(self, text: str, metadata_str: str):
        """Split ``text`` into chunks, with one metadata dict per chunk.

        NOTE: unlike the ``List[str]`` a plain text splitter would return,
        this returns a 2-tuple; ``_parse_nodes`` below is overridden to
        consume that shape.

        Args:
            text: The text to split.
            metadata_str: The metadata string of the node being split.

        Returns:
            A ``(chunks, additional_metadatas)`` tuple of two parallel lists:
            the chunk strings and the dict to merge into each chunk's node
            metadata.
        """
        chunks = []
        additional_metadatas = []
        # ... split text into chunks and populate additional_metadatas
        return chunks, additional_metadatas

    def _parse_nodes(
        self,
        nodes: Sequence[BaseNode],
        show_progress: bool = False,
        **kwargs: Any,
    ) -> List[BaseNode]:
        """Split every input node and merge per-chunk metadata into the results.

        Args:
            nodes: The nodes whose content should be split.
            show_progress: Forwarded to ``get_tqdm_iterable``.
            **kwargs: Accepted for signature compatibility; unused here.

        Returns:
            All nodes built from the chunks of every input node, in order.

        Raises:
            ValueError: If the number of metadata dicts returned by
                ``split_text_metadata_aware`` differs from the number of
                nodes built from its chunks.
        """
        all_nodes: List[BaseNode] = []
        nodes_with_progress = get_tqdm_iterable(
            nodes, show_progress, "Parsing nodes")
        for node in nodes_with_progress:
            metadata_str = self._get_metadata_str(node)
            chunks, additional_metadatas = self.split_text_metadata_aware(
                node.get_content(metadata_mode=MetadataMode.NONE),
                metadata_str=metadata_str,
            )
            # Use a distinct name so the ``nodes`` parameter is not rebound
            # while it is still being iterated.
            chunk_nodes = build_nodes_from_splits(chunks, node)
            # Fail with a clear message instead of an IndexError (too few
            # metadata dicts) or silently ignoring surplus ones.
            if len(chunk_nodes) != len(additional_metadatas):
                raise ValueError(
                    f"split_text_metadata_aware returned "
                    f"{len(additional_metadatas)} metadata entries for "
                    f"{len(chunk_nodes)} nodes"
                )
            for chunk_node, extra_metadata in zip(chunk_nodes, additional_metadatas):
                chunk_node.metadata.update(extra_metadata)
            all_nodes.extend(chunk_nodes)
        return all_nodes