So I guess the real question is: why does the text_splitter behave this way, and how can I index metadata in my nodes so that it is searchable but not part of the segmentation process? My current workaround looks like this:
# Build one Document per raw input and chunk it immediately, so each batch of
# nodes can be tagged with the metadata of the document it came from.
for idx, raw_document in enumerate(documents):
    turns = []
    raw_document = raw_document['raw_doc']
    ...
    # Core metadata: keep only keys without a double underscore
    # (keys containing '__' are treated as internal/bookkeeping fields).
    metadata_core = {k: v for k, v in raw_document.items() if '__' not in k}
    excluded_keys = list(metadata_core.keys())
    # NOTE: metadata is deliberately NOT passed to the Document here, so the
    # node parser segments on the text alone; it is attached to the nodes
    # after chunking (below).
    document = Document(
        text=conversation,
        # NOTE(review): 'metadata_seperator' (sic) is the field name
        # llama_index historically uses — confirm spelling against the
        # installed version before "fixing" it.
        metadata_seperator="::",
        metadata_template="{key}=>{value}",
        text_template="Metadata: {metadata_str}\n-----\nContent: {content}",
    )
    formatted_documents.append(document)
    # Process document by document so the correct metadata
    # remains associated with the resulting nodes.
    raw_nodes = node_parser.get_nodes_from_documents([document])
    # Attach the custom metadata post-chunking and exclude it from both the
    # LLM view and the embedding view, so it is stored/filterable but never
    # re-enters the text the splitter or embedder sees.
    for node in raw_nodes:
        node.metadata.update(metadata_core)
        node.excluded_llm_metadata_keys = excluded_keys
        node.excluded_embed_metadata_keys = excluded_keys
        formatted_nodes.append(node)