class MyJSONReader(BaseReader):
    """Reader that converts a JSON file of sentence records into Documents.

    The file must contain a JSON object mapping a uuid to a record. Each
    record must provide ``english_text`` (used as the document text) plus
    every key listed in ``_RECORD_METADATA_KEYS`` (copied into metadata).
    """

    # Record fields copied verbatim into each document's metadata.
    _RECORD_METADATA_KEYS = (
        "video_name",
        "video_path",
        "original_text",
        "length_characters",
        "original_lang",
        "video_section",
    )

    # Metadata keys passed as both excluded_embed_metadata_keys and
    # excluded_llm_metadata_keys.
    # NOTE(review): "file_path" is excluded while "video_path" is not;
    # confirm this is intentional (file_path is not set by this reader).
    _EXCLUDED_METADATA_KEYS = (
        "uuid",
        "video_name",
        "file_path",
        "original_text",
        "length_characters",
        "original_lang",
        "video_section",
    )

    def load_data(self, file, extra_info=None):
        """Load ``file`` and return one Document per JSON entry.

        Args:
            file: Path of the JSON file to read.
            extra_info: Optional base metadata. It is copied, never mutated,
                and may be omitted.

        Returns:
            A list of Documents, in the order the entries appear in the
            file. An empty JSON object yields an empty list.

        Raises:
            KeyError: If a record lacks ``english_text`` or any key in
                ``_RECORD_METADATA_KEYS``.
        """
        import logging

        logger = logging.getLogger(__name__)

        # Explicit UTF-8: records carry original-language text, so the
        # platform default encoding must not be relied on.
        with open(file, "r", encoding="utf-8") as f:
            json_data = json.load(f)

        # Copy so the caller's dict is left untouched; tolerate None.
        base_info = dict(extra_info or {})

        documents = []
        for uuid, row in json_data.items():
            text = row["english_text"]
            # Fresh dict per entry so documents never share metadata state.
            metadata = dict(base_info)
            metadata["uuid"] = uuid
            for key in self._RECORD_METADATA_KEYS:
                metadata[key] = row[key]
            logger.debug("extra_info2->%s", metadata)
            documents.append(
                Document(
                    text=text,
                    extra_info=metadata,
                    excluded_embed_metadata_keys=list(self._EXCLUDED_METADATA_KEYS),
                    excluded_llm_metadata_keys=list(self._EXCLUDED_METADATA_KEYS),
                )
            )
        return documents


# Read the sentences directory, handing every ".json" file to MyJSONReader.
reader = SimpleDirectoryReader(
    input_dir="/home/david/weaviate-tests/weaviate-videorack/4-llama-index-contained/sentences",
    recursive=True,
    file_extractor={".json": MyJSONReader()},
)
documents = reader.load_data()
# Fragment of an alternative `return` for a reader method. The enclosing
# function and the definition of `text` are not visible in this snippet.
# `metadata` is bound to a lambda that ignores its argument and returns
# {"key": "value"}; the lambda object itself (not its result) is what gets
# passed as `metadata=` below.
# NOTE(review): Document presumably expects a dict for `metadata` - confirm;
# if so, pass a dict directly (or call the lambda) instead of the function.
metadata = lambda extra_info:{"key":"value"} return [Document(text=text, metadata=metadata , excluded_embed_metadata_keys=['uuid','video_name','file_path','original_text','length_characters','original_lang','video_section'] , excluded_llm_metadata_keys=['uuid','video_name','file_path','original_text','length_characters','original_lang','video_section']), ]