Find answers from the community

Updated last year

How to combine a metadata extractor with a Hugging Face LLM

How can I combine a metadata_extractor with a Hugging Face LLM for the embedding and summary-extraction process?
I ran this, but it does not work well:

# Embedding model: a Hugging Face sentence-transformers model wrapped in
# LangchainEmbedding.
# NOTE(review): LangchainEmbedding, HuggingFaceEmbeddings and ServiceContext
# are not imported anywhere in this snippet - presumably imported earlier;
# confirm before running.
embed_model = LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
)

from llama_index.node_parser import SimpleNodeParser
from llama_index.node_parser.extractors import (
    MetadataExtractor,
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
    MetadataFeatureExtractor,
)
from llama_index.langchain_helpers.text_splitter import TokenTextSplitter

# Token-based splitter: 512-token chunks with a 128-token overlap,
# splitting on single spaces.
text_splitter = TokenTextSplitter(separator=" ", chunk_size=512, chunk_overlap=128)


class CustomExtractor(MetadataFeatureExtractor):
    """Build a "custom" metadata entry from two existing metadata fields.

    Reads ``document_title`` and ``excerpt_keywords`` from each node's
    metadata, so both keys must already be present on every node.
    """

    def extract(self, nodes):
        """Return one ``{"custom": ...}`` dict per node, in input order."""
        return [
            {
                "custom": item.metadata["document_title"]
                + "\n"
                + item.metadata["excerpt_keywords"]
            }
            for item in nodes
        ]


# Only the title and questions-answered extractors are active; the other
# three are left commented out, as in the original snippet.
metadata_extractor = MetadataExtractor(
    extractors=[
        TitleExtractor(nodes=5),
        QuestionsAnsweredExtractor(questions=3),
        # SummaryExtractor(summaries=["prev", "self"]),
        # KeywordExtractor(keywords=10),
        # CustomExtractor()
    ],
)

node_parser = SimpleNodeParser(
    text_splitter=text_splitter,
    metadata_extractor=metadata_extractor,
)

# Only embed_model is supplied here; no llm argument is passed.
service_context_embed = ServiceContext.from_defaults(embed_model=embed_model)

from llama_index import SimpleDirectoryReader, DocumentSummaryIndex

itmg_docs = SimpleDirectoryReader(
    input_files=["./Laporan Tahunan ITMG 2022.pdf"]
).load_data()

# Keep documents at indices 0-66 plus index 68; index 67 is not included.
# NOTE(review): confirm that skipping index 67 is intentional.
itmg_front_pages = itmg_docs[0:67]
itmg_content = itmg_docs[68:69]
itmg_docs = itmg_front_pages + itmg_content

itmg_nodes = node_parser.get_nodes_from_documents(
    itmg_docs, service_context=service_context_embed
)
print(f" Finished building node with {len(itmg_nodes)} nodes")
L
1 comment
Right now you've only set the embed_model. So the LLM is defaulting to text-davinci-003 🤔
Add a reply
Sign up and join the conversation on Discord