from llama_index import ServiceContext, LangchainEmbedding
from llama_index.node_parser import SimpleNodeParser
from llama_index.node_parser.extractors import (
    MetadataExtractor,
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
    MetadataFeatureExtractor,
)
from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings

embed_model = LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
)

# Split text into 512-token chunks with a 128-token overlap
text_splitter = TokenTextSplitter(separator=" ", chunk_size=512, chunk_overlap=128)
class CustomExtractor(MetadataFeatureExtractor):
    # Combine the extracted document title and keywords into a single "custom" metadata field
    def extract(self, nodes):
        metadata_list = [
            {
                "custom": node.metadata["document_title"]
                + "\n"
                + node.metadata["excerpt_keywords"]
            }
            for node in nodes
        ]
        return metadata_list
metadata_extractor = MetadataExtractor(
    extractors=[
        TitleExtractor(nodes=5),
        QuestionsAnsweredExtractor(questions=3),
        # SummaryExtractor(summaries=["prev", "self"]),
        # KeywordExtractor(keywords=10),
        # CustomExtractor(),
    ],
)
node_parser = SimpleNodeParser(
    text_splitter=text_splitter,
    metadata_extractor=metadata_extractor,
)
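# When get_nodes_from_documents() is called, this parser first splits each document
# into 512-token chunks and then runs the configured extractors over every chunk,
# attaching the results (e.g. "document_title", "questions_this_excerpt_can_answer")
# to each node's metadata. Exact key names may vary across llama_index versions.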
service_context_embed = ServiceContext.from_defaults(embed_model=embed_model)
from llama_index import SimpleDirectoryReader, DocumentSummaryIndex
itmg_docs = SimpleDirectoryReader(input_files=["./Laporan Tahunan ITMG 2022.pdf"]).load_data()
itmg_front_pages = itmg_docs[0:67]
itmg_content = itmg_docs[68:69]
itmg_docs = itmg_front_pages + itmg_content
itmg_nodes = node_parser.get_nodes_from_documents(itmg_docs, service_context=service_context_embed)
print(f"Finished building {len(itmg_nodes)} nodes")
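# Optional sanity check (illustrative sketch): inspect the metadata attached to the
# first node. With TitleExtractor and QuestionsAnsweredExtractor enabled above, keys
# such as "document_title" and "questions_this_excerpt_can_answer" are expected,
# though the exact key names depend on the llama_index version in use.
sample_node = itmg_nodes[0]
print(sample_node.metadata.keys())
print(sample_node.metadata.get("document_title"))
print(sample_node.metadata.get("questions_this_excerpt_can_answer"))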