self.extract()
if there's nothing async about your code.

# read documents
# Load the single input text file via SimpleDirectoryReader.
documents = SimpleDirectoryReader(
    input_files=[
        "data/impact-of-large-language-models-in-business--09:10:2023.txt",
    ],
).load_data()

# LLM instance that is passed to the metadata extractors.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0)

# Splitter configuration: split on single spaces, chunk_size=512 with
# chunk_overlap=128 (units presumably tokens, per the class name -- confirm).
text_splitter = TokenTextSplitter(
    separator=" ",
    chunk_size=512,
    chunk_overlap=128,
)
class CustomExtractor(BaseExtractor):
    """Combine each node's title and keywords into a single ``custom`` field."""

    def extract(self, nodes):
        """Return one metadata dict per node, in the same order as ``nodes``.

        Each returned dict has a single key, ``"custom"``, whose value is the
        node's ``document_title`` metadata, a newline, and the node's
        ``excerpt_keywords`` metadata.

        Args:
            nodes: Iterable of nodes exposing a ``metadata`` mapping that
                contains ``"document_title"`` and ``"excerpt_keywords"``.
                Presumably these are filled in by the title and keyword
                extractors that run before this one -- TODO confirm.

        Returns:
            A list with one ``{"custom": ...}`` dict per input node.

        Raises:
            KeyError: If either metadata key is missing on a node (assuming
                ``metadata`` behaves like a plain dict).
        """
        return [
            {
                "custom": (
                    node.metadata["document_title"]
                    + "\n"
                    + node.metadata["excerpt_keywords"]
                )
            }
            for node in nodes
        ]

    async def aextract(self, nodes):
        """Async entry point; delegates to the synchronous :meth:`extract`.

        ``BaseExtractor`` requires subclasses to implement ``aextract``.
        Nothing in this extractor is asynchronous, so the work is done
        directly by ``extract``.

        Args:
            nodes: Same as for :meth:`extract`.

        Returns:
            The list produced by :meth:`extract` for ``nodes``.
        """
        return self.extract(nodes)
# Metadata extractors handed to the ingestion pipeline.
extractors = [
    TitleExtractor(nodes=5, llm=llm),
    QuestionsAnsweredExtractor(questions=3, llm=llm),
    SummaryExtractor(summaries=["prev", "self"], llm=llm),
    KeywordExtractor(keywords=10, llm=llm),
    # CustomExtractor reads "document_title" and "excerpt_keywords" from node
    # metadata; presumably that is why it comes after the title and keyword
    # extractors -- confirm against the pipeline's execution order.
    CustomExtractor(),
]

# The text splitter goes first, followed by every extractor above.
transformations = [text_splitter, *extractors]

pipeline = IngestionPipeline(transformations=transformations)
nodes = pipeline.run(documents=documents, show_progress=True)
# Extractor that merges each node's title and keywords into one "custom" field.
class CustomExtractor(BaseExtractor):
    async def aextract(self, nodes: Sequence[BaseNode]) -> List[Dict]:
        """Produce custom metadata for ``nodes`` via the synchronous path.

        Nothing here needs to be awaited, so this simply forwards to
        :meth:`extract` and returns its result.

        Args:
            nodes (Sequence[BaseNode]): nodes to extract metadata from

        Returns:
            One metadata dictionary per node, in input order.
        """
        return self.extract(nodes)

    def extract(self, nodes):
        """Build a ``{"custom": ...}`` dictionary for every node.

        The value is the node's ``document_title`` metadata, a newline, and
        the node's ``excerpt_keywords`` metadata, concatenated in that order.
        """
        collected = []
        for node in nodes:
            # Same evaluation order as a single chained expression:
            # title lookup, append the newline, then keywords lookup.
            title_line = node.metadata["document_title"] + "\n"
            combined = title_line + node.metadata["excerpt_keywords"]
            collected.append({"custom": combined})
        return collected