@Logan M
# Create ingestion pipeline
# Imports assume the llama-index 0.9.x module layout
from typing import List

from llama_index import Document, download_loader
from llama_index.extractors import TitleExtractor
from llama_index.ingestion import IngestionPipeline
from llama_index.llms import AzureOpenAI
from llama_index.node_parser import SentenceSplitter

UnstructuredReader = download_loader("UnstructuredReader")


def create_ingestion_pipeline() -> IngestionPipeline:
    # Node parser: ~1024-token chunks with 200 tokens of overlap
    node_parser = SentenceSplitter(
        separator=" ",
        chunk_size=1024,
        chunk_overlap=200,
    )
    # Worker LLM for the metadata extractors; Azure routes by deployment
    # name (`engine`), note the GPT_4 deployment vs. the gpt-3.5-turbo model
    worker_llm = AzureOpenAI(
        temperature=0.1,
        model="gpt-3.5-turbo",
        max_tokens=512,
        engine=AZURE_OPENAI_API_DEPLOYMENT_NAME_GPT_4,
        azure_endpoint=AZURE_OPENAI_API_ENDPOINT,
        api_key=AZURE_OPENAI_API_KEY,
        api_version=AZURE_OPENAI_API_VERSION,
    )
    title_extractor = TitleExtractor(llm=worker_llm, num_workers=4)
    pipeline = IngestionPipeline(
        transformations=[
            node_parser,
            title_extractor,
        ]
    )
    return pipeline
def create_document_from_s3(s3_url: str) -> List[Document]:
    """Creates documents from a file stored at an S3 URL."""
    # FIXME: will need to handle cruft
    # Based on available research into data loading in llama-index,
    # we have to download the file locally first
    file_path = download_file(s3_url)
    loader = UnstructuredReader()
    documents = loader.load_data(file_path)
    delete_local_file(file_path)
    return documents
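
download_file and delete_local_file are not shown above; they are plain helpers, not part of llama-index. A minimal sketch, assuming s3://bucket/key URLs and a boto3 download into a temp directory:

import os
import tempfile
from urllib.parse import urlparse

import boto3


def download_file(s3_url: str) -> str:
    """Download an s3://bucket/key object to a local temp file, return its path."""
    parsed = urlparse(s3_url)
    bucket, key = parsed.netloc, parsed.path.lstrip("/")
    local_path = os.path.join(tempfile.gettempdir(), os.path.basename(key))
    boto3.client("s3").download_file(bucket, key, local_path)
    return local_path


def delete_local_file(file_path: str) -> None:
    """Remove the temporary local copy once loading is done."""
    os.remove(file_path)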
Main functionality:

pipeline = create_ingestion_pipeline()
documents = create_document_from_s3(s3_url)
# IngestionPipeline.run takes the plural `documents` kwarg, not `document`
nodes = pipeline.run(documents=documents)
vector_store = get_pinecone_vector_store()
vector_store.add(nodes=nodes)
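
get_pinecone_vector_store is also not shown; a minimal sketch, assuming llama-index 0.9.x, the v2 pinecone-client, and hypothetical PINECONE_* config constants:

import pinecone
from llama_index.vector_stores import PineconeVectorStore


def get_pinecone_vector_store() -> PineconeVectorStore:
    """Connect to an existing Pinecone index and wrap it for llama-index."""
    # PINECONE_API_KEY / PINECONE_ENVIRONMENT / PINECONE_INDEX_NAME are
    # assumed config constants, analogous to the AZURE_* ones above
    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)
    pinecone_index = pinecone.Index(PINECONE_INDEX_NAME)
    return PineconeVectorStore(pinecone_index=pinecone_index)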
Value of documents when printed:
[Document(id_='6c1580c3-7dc2-47b5-9909-9b0a60a8dfa2', embedding=None, metadata={'filename': '/tmp/llama-indexer/files/91301739-786a-45b3-b387-f5a7a7c35992-1689966736444-864818-sammyisagamer2.txt', 'user_id': '1'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='b10a29d59e236cfb927b37b012eeb2bf79e97898e16f7d4243947225baeb0bdd', text='Sammy roberts is a gamer', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')]