```python
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.vector_stores.azureaisearch import (
    AzureAISearchVectorStore,
    IndexManagement,
    MetadataIndexFieldType,
)

# Define metadata fields mapping
metadata_fields = {
    "doc_id": ("doc_id", MetadataIndexFieldType.STRING),
    "page_num": ("page_num", MetadataIndexFieldType.INT64),
    "image_path": ("image_path", MetadataIndexFieldType.STRING),
    "parsed_text_markdown": ("parsed_text_markdown", MetadataIndexFieldType.STRING),
    "context": ("context", MetadataIndexFieldType.STRING),
}

# Initialize Azure AI Search vector store
vector_store = AzureAISearchVectorStore(
    search_or_index_client=index_client,
    index_name="llamaindex-multimodal-contextual-retreival",
    index_management=IndexManagement.CREATE_IF_NOT_EXISTS,
    id_field_key="id",
    chunk_field_key="parsed_text_markdown",
    embedding_field_key="embedding",
    embedding_dimensionality=1536,  # Based on the embedding model
    metadata_string_field_key="metadata",  # Stores all metadata as a JSON string
    doc_id_field_key="doc_id",
    filterable_metadata_field_keys=metadata_fields,
    language_analyzer="en.lucene",
    vector_algorithm_type="exhaustiveKnn",
)

# Create storage context
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build the index
index = VectorStoreIndex.from_documents(
    new_text_nodes,
    storage_context=storage_context,
    llm=llm,
    embed_model=embed_model,
)
```
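Once the nodes are ingested, the index can be queried like any other LlamaIndex vector index. A minimal sketch (the question and `similarity_top_k` value are illustrative, and it assumes the same `llm` configured earlier):

```python
# Illustrative only: query the multimodal index built above
query_engine = index.as_query_engine(llm=llm, similarity_top_k=3)
response = query_engine.query("What does the report say about quarterly revenue?")
print(response)
```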
```python
import nest_asyncio

from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex
from llama_index.core.extractors import TitleExtractor, QuestionsAnsweredExtractor
from llama_index.core.node_parser import TokenTextSplitter

nest_asyncio.apply()

# Configure text splitter
text_splitter = TokenTextSplitter(separator=" ", chunk_size=512, chunk_overlap=128)

# Load documents
documents = SimpleDirectoryReader("data/pdf").load_data()

storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Create index
index = VectorStoreIndex.from_documents(
    documents,
    transformations=[text_splitter],
    storage_context=storage_context,
)
```
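`TitleExtractor` and `QuestionsAnsweredExtractor` are imported above but not wired into the pipeline. If you also want title and question metadata attached to each chunk, they can be appended to `transformations`. A sketch, assuming the same `llm` used elsewhere in this notebook:

```python
# Optional sketch: add metadata extractors to the ingestion transformations
title_extractor = TitleExtractor(llm=llm, nodes=5)  # derive a document title from the first 5 nodes
qa_extractor = QuestionsAnsweredExtractor(llm=llm, questions=3)  # questions each chunk can answer

index = VectorStoreIndex.from_documents(
    documents,
    transformations=[text_splitter, title_extractor, qa_extractor],
    storage_context=storage_context,
)
```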
llama-index-readers-file
# Property Graph Construction: Implicit Extraction Method

```python
import nest_asyncio

from llama_index.core import PropertyGraphIndex
from llama_index.core.indices.property_graph import ImplicitPathExtractor

# Apply nest_asyncio to avoid runtime errors in async environments
nest_asyncio.apply()

# Initialize Azure AI Search vector store
vector_store = AzureAISearchVectorStore(
    search_or_index_client=index_client,
    index_name=INDEX_NAME,
    index_management=IndexManagement.CREATE_IF_NOT_EXISTS,
    id_field_key="id",
    chunk_field_key="text",
    embedding_field_key="embedding",
    embedding_dimensionality=3072,  # Match the embedding model (3072 for text-embedding-3-large; ada-002 is 1536)
    metadata_string_field_key="metadata",
    doc_id_field_key="doc_id",
    language_analyzer="en.lucene",
    vector_algorithm_type="exhaustiveKnn",
    compression_type="binary",
)

# Construct the property graph index with implicit path extraction only
index = PropertyGraphIndex.from_documents(
    documents,
    llm=llm,
    embed_model=embed_model,
    kg_extractors=[ImplicitPathExtractor()],  # restrict extraction to the implicit method
    vector_store=vector_store,
    show_progress=True,
)
```
```python
import nest_asyncio

from llama_index.core import PropertyGraphIndex, StorageContext

# Apply nest_asyncio to allow nested use of asyncio.run()
nest_asyncio.apply()

# Load or build the index based on the use_existing_index flag
if use_existing_index:
    # Re-attach to the existing index; no documents need to be re-ingested
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = PropertyGraphIndex.from_documents([], storage_context=storage_context)
else:
    # Build a new property graph index from the loaded documents
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = PropertyGraphIndex.from_documents(documents, storage_context=storage_context)
```
```python
retriever = index.as_retriever(
    include_text=False,  # include source text, default True
)

nodes = retriever.retrieve("What happened at Interleaf and Viaweb?")

for node in nodes:
    print(node.text)
```
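The same question can also be answered end to end with a query engine, which retrieves from the graph and synthesizes a response. A minimal sketch, assuming the `llm` configured earlier in the notebook:

```python
# Sketch: synthesize an answer over the retrieved graph paths and their source text
query_engine = index.as_query_engine(include_text=True, llm=llm)
response = query_engine.query("What happened at Interleaf and Viaweb?")
print(str(response))
```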
llama-index-embeddings-azure-openai