I am using the following code to ingest a document into a vector store:
def process_document(dbdir):
    """Ingest ``docs/bitcoin.pdf`` into a persistent Chroma vector store.

    Loads the PDF, assigns each document a deterministic ID derived from a
    SHA-256 hash of its text (so re-running the ingest produces the same
    ``ref_doc_id`` for unchanged content), then runs an ingestion pipeline
    that splits, enriches with metadata, embeds, and writes to Chroma.

    Args:
        dbdir: Filesystem path where the Chroma database is persisted.
    """
    chroma_client = chromadb.PersistentClient(path=dbdir)
    chroma_collection = chroma_client.get_or_create_collection("bitcoin")
    vector_store = ChromaVectorStore(chroma_collection)

    loader = PyMuPDFReader()
    docs = loader.load_data(
        file_path=os.path.join(
            os.path.dirname(__file__), "..", "docs", "bitcoin.pdf"
        )
    )
    # Content-addressed doc IDs: identical text always maps to the same ID,
    # which is the precondition for upsert/refresh-style deduplication.
    for doc in docs:
        doc.id_ = hashlib.sha256(doc.text.encode('utf-8')).hexdigest()
    click.echo(f"Loaded {len(docs)} documents")

    embed_model = OpenAIEmbedding()
    # Transformations run in order; the embed model last so final nodes
    # carry embeddings when they reach the vector store.
    extractors = [
        SemanticSplitterNodeParser(
            buffer_size=1,
            breakpoint_percentile_threshold=95,
            embed_model=embed_model,
        ),
        TitleExtractor(nodes=5),
        SummaryExtractor(summaries=["prev", "self", "next"]),
        QuestionsAnsweredExtractor(questions=10, metadata=MetadataMode.EMBED),
        KeywordExtractor(keywords=5),
        embed_model,
    ]
    # NOTE(review): for dedup/update-on-rerun, IngestionPipeline supports an
    # attached docstore + docstore_strategy (e.g. UPSERTS) rather than
    # refresh_ref_docs, which expects Document objects (hence the
    # "'TextNode' object has no attribute 'get_doc_id'" error) — confirm
    # against the llama_index document-management docs.
    pipeline = IngestionPipeline(
        transformations=extractors,
        vector_store=vector_store,
        cache=IngestionCache(),
    )
    processed_nodes = pipeline.run(
        documents=docs,
        show_progress=True,
        store_doc_text=True,
        store_doc_metadata=True,
    )
    click.echo(f"Processed {len(processed_nodes)} nodes")
How would I use refresh_ref_docs so that when I run the same document again it doesn't create duplicate entries but instead updates the associated metadata and embeddings? I use a hash of the content to create my doc_id, but whenever I try to add code that calls refresh I get the following error:
An error occurred: 'TextNode' object has no attribute 'get_doc_id'
Can I do a refresh as part of my ingestion pipeline?