from llama_index.core.base.response.schema import Response
from llama_index.core.multi_modal_llms import MultiModalLLM
from llama_index.core.prompts import PromptTemplate
from llama_index.core.query_engine import CustomQueryEngine
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.schema import ImageNode, MetadataMode, NodeWithScore


class MultimodalQueryEngine(CustomQueryEngine):
    """Custom multimodal Query Engine."""

    # CustomQueryEngine is a Pydantic model, so each attribute must be
    # declared as a field and passed through the base initializer.
    retriever: BaseRetriever
    multi_modal_llm: MultiModalLLM
    qa_prompt: PromptTemplate

    def __init__(self, retriever, multi_modal_llm, qa_prompt=None):
        super().__init__(
            retriever=retriever,
            multi_modal_llm=multi_modal_llm,
            qa_prompt=qa_prompt or QA_PROMPT,
        )
    def custom_query(self, query_str: str):
        # Retrieve text nodes for the query
        nodes = self.retriever.retrieve(query_str)

        # Create ImageNode items from the retrieved text nodes' linked images
        image_nodes = [
            NodeWithScore(node=ImageNode(image_path=n.metadata["image_path"]))
            for n in nodes
        ]

        # Create the context string from the text nodes and fill in the prompt
        context_str = "\n\n".join(
            [n.node.get_content(metadata_mode=MetadataMode.LLM) for n in nodes]
        )
        fmt_prompt = self.qa_prompt.format(
            context_str=context_str, query_str=query_str
        )

        # Synthesize an answer using AzureOpenAIMultiModal, which reads both
        # the formatted text prompt and the attached images
        llm_response = self.multi_modal_llm.complete(
            prompt=fmt_prompt,
            image_documents=[image_node.node for image_node in image_nodes],
        )
        return Response(
            response=str(llm_response),
            source_nodes=nodes,
            metadata={"text_nodes": nodes, "image_nodes": image_nodes},
        )