Hi all, I am trying to run a query from the examples and it is taking more than 30 minutes to return. I am using 4 A10 GPUs to load the model. Here is my full setup:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import (GPTSimpleVectorIndex, LangchainEmbedding, LLMPredictor,
                         PromptHelper, ServiceContext, SimpleDirectoryReader)
from llama_index.node_parser import SimpleNodeParser
from llama_index.optimization.optimizer import SentenceEmbeddingOptimizer

# Prompt sizing: 1024-token input window, 64 output tokens, 20-token chunk overlap
max_input_size = 1024
num_output = 64
max_chunk_overlap = 20
prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)
llm_predictor = LLMPredictor(llm=CustomLLM())  # CustomLLM sketched below
embed_model = LangchainEmbedding(HuggingFaceEmbeddings())
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor, prompt_helper=prompt_helper, embed_model=embed_model)

# Load the essay, split it into nodes, and build the vector index
documents = SimpleDirectoryReader('/home/ubuntu/llama_index/examples/paul_graham_essay/data').load_data()
parser = SimpleNodeParser()
nodes = parser.get_nodes_from_documents(documents)
index = GPTSimpleVectorIndex(nodes, service_context=service_context)
# The optimizer drops the bottom 50% of retrieved sentences before the LLM call
response = index.query("What did the author do growing up?", service_context=service_context,
                       optimizer=SentenceEmbeddingOptimizer(percentile_cutoff=0.5))
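
For reference, CustomLLM follows the custom-LLM example from the docs. A minimal sketch of it: the facebook/opt-iml-max-30b model name is the one from that example, and device_map="auto" is my assumption for sharding it across the 4 A10s.

import torch
from typing import Any, List, Mapping, Optional
from langchain.llms.base import LLM
from transformers import pipeline

class CustomLLM(LLM):
    model_name = "facebook/opt-iml-max-30b"
    # device_map="auto" shards the model across all visible GPUs
    pipeline = pipeline("text-generation", model=model_name, device_map="auto",
                        model_kwargs={"torch_dtype": torch.bfloat16})

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        response = self.pipeline(prompt, max_new_tokens=num_output)[0]["generated_text"]
        return response[len(prompt):]  # keep only the newly generated text

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        return {"name_of_model": self.model_name}

    @property
    def _llm_type(self) -> str:
        return "custom"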
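
To narrow down whether the time is going into embedding the documents or into the LLM generation itself, here is a minimal timing sketch around the same two stages (the logging call just surfaces llama_index's internal debug output):

import logging, sys, time

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)  # show llama_index's internal logs

t0 = time.perf_counter()
index = GPTSimpleVectorIndex(nodes, service_context=service_context)
print(f"index build (embeddings): {time.perf_counter() - t0:.1f}s")

t1 = time.perf_counter()
response = index.query("What did the author do growing up?",
                       optimizer=SentenceEmbeddingOptimizer(percentile_cutoff=0.5))
print(f"query (retrieval + generation): {time.perf_counter() - t1:.1f}s")

Any pointers on which stage I should be looking at would be appreciated.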