```python
from llama_cpp import Llama

llm = Llama(
    model_path="/Users/developer/ai/models/openchat_3.5.Q8_0.gguf",
    n_gpu_layers=1,
    n_ctx=2048,
)
```
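A quick way to confirm the model loaded is to run a single completion. This is a minimal smoke test, not part of the original script: the prompt is illustrative, and `max_tokens`, `stop`, and `echo` are standard llama-cpp-python completion parameters.

```python
# Run one completion as a smoke test.
output = llm(
    "Q: Name the planets in the solar system. A:",
    max_tokens=128,
    stop=["Q:"],  # stop when the model starts a new question
    echo=False,   # don't include the prompt in the returned text
)
print(output["choices"][0]["text"])
```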
tokenizer_name="Writer/camel-5b-hf", model_name="Writer/camel-5b-hf",
```python
import logging
import sys

from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms import HuggingFaceLLM
from llama_index.prompts import PromptTemplate

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

documents = SimpleDirectoryReader("./data/paul_graham/").load_data()

# This will wrap the default prompts that are internal to llama-index,
# taken from https://huggingface.co/Writer/camel-5b-hf
query_wrapper_prompt = PromptTemplate(
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{query_str}\n\n### Response:"
)

llm = HuggingFaceLLM(
    context_window=2048,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.25, "do_sample": False},
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="openchat/openchat_3.5",
    model_name="openchat/openchat_3.5",
    device_map="auto",
    tokenizer_kwargs={"max_length": 2048},
    # uncomment this if using CUDA to reduce memory usage
    # model_kwargs={"torch_dtype": torch.float16},
)

service_context = ServiceContext.from_defaults(chunk_size=512, llm=llm)

index = VectorStoreIndex.from_documents(
    documents, service_context=service_context
)

query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")
print(response)
```
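To sanity-check what the wrapper actually sends to the model, the template can be rendered directly. This uses nothing beyond the `PromptTemplate.format` call on the object defined in the script above; the query string is just an example.

```python
# Render the instruction wrapper to inspect the final prompt string.
print(query_wrapper_prompt.format(query_str="What did the author do growing up?"))
```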
File "/Users/developer/Library/Caches/pypoetry/virtualenvs/playground-2AP3SaSf-py3.11/lib/python3.11/site-packages/transformers/modeling_utils.py", line 2674, in from_pretrained raise ImportError( ImportError: Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install accelerate`