or this:
from llama_index.llms import Replicate
from llama_index import ServiceContext, set_global_service_context
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt
# The replicate endpoint
LLAMA_13B_V2_CHAT = "a16z-infra/llama13b-v2-chat:df7690f1994d94e96ad9d568eac121aecf50684a0b0963b25a41cc40061269e5"
# inject custom system prompt into llama-2
def custom_completion_to_prompt(completion: str) -> str:
    return completion_to_prompt(
        completion,
        system_prompt=(
            "You are a Q&A assistant. Your goal is to answer questions as "
            "accurately as possible, based on the instructions and context provided."
        ),
    )
llm = Replicate(
    model=LLAMA_13B_V2_CHAT,
    temperature=0.01,
    # override max tokens since it's interpreted
    # as context window instead of max tokens
    context_window=4096,
    # override completion representation for llama 2
    completion_to_prompt=custom_completion_to_prompt,
    # if using llama 2 for data agents, also override the message representation
    messages_to_prompt=messages_to_prompt,
)
# set a global service context
ctx = ServiceContext.from_defaults(llm=llm)
set_global_service_context(ctx)
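
As a quick sanity check, you can call the configured LLM directly before wiring it into an index. This is a minimal sketch, not part of the original snippet: it assumes your REPLICATE_API_TOKEN environment variable is set, and the prompt text is purely illustrative.

# Issue a single completion through the Replicate-backed Llama 2 model.
# The custom system prompt defined above is injected automatically.
response = llm.complete("What is retrieval-augmented generation?")
print(response)

Because the service context is set globally, any index or query engine built afterwards (for example, VectorStoreIndex.from_documents(documents)) will pick up this LLM without further configuration.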