token_counter = TokenCountingHandler(
    tokenizer=tiktoken.encoding_for_model(model_name).encode,
    verbose=False,
)
callback_manager = CallbackManager([token_counter])

service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor,
    chunk_size=project_chunk_size,
    callback_manager=callback_manager,
)

index = VectorStoreIndex.from_vector_store(vector_store, service_context)
retriever = index.as_retriever(
    verbose=True, chat_mode="context", similarity_top_k=similarity_top_k
)

custom_chat_engine = CustomContext.from_defaults(
    retriever=retriever,
    memory=chatmemory,
    context_template=generate_context_template(),
    system_prompt=prepared_system_prompt,
    node_postprocessors=[
        CustomPostprocessor(
            context_limit, query_text + prepared_system_prompt, project.db_name, None
        )
    ],
)

response = custom_chat_engine.chat(query_text, chat_history=chat_history)
tokens_used = token_counter.total_llm_token_count  # <----- ALWAYS ZERO
from typing import List

from llama_index.chat_engine import ContextChatEngine
from llama_index.llms import ChatMessage, MessageRole


class CustomContext(ContextChatEngine):
    def _get_prefix_messages_with_context(self, context_str: str) -> List[ChatMessage]:
        """Get the prefix messages with context."""
        # ensure we grab the user-configured system prompt
        system_prompt = ""
        prefix_messages = self._prefix_messages
        if (
            len(self._prefix_messages) != 0
            and self._prefix_messages[0].role == MessageRole.SYSTEM
        ):
            system_prompt = str(self._prefix_messages[0].content)
            prefix_messages = self._prefix_messages[1:]

        # opposite order: system prompt first, then the retrieved context
        context_str_w_sys_prompt = system_prompt.strip() + context_str
        return [
            ChatMessage(content=context_str_w_sys_prompt, role=MessageRole.SYSTEM),
            *prefix_messages,
        ]
custom_chat_engine = CustomContext.from_defaults(llm=service_context.llm, ...)
llm = service_context.llm_predictor.llm
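The likely cause of the zero count: when neither llm nor service_context is passed to from_defaults, ContextChatEngine appears to fall back to its own default service context, so the engine calls an LLM whose callback manager never contained the TokenCountingHandler. Pulling the LLM out of the existing service context (as above) and handing it to from_defaults, or passing the whole service_context through, keeps the counter wired in.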
from llama_index.callbacks import CallbackManager, TokenCountingHandler
from llama_index.chat_engine import ContextChatEngine
from llama_index.llms import OpenAI
from llama_index import Document, ServiceContext, VectorStoreIndex
import tiktoken

token_counter = TokenCountingHandler(
    tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo").encode,
    verbose=False,
)
callback_manager = CallbackManager([token_counter])

service_context = ServiceContext.from_defaults(
    llm=OpenAI(model="gpt-3.5-turbo"),
    chunk_size=512,
    callback_manager=callback_manager,
)

index = VectorStoreIndex.from_documents(
    [Document.example()], service_context=service_context
)

chat_engine = index.as_chat_engine(
    verbose=True, chat_mode="context", similarity_top_k=2
)

response = chat_engine.chat("Tell me something about LLMs")
print(token_counter.total_llm_token_count)
custom_chat_engine = CustomContext.from_defaults(
    retriever=retriever,
    memory=chatmemory,
    context_template=generate_context_template(),
    system_prompt=prepared_system_prompt,
    service_context=service_context,
    node_postprocessors=[
        CustomPostprocessor(
            context_limit, query_text + prepared_system_prompt, project.db_name, None
        )
    ],
)
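With the service context (and its callback manager) passed through to the engine, the counter should populate after each chat call. A minimal sketch of reading it back, using the attributes TokenCountingHandler exposes (prompt_llm_token_count, completion_llm_token_count, total_llm_token_count, and reset_counts()); query_text and chat_history are the same placeholders as above:

response = custom_chat_engine.chat(query_text, chat_history=chat_history)

# per-category counts accumulated by the handler since the last reset
print("prompt tokens:", token_counter.prompt_llm_token_count)
print("completion tokens:", token_counter.completion_llm_token_count)
print("total LLM tokens:", token_counter.total_llm_token_count)

# clear the running totals if you want per-request numbers
token_counter.reset_counts()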