data_generator.generate_questions_from_nodes()
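# `data_generator` is not defined in this snippet. For reference, a minimal
# sketch of how it is typically constructed with llama_index's DatasetGenerator
# (the ./data path and `documents` name are placeholders, not from the original;
# by default question generation uses the globally configured LLM, and a custom
# one can be supplied via the service_context argument of from_documents):

from llama_index import SimpleDirectoryReader
from llama_index.evaluation import DatasetGenerator

documents = SimpleDirectoryReader("./data").load_data()
data_generator = DatasetGenerator.from_documents(documents)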
# V2
import os
import json
import torch
from transformers import AutoTokenizer, StoppingCriteria, StoppingCriteriaList, logging
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

quantized_model_dir = os.path.join(llm_model_path, "TheBloke_WizardLM-30B-GPTQ")
model_basename = "wizardlm-30b-GPTQ-4bit.act.order"
use_triton = False

tokenizer_config_path = os.path.join(quantized_model_dir, "tokenizer_config.json")

# Load the tokenizer config as a dict
with open(tokenizer_config_path, "r") as f:
    tokenizer_config = json.load(f)

# Now initialize the tokenizer with the config
tokenizer = AutoTokenizer.from_pretrained(
    quantized_model_dir,
    use_fast=True,
    return_token_type_ids=False,
    **tokenizer_config,
)

# Verify the start and stop tokens
print(f"Start token: {tokenizer.bos_token}, ID: {tokenizer.bos_token_id}")
print(f"End token: {tokenizer.eos_token}, ID: {tokenizer.eos_token_id}")

model = AutoGPTQForCausalLM.from_quantized(
    quantized_model_dir,
    model_basename=model_basename,
    use_safetensors=True,
    trust_remote_code=False,
    device="cuda:0",
    use_triton=use_triton,
    quantize_config=None,
)

# Note: check the prompt template is correct for this model.
prompt = "Tell me about AI"

print("\n\n*** Generate:")
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
output = model.generate(inputs=input_ids, temperature=0.7, max_new_tokens=50)
print(tokenizer.decode(output[0], skip_special_tokens=True))

# Set the bos_token_id and eos_token_id
model.config.bos_token_id = tokenizer.bos_token_id
model.config.eos_token_id = tokenizer.eos_token_id

# Prevent printing spurious transformers error when using pipeline with AutoGPTQ
logging.set_verbosity(logging.CRITICAL)
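# StoppingCriteria / StoppingCriteriaList are imported above but not used in
# this cell. A minimal sketch of how they could be wired in, e.g. to stop
# generation once the tokenizer's EOS token is produced; the class name and
# the choice of stop token here are illustrative assumptions, not part of the
# original code.

class StopOnTokens(StoppingCriteria):
    def __init__(self, stop_ids):
        self.stop_ids = stop_ids

    def __call__(self, input_ids, scores, **kwargs) -> bool:
        # Stop as soon as the most recently generated token is a stop id.
        return input_ids[0, -1].item() in self.stop_ids


stopping_criteria = StoppingCriteriaList([StopOnTokens([tokenizer.eos_token_id])])

# Example usage: pass the criteria to generate() alongside the settings above.
# output = model.generate(
#     inputs=input_ids,
#     temperature=0.7,
#     max_new_tokens=50,
#     stopping_criteria=stopping_criteria,
# )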
# Set up prompts
from llama_index.prompts.prompts import SimpleInputPrompt

# System prompt: the WizardLM (Vicuna-style) preamble only. The USER/ASSISTANT
# turn markers and the query itself are added by the query wrapper prompt
# below, so they are not repeated here.
system_prompt = (
    "A chat between a curious user and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
)

# This will wrap the default prompts that are internal to llama-index
query_wrapper_prompt = SimpleInputPrompt("USER: {query_str}\nASSISTANT: ")
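# The service context below expects an `hf_predictor` and an `embed_model`,
# neither of which is defined in this snippet. A minimal sketch of how they
# could be built around the quantized model loaded above, assuming a
# 0.6/0.7-era llama_index whose HuggingFaceLLMPredictor accepts pre-loaded
# `model`/`tokenizer` objects (otherwise pass `model_name`/`tokenizer_name`),
# plus a local sentence-transformers embedding wrapped via LangChain. The
# specific values and the embedding model name are assumptions.

from llama_index import LangchainEmbedding
from llama_index.llm_predictor import HuggingFaceLLMPredictor
from langchain.embeddings import HuggingFaceEmbeddings

hf_predictor = HuggingFaceLLMPredictor(
    max_input_size=2048,
    max_new_tokens=256,
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    model=model,          # AutoGPTQ model loaded above
    tokenizer=tokenizer,  # tokenizer loaded above
)

# Any sentence-transformers model works here; all-mpnet-base-v2 is just an example.
embed_model = LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
)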
from llama_index import ServiceContext

service_context = ServiceContext.from_defaults(
    llm_predictor=hf_predictor,
    embed_model=embed_model,
)
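# With the service context in place, a typical next step is to build a vector
# index over the documents and query it with the local model. A sketch,
# assuming the `documents` list from the DatasetGenerator sketch near the top
# and a 0.6/0.7-era llama_index (older releases name the class
# GPTVectorStoreIndex).

from llama_index import VectorStoreIndex

index = VectorStoreIndex.from_documents(documents, service_context=service_context)
query_engine = index.as_query_engine()
response = query_engine.query("Tell me about AI")
print(response)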