am i blind or is there no way to specify the grammar path (for json) when using LlamaCPP?

am i blind or is there no way to specify the grammar path (for json) when using LlamaCPP? this is the __init__ signature:
Plain Text
class Llama:
    def __init__(
        n_gpu_layers: int = 0,
        split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER,
        main_gpu: int = 0,
        tensor_split: Optional[List[float]] = None,
        vocab_only: bool = False,
        use_mmap: bool = True,
        use_mlock: bool = False,
        kv_overrides: Optional[Dict[str, Union[bool, int, float, str]]] = None,
        # Context Params
        seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
        n_ctx: int = 512,
        n_batch: int = 512,
        n_threads: Optional[int] = None,
        n_threads_batch: Optional[int] = None,
        rope_scaling_type: Optional[int] = llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
        pooling_type: int = llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED,
        rope_freq_base: float = 0.0,
        rope_freq_scale: float = 0.0,
        yarn_ext_factor: float = -1.0,
        yarn_attn_factor: float = 1.0,
        yarn_beta_fast: float = 32.0,
        yarn_beta_slow: float = 1.0,
        yarn_orig_ctx: int = 0,
        logits_all: bool = False,
        embedding: bool = False,
        offload_kqv: bool = True,
        flash_attn: bool = False,
        # Sampling Params
        last_n_tokens_size: int = 64,
        # LoRA Params
        lora_base: Optional[str] = None,
        lora_scale: float = 1.0,
        lora_path: Optional[str] = None,
        # Backend Params
        numa: Union[bool, int] = False,
        # Chat Format Params
        chat_format: Optional[str] = None,
        chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None,
        # Speculative Decoding
        draft_model: Optional[LlamaDraftModel] = None,
        # Tokenizer Override
        tokenizer: Optional[BaseLlamaTokenizer] = None,
    ):
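Note: the signature above has no grammar argument because in llama-cpp-python the grammar is loaded into a LlamaGrammar object and passed to the completion call rather than to __init__. A minimal sketch against the raw library, reusing the json.gbnf path from the question (model path and prompt are placeholders):
Plain Text
from llama_cpp import Llama, LlamaGrammar

llm = Llama(model_path="/path/to/model.gguf", n_gpu_layers=50)

# Load the GBNF grammar shipped with llama.cpp
grammar = LlamaGrammar.from_file("/home/_LLM/llama.cpp/grammars/json.gbnf")

# grammar and repeat_penalty are per-call sampling arguments, not init params
out = llm(
    "Describe a cat as a JSON object.",
    grammar=grammar,
    repeat_penalty=1.1,
    max_tokens=256,
)
print(out["choices"][0]["text"])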
6 comments
i want to use it like this:
Plain Text
llm = LlamaCPP(
    model_path=model_path,
    model_kwargs={
        "n_gpu_layers": 50,
        "grammar_path":"/home/_LLM/llama.cpp/grammars/json.gbnf",
        "repeat_penalty": 0.0},
    temperature=0.0,
    max_new_tokens=max_output_tokens,
    context_window=context_window,
    verbose=True
)
@Logan M @WhiteFang_Jr any idea? (sorry for pinging)
Does the grammar path go in the init or in the generation kwargs? I actually have no idea
llama.cpp is the worst to configure πŸ˜…
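Since the grammar is a sampling-time argument in llama-cpp-python, it would go with the generation kwargs rather than the init kwargs: model_kwargs feeds Llama.__init__ (which has no grammar parameter), while generate_kwargs is forwarded to the completion call. A sketch under the assumption that LlamaCPP passes a LlamaGrammar object through generate_kwargs unchanged; the exact import path depends on the llama-index version:
Plain Text
from llama_cpp import LlamaGrammar
from llama_index.llms.llama_cpp import LlamaCPP

grammar = LlamaGrammar.from_file("/home/_LLM/llama.cpp/grammars/json.gbnf")

llm = LlamaCPP(
    model_path=model_path,
    model_kwargs={"n_gpu_layers": 50},     # goes to Llama.__init__
    generate_kwargs={"grammar": grammar},  # goes to the completion call
    temperature=0.0,
    max_new_tokens=max_output_tokens,
    context_window=context_window,
    verbose=True,
)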
i have no idea... but this is how i configured it for LangChain:
Plain Text
from langchain_community.llms import LlamaCpp  # import path depends on the LangChain version

llm = LlamaCpp(
    model_path=model_path,
    n_gpu_layers=35,
    n_ctx=3900,
    # n_ctx=8192,
    temperature=0.0,
    repeat_penalty=1.0,
    use_mmap=False,  # True (the default) memory-maps the file; False loads it fully into RAM
    n_batch=512,
    # grammar_path="/home/_LLM/llama.cpp/grammars/json.gbnf",
)
also the repeat_penalty would be very important for me
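repeat_penalty is likewise a per-call sampling argument in llama-cpp-python (LangChain's LlamaCpp just happens to expose it as a constructor field), so in the LlamaIndex wrapper it would presumably sit in generate_kwargs next to the grammar rather than in model_kwargs. A hedged sketch, reusing the grammar object from the sketch above:
Plain Text
llm = LlamaCPP(
    model_path=model_path,
    model_kwargs={"n_gpu_layers": 50},
    generate_kwargs={
        "grammar": grammar,       # LlamaGrammar from the earlier sketch
        "repeat_penalty": 1.1,    # 1.0 disables the penalty, >1.0 discourages repeats
    },
    temperature=0.0,
    max_new_tokens=max_output_tokens,
    context_window=context_window,
)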