am i blind or is there no way to specify the grammar path (for json) when using LLamaCPP?:
class Llama:
def __init__(
n_gpu_layers: int = 0,
split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER,
main_gpu: int = 0,
tensor_split: Optional[List[float]] = None,
vocab_only: bool = False,
use_mmap: bool = True,
use_mlock: bool = False,
kv_overrides: Optional[Dict[str, Union[bool, int, float, str]]] = None,
# Context Params
seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
n_ctx: int = 512,
n_batch: int = 512,
n_threads: Optional[int] = None,
n_threads_batch: Optional[int] = None,
rope_scaling_type: Optional[int] = llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
pooling_type: int = llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED,
rope_freq_base: float = 0.0,
rope_freq_scale: float = 0.0,
yarn_ext_factor: float = -1.0,
yarn_attn_factor: float = 1.0,
yarn_beta_fast: float = 32.0,
yarn_beta_slow: float = 1.0,
yarn_orig_ctx: int = 0,
logits_all: bool = False,
embedding: bool = False,
offload_kqv: bool = True,
flash_attn: bool = False,
# Sampling Params
last_n_tokens_size: int = 64,
# LoRA Params
lora_base: Optional[str] = None,
lora_scale: float = 1.0,
lora_path: Optional[str] = None,
# Backend Params
numa: Union[bool, int] = False,
# Chat Format Params
chat_format: Optional[str] = None,
chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None,
# Speculative Decoding
draft_model: Optional[LlamaDraftModel] = None,
# Tokenizer Override
tokenizer: Optional[BaseLlamaTokenizer] = None,
):