diff --git a/gallery/index.yaml b/gallery/index.yaml
index 941f041c67f0..c205e5999f84 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,4 +1,38 @@
 ---
+- name: "glm-4.7-flash-claude-4.5-opus-i1-mxfp4_moe_xl-exp"
+  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
+  urls:
+    - https://huggingface.co/noctrex/GLM-4.7-Flash-Claude-4.5-Opus-i1-MXFP4_MOE_XL-exp-GGUF
+  description: |
+    **Model Description:**
+    This is an experimental quantized version of the GLM-4.7-Flash model, optimized with MXFP4 MOE for efficient deployment. It is derived from the `TeichAI/GLM-4.7-Flash-Claude-Opus-4.5-High-Reasoning-Distill` base model, trained for high-reasoning tasks like coding, science, and research. The model uses Unsloth for accelerated training and is designed for tasks requiring deep reasoning. It is still experimental, with no official benchmarks, but offers a lightweight, high-reasoning variant.
+
+    **Key Features:**
+    - **Quantization:** MXFP4 MOE (4-bit) with dynamic precision (BF16/Q8_0) based on tensor importance.
+    - **Use Cases:** Coding, scientific research, and deep reasoning.
+    - **License:** Apache-2.0.
+    - **Optimized for:** Speed and efficiency with Unsloth.
+
+    *Note: This is an experimental variant; results may vary, and benchmarks are pending.*
+  overrides:
+    parameters:
+      model: llama-cpp/models/imatrix.gguf
+    name: GLM-4.7-Flash-Claude-4.5-Opus-i1-MXFP4_MOE_XL-exp-GGUF
+    backend: llama-cpp
+    template:
+      use_tokenizer_template: true
+    known_usecases:
+      - chat
+    function:
+      grammar:
+        disable: true
+    description: Imported from https://huggingface.co/noctrex/GLM-4.7-Flash-Claude-4.5-Opus-i1-MXFP4_MOE_XL-exp-GGUF
+    options:
+      - use_jinja:true
+  files:
+    - filename: llama-cpp/models/imatrix.gguf
+      sha256: 3a27a5d7a01954883f9157b7ef1ca6123886ded8f6916dbd3f354ecd2cb670be
+      uri: https://huggingface.co/noctrex/GLM-4.7-Flash-Claude-4.5-Opus-i1-MXFP4_MOE_XL-exp-GGUF/resolve/main/imatrix.gguf
 - name: "glm-4.7-flash-derestricted"
   url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
   urls:
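
Once this gallery entry is merged, the model is served behind LocalAI's OpenAI-compatible API under the `name` declared above. A minimal sketch of exercising it, assuming a LocalAI instance on the default `localhost:8080`, the `openai` Python package, and that the model has been installed from the gallery under that name:

```python
# Minimal sketch: query the newly added gallery model through LocalAI's
# OpenAI-compatible chat endpoint. Assumes LocalAI is running on
# localhost:8080 and the model was installed under the gallery name below.
from openai import OpenAI

# LocalAI does not require an API key by default; the client just needs
# a non-empty placeholder string.
client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")

resp = client.chat.completions.create(
    model="glm-4.7-flash-claude-4.5-opus-i1-mxfp4_moe_xl-exp",
    messages=[{"role": "user", "content": "Summarize MXFP4 quantization in one sentence."}],
)
print(resp.choices[0].message.content)
```

Since the entry sets `use_tokenizer_template: true`, `use_jinja:true`, and disables grammar-constrained function calling, prompts are formatted by the GGUF's embedded chat template rather than a hand-written one, which is the usual pattern for these auto-imported llama-cpp entries.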