Hi,
Thank you so much for your great work.
I played with your sample code. The model is loaded on the GPU and works without any problem.
# Import necessary libraries
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Qwen3VLChatHandler
import base64
import os
# --- Model Configuration ---
# Define the path to the main model file
MODEL_PATH = r"./Qwen3VL-8B-Instruct-Q8_0.gguf"
# Define the path to the multi-modal projector file
MMPROJ_PATH = r"./mmproj-Qwen3VL-8B-Instruct-Q8_0.gguf"
# --- Initialize the Llama Model ---
llm = Llama(
    model_path=MODEL_PATH,
    # Set up the chat handler for Qwen3-VL, specifying the projector path
    chat_handler=Qwen3VLChatHandler(
        clip_model_path=MMPROJ_PATH,
        #force_reasoning=True,
        verbose=False,
        image_min_tokens=1024,  # Note: Qwen-VL models require at minimum 1024 image tokens to function correctly on bbox grounding tasks
    ),
    n_gpu_layers=-1,  # Offload all layers to the GPU
    n_ctx=20480,  # Set the context window size
    swa_full=True,
    verbose=True,  #False,
)
import atexit

@atexit.register
def free_model():
    llm._sampler.close()
    llm.close()
# Comprehensive MIME type mapping (updated as of 2025)
# Based on Pillow 10.x+ "Fully Supported" (Read & Write) formats
# Reference: IANA official media types + common real-world usage
# See: https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html
_IMAGE_MIME_TYPES = {
    # Most common formats
    '.png': 'image/png',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.gif': 'image/gif',
    '.webp': 'image/webp',
    # Next-generation formats
    '.avif': 'image/avif',
    '.jp2': 'image/jp2',
    '.j2k': 'image/jp2',
    '.jpx': 'image/jp2',
    # Legacy / Windows formats
    '.bmp': 'image/bmp',
    '.ico': 'image/x-icon',
    '.pcx': 'image/x-pcx',
    '.tga': 'image/x-tga',
    '.icns': 'image/icns',
    # Professional / Scientific imaging
    '.tif': 'image/tiff',
    '.tiff': 'image/tiff',
    '.eps': 'application/postscript',
    '.dds': 'image/vnd-ms.dds',
    '.dib': 'image/dib',
    '.sgi': 'image/sgi',
    # Portable Map formats (PPM/PGM/PBM)
    '.pbm': 'image/x-portable-bitmap',
    '.pgm': 'image/x-portable-graymap',
    '.ppm': 'image/x-portable-pixmap',
    # Miscellaneous / Older formats
    '.xbm': 'image/x-xbitmap',
    '.mpo': 'image/mpo',
    '.msp': 'image/msp',
    '.im': 'image/x-pillow-im',
    '.qoi': 'image/qoi',
}
def image_to_base64_data_uri(
    file_path: str,
    *,
    fallback_mime: str = "application/octet-stream"
) -> str:
    """
    Convert a local image file to a base64-encoded data URI with the correct MIME type.
    Supports 20+ image formats (PNG, JPEG, WebP, AVIF, BMP, ICO, TIFF, etc.).

    Args:
        file_path: Path to the image file on disk.
        fallback_mime: MIME type used when the file extension is unknown.

    Returns:
        A valid data URI string (e.g., data:image/webp;base64,...).

    Raises:
        FileNotFoundError: If the file does not exist.
        OSError: If reading the file fails.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"Image file not found: {file_path}")

    extension = os.path.splitext(file_path)[1].lower()
    mime_type = _IMAGE_MIME_TYPES.get(extension, fallback_mime)
    if mime_type == fallback_mime:
        print(f"Warning: Unknown extension '{extension}' for '{file_path}'. "
              f"Using fallback MIME type: {fallback_mime}")

    try:
        with open(file_path, "rb") as img_file:
            encoded_data = base64.b64encode(img_file.read()).decode("utf-8")
    except OSError as e:
        raise OSError(f"Failed to read image file '{file_path}': {e}") from e

    return f"data:{mime_type};base64,{encoded_data}"
# --- Main Logic for Image Processing ---
# 1. Create a list containing all image paths
image_paths = [
    r"c:/document_root/vlmrag/images/0000_ksisj.jpeg",
    r'c:/document_root/vlmrag/images/0007_kjeoib.jpeg',
    r'c:/document_root/vlmrag/images/0001_poiqwnd.jpeg',
    r'c:/document_root/vlmrag/images/0002_hhsgcf.jpeg',
    r'c:/document_root/vlmrag/images/0010_wdvwd.jpeg',
    # Add more image paths here if needed
]
# 2. Create an empty list to store the message objects (images and text)
images_messages = []
# 3. Loop through the image path list, convert each image to a Data URI,
# and add it to the message list as an image_url object.
for path in image_paths:
    data_uri = image_to_base64_data_uri(path)
    images_messages.append({"type": "image_url", "image_url": {"url": data_uri}})
# 4. Add the final text prompt at the end of the list
images_messages.append({"type": "text", "text": "GGUF形式のファイルは使えますか?。"})  # "Can GGUF-format files be used?"
# 5. Use this list to build the chat_completion request
res = llm.create_chat_completion(
    messages=[
        #{"role": "system", "content": "You are a highly accurate vision-language assistant. Provide detailed, precise, and well-structured image descriptions."},
        {"role": "system", "content": "あなたはAI博士です。"},  # "You are Dr. AI."
        # The user's content is the list containing both images and text
        {"role": "user", "content": images_messages}
    ]
)
# Print the assistant's response
print(res["choices"][0]["message"]["content"])

However, when the same code is assembled into my WEB-UI code, the model is never loaded on the GPU.
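For reference, offloading can also be double-checked from the Llama object itself. This is a minimal sketch, assuming the instance keeps the requested value in model_params.n_gpu_layers (as recent llama-cpp-python versions do), in addition to the "offloaded ... layers to GPU" lines printed in the verbose load log:

# Hypothetical sanity check, run right after constructing llm
print("requested n_gpu_layers:", llm.model_params.n_gpu_layers)  # a large value when -1 was passed, 0 when no offload was requested

The full WEB-UI code follows: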
import os, glob, sys, pickle, codecs
import time
from typing import List, Dict, Any, Optional
import torch
import numpy as np
from PIL import Image
#from transformers.utils.import_utils import is_flash_attn_2_available
from colpali_engine.models import ColQwen2_5, ColQwen2_5_Processor
from transformers import LogitsProcessor, LogitsProcessorList
from qdrant_client import QdrantClient
from qdrant_client.models import (
    Distance, VectorParams, PointStruct,
    MultiVectorConfig, MultiVectorComparator
)
import gradio as gr
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Qwen3VLChatHandler
import base64
os.environ["NO_PROXY"] = "localhost, 127.0.0.1/8, ::1"
MODEL_PATH = r"./Qwen3VL-8B-Instruct-Q8_0.gguf"
MMPROJ_PATH = r"./mmproj-Qwen3VL-8B-Instruct-Q8_0.gguf"
SRV_IP = '10.xx.xxx.27' # Gradio IP
SRV_PT = 8080 # Gradio port
URL = '127.0.0.1:6333' # IP & port for Qdrant server
# Declared up front so that these live in the global scope
SETTING_VISIBLE = True
DOCUMENT_ROOT = 'c:/document_root'
pdf_paths = []
img_path = None
collection_name = ""
all_collections = []
paths = []
KN = 5
print(f"Loading VLM model: {MODEL_PATH}")
t0 = time.perf_counter()
# Load the GGUF-format VLM model itself (used for generation)
llm = Llama(
    model_path=MODEL_PATH,
    # Set up the chat handler for Qwen3-VL, specifying the projector path
    chat_handler=Qwen3VLChatHandler(
        clip_model_path=MMPROJ_PATH,
        #force_reasoning=True,
        verbose=False,
        image_min_tokens=1024,  # Note: Qwen-VL models require at minimum 1024 image tokens to function correctly on bbox grounding tasks
    ),
    n_gpu_layer=-1,
    n_ctx=15360,  #1000,
    swa_full=True,
    verbose=True,  #False,
)
import atexit

@atexit.register
def free_model():
    llm._sampler.close()
    llm.close()
print(f"VLM loaded in {time.perf_counter() - t0:.2f}s")
#=====================
# Constants for the file formats needed when converting images to base64
# Comprehensive MIME type mapping (updated as of 2025)
# Based on Pillow 10.x+ "Fully Supported" (Read & Write) formats
# Reference: IANA official media types + common real-world usage
# See: https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html
_IMAGE_MIME_TYPES = {
    # Most common formats
    '.png': 'image/png',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.gif': 'image/gif',
    '.webp': 'image/webp',
    # Next-generation formats
    '.avif': 'image/avif',
    '.jp2': 'image/jp2',
    '.j2k': 'image/jp2',
    '.jpx': 'image/jp2',
    # Legacy / Windows formats
    '.bmp': 'image/bmp',
    '.ico': 'image/x-icon',
    '.pcx': 'image/x-pcx',
    '.tga': 'image/x-tga',
    '.icns': 'image/icns',
    # Professional / Scientific imaging
    '.tif': 'image/tiff',
    '.tiff': 'image/tiff',
    '.eps': 'application/postscript',
    '.dds': 'image/vnd-ms.dds',
    '.dib': 'image/dib',
    '.sgi': 'image/sgi',
    # Portable Map formats (PPM/PGM/PBM)
    '.pbm': 'image/x-portable-bitmap',
    '.pgm': 'image/x-portable-graymap',
    '.ppm': 'image/x-portable-pixmap',
    # Miscellaneous / Older formats
    '.xbm': 'image/x-xbitmap',
    '.mpo': 'image/mpo',
    '.msp': 'image/msp',
    '.im': 'image/x-pillow-im',
    '.qoi': 'image/qoi',
}
def image_to_base64_data_uri(
    file_path: str,
    *,
    fallback_mime: str = "application/octet-stream"
) -> str:
    """
    Convert a local image file to a base64-encoded data URI with the correct MIME type.
    Supports 20+ image formats (PNG, JPEG, WebP, AVIF, BMP, ICO, TIFF, etc.).

    Args:
        file_path: Path to the image file on disk.
        fallback_mime: MIME type used when the file extension is unknown.

    Returns:
        A valid data URI string (e.g., data:image/webp;base64,...).

    Raises:
        FileNotFoundError: If the file does not exist.
        OSError: If reading the file fails.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"Image file not found: {file_path}")

    extension = os.path.splitext(file_path)[1].lower()
    mime_type = _IMAGE_MIME_TYPES.get(extension, fallback_mime)
    if mime_type == fallback_mime:
        print(f"Warning: Unknown extension '{extension}' for '{file_path}'. "
              f"Using fallback MIME type: {fallback_mime}")

    try:
        with open(file_path, "rb") as img_file:
            encoded_data = base64.b64encode(img_file.read()).decode("utf-8")
    except OSError as e:
        raise OSError(f"Failed to read image file '{file_path}': {e}") from e

    return f"data:{mime_type};base64,{encoded_data}"
class PresencePenaltyProcessor(LogitsProcessor):
    """
    Apply a presence penalty: discourage generating tokens that have already appeared
    in the generated sequence (not frequency-based, but presence-based).
    This mimics OpenAI-style presence_penalty in a simple way by subtracting a fixed
    penalty from logits of any token present at least once in the generated tokens.
    """
    def __init__(self, presence_penalty: float):
        super().__init__()
        if presence_penalty < 0:
            raise ValueError("presence_penalty must be >= 0.")
        self.presence_penalty = presence_penalty

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        # input_ids shape: (batch, cur_len)
        # scores shape: (batch, vocab_size)
        batch_size = input_ids.shape[0]
        for b in range(batch_size):
            seen = set(input_ids[b].tolist())
            if len(seen) == 0:
                continue
            # Subtract penalty from logits of seen tokens
            # Note: scores[b] is (vocab_size,)
            # Efficient masking
            indices = torch.tensor(list(seen), device=scores.device, dtype=torch.long)
            # Clamp indices to valid range just in case
            indices = indices[(indices >= 0) & (indices < scores.shape[-1])]
            if indices.numel() > 0:
                scores[b, indices] -= self.presence_penalty
        return scores
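For context, this kind of processor is meant to be attached to a Hugging Face transformers generate() call through a LogitsProcessorList. A minimal usage sketch (some_hf_model and input_ids are placeholders, not objects defined in this script):

# Hypothetical usage: attach the presence penalty to a transformers generate() call
processors = LogitsProcessorList([PresencePenaltyProcessor(presence_penalty=0.5)])
output_ids = some_hf_model.generate(input_ids, logits_processor=processors, max_new_tokens=128)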
# ---------- Qdrant ----------
class QdrantVectorStore:
    def __init__(
        self,
        collection_name: str = collection_name,
        timeout: float = 120.0,
        url=URL,
    ):
        #.................................code continues.................................

As you can tell, transformers, colpali, Gradio and Qdrant are used in this RAG WEB-UI system.
It takes time, but the code itself works fine and gives reasonable answers on the CPU.
Can you guess why the GGUF model is never loaded on the GPU with the second code?
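One way to confirm from inside the script whether the GGUF weights actually land in VRAM is to watch free device memory around the Llama(...) call. A minimal sketch using only standard torch.cuda calls (torch is already imported in this script):

# Hypothetical check: compare free VRAM before and after the Llama(...) call
free_before, total = torch.cuda.mem_get_info()
# ... construct llm here ...
free_after, _ = torch.cuda.mem_get_info()
print(f"VRAM used by the load: {(free_before - free_after) / 1024**3:.2f} GiB of {total / 1024**3:.2f} GiB")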
Environment:
Windows 11 Pro 24H2
GPU: NVIDIA RTX 3090
Python 3.11
torch 2.5.1+cu124
llama_cpp_python 0.3.23

Thanks in advance.