Hi,
Thank you so much for your great work.
I played with your sample code. The model is loaded on the GPU and works without any problem.
# Import necessary libraries
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Qwen3VLChatHandler
import base64
import os
# --- Model Configuration ---
# Define the path to the main model file
MODEL_PATH = r"./Qwen3VL-8B-Instruct-Q8_0.gguf"
# Define the path to the multi-modal projector file
MMPROJ_PATH = r"./mmproj-Qwen3VL-8B-Instruct-Q8_0.gguf"
# --- Initialize the Llama Model ---
llm = Llama(
    model_path=MODEL_PATH,
    # Set up the chat handler for Qwen3-VL, specifying the projector path
    chat_handler=Qwen3VLChatHandler(
        clip_model_path=MMPROJ_PATH,
        #force_reasoning=True,
        verbose=False,
        image_min_tokens=1024,  # Note: Qwen-VL models require at minimum 1024 image tokens to function correctly on bbox grounding tasks
    ),
    n_gpu_layers=-1,  # Offload all layers to the GPU
    n_ctx=20480,  # Set the context window size
    swa_full=True,
    verbose=True,  #False,
)
import atexit

@atexit.register
def free_model():
    llm._sampler.close()
    llm.close()
# Comprehensive MIME type mapping (updated as of 2025)
# Based on Pillow 10.x+ "Fully Supported" (Read & Write) formats
# Reference: IANA official media types + common real-world usage
# See: https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html
_IMAGE_MIME_TYPES = {
    # Most common formats
    '.png': 'image/png',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.gif': 'image/gif',
    '.webp': 'image/webp',
    # Next-generation formats
    '.avif': 'image/avif',
    '.jp2': 'image/jp2',
    '.j2k': 'image/jp2',
    '.jpx': 'image/jp2',
    # Legacy / Windows formats
    '.bmp': 'image/bmp',
    '.ico': 'image/x-icon',
    '.pcx': 'image/x-pcx',
    '.tga': 'image/x-tga',
    '.icns': 'image/icns',
    # Professional / Scientific imaging
    '.tif': 'image/tiff',
    '.tiff': 'image/tiff',
    '.eps': 'application/postscript',
    '.dds': 'image/vnd-ms.dds',
    '.dib': 'image/dib',
    '.sgi': 'image/sgi',
    # Portable Map formats (PPM/PGM/PBM)
    '.pbm': 'image/x-portable-bitmap',
    '.pgm': 'image/x-portable-graymap',
    '.ppm': 'image/x-portable-pixmap',
    # Miscellaneous / Older formats
    '.xbm': 'image/x-xbitmap',
    '.mpo': 'image/mpo',
    '.msp': 'image/msp',
    '.im': 'image/x-pillow-im',
    '.qoi': 'image/qoi',
}
def image_to_base64_data_uri(
    file_path: str,
    *,
    fallback_mime: str = "application/octet-stream"
) -> str:
    """
    Convert a local image file to a base64-encoded data URI with the correct MIME type.
    Supports 20+ image formats (PNG, JPEG, WebP, AVIF, BMP, ICO, TIFF, etc.).

    Args:
        file_path: Path to the image file on disk.
        fallback_mime: MIME type used when the file extension is unknown.

    Returns:
        A valid data URI string (e.g., data:image/webp;base64,...).

    Raises:
        FileNotFoundError: If the file does not exist.
        OSError: If reading the file fails.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"Image file not found: {file_path}")

    extension = os.path.splitext(file_path)[1].lower()
    mime_type = _IMAGE_MIME_TYPES.get(extension, fallback_mime)
    if mime_type == fallback_mime:
        print(f"Warning: Unknown extension '{extension}' for '{file_path}'. "
              f"Using fallback MIME type: {fallback_mime}")

    try:
        with open(file_path, "rb") as img_file:
            encoded_data = base64.b64encode(img_file.read()).decode("utf-8")
    except OSError as e:
        raise OSError(f"Failed to read image file '{file_path}': {e}") from e

    return f"data:{mime_type};base64,{encoded_data}"
# --- Main Logic for Image Processing ---
# 1. Create a list containing all image paths
image_paths = [
    r"c:/document_root/vlmrag/images/0000_ksisj.jpeg",
    r'c:/document_root/vlmrag/images/0007_kjeoib.jpeg',
    r'c:/document_root/vlmrag/images/0001_poiqwnd.jpeg',
    r'c:/document_root/vlmrag/images/0002_hhsgcf.jpeg',
    r'c:/document_root/vlmrag/images/0010_wdvwd.jpeg',
    # Add more image paths here if needed
]
# 2. Create an empty list to store the message objects (images and text)
images_messages = []
# 3. Loop through the image path list, convert each image to a Data URI,
# and add it to the message list as an image_url object.
for path in image_paths:
    data_uri = image_to_base64_data_uri(path)
    images_messages.append({"type": "image_url", "image_url": {"url": data_uri}})
# 4. Add the final text prompt at the end of the list
images_messages.append({"type": "text", "text": "GGUF形式のファイルは使えますか?。"})  # "Can GGUF-format files be used?"
# 5. Use this list to build the chat_completion request
res = llm.create_chat_completion(
    messages=[
        #{"role": "system", "content": "You are a highly accurate vision-language assistant. Provide detailed, precise, and well-structured image descriptions."},
        {"role": "system", "content": "あなたはAI博士です。"},  # "You are Dr. AI."
        # The user's content is the list containing both images and text
        {"role": "user", "content": images_messages}
    ]
)
# Print the assistant's response
print(res["choices"][0]["message"]["content"])

However, when the same code is assembled into my WEB-UI code, the model is never loaded on the GPU.
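For reference, offloading can also be double-checked from the Llama object itself. This is a minimal sketch, assuming the instance keeps the requested value in model_params.n_gpu_layers (as recent llama-cpp-python versions do), in addition to the "offloaded ... layers to GPU" lines printed in the verbose load log:

# Hypothetical sanity check, run right after constructing llm
print("requested n_gpu_layers:", llm.model_params.n_gpu_layers)  # a large value when -1 was passed, 0 when no offload was requested

The full WEB-UI code follows: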
import os, glob, sys, pickle, codecs
import time
from typing import List, Dict, Any, Optional
import torch
import numpy as np
from PIL import Image
#from transformers.utils.import_utils import is_flash_attn_2_available
from colpali_engine.models import ColQwen2_5, ColQwen2_5_Processor
from transformers import LogitsProcessor, LogitsProcessorList
from qdrant_client import QdrantClient
from qdrant_client.models import (
    Distance, VectorParams, PointStruct,
    MultiVectorConfig, MultiVectorComparator
)
import gradio as gr
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Qwen3VLChatHandler
import base64
os.environ["NO_PROXY"] = "localhost, 127.0.0.1/8, ::1"
MODEL_PATH = r"./Qwen3VL-8B-Instruct-Q8_0.gguf"
MMPROJ_PATH = r"./mmproj-Qwen3VL-8B-Instruct-Q8_0.gguf"
SRV_IP = '10.xx.xxx.27' # Gradio IP
SRV_PT = 8080 # Gradio port
URL = '127.0.0.1:6333' # IP & port for Qdrant server
# Declared up front so that these live in the global scope
SETTING_VISIBLE = True
DOCUMENT_ROOT = 'c:/document_root'
pdf_paths = []
img_path = None
collection_name = ""
all_collections = []
paths = []
KN = 5
print(f"Loading VLM model: {MODEL_PATH}")
t0 = time.perf_counter()
# Load the GGUF-format VLM model itself (used for generation)
llm = Llama(
    model_path=MODEL_PATH,
    # Set up the chat handler for Qwen3-VL, specifying the projector path
    chat_handler=Qwen3VLChatHandler(
        clip_model_path=MMPROJ_PATH,
        #force_reasoning=True,
        verbose=False,
        image_min_tokens=1024,  # Note: Qwen-VL models require at minimum 1024 image tokens to function correctly on bbox grounding tasks
    ),
    n_gpu_layer=-1,
    n_ctx=15360,  #1000,
    swa_full=True,
    verbose=True,  #False,
)
import atexit

@atexit.register
def free_model():
    llm._sampler.close()
    llm.close()
print(f"VLM loaded in {time.perf_counter() - t0:.2f}s")
#=====================
# Constants for the file formats needed when converting images to base64
# Comprehensive MIME type mapping (updated as of 2025)
# Based on Pillow 10.x+ "Fully Supported" (Read & Write) formats
# Reference: IANA official media types + common real-world usage
# See: https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html
_IMAGE_MIME_TYPES = {
    # Most common formats
    '.png': 'image/png',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.gif': 'image/gif',
    '.webp': 'image/webp',
    # Next-generation formats
    '.avif': 'image/avif',
    '.jp2': 'image/jp2',
    '.j2k': 'image/jp2',
    '.jpx': 'image/jp2',
    # Legacy / Windows formats
    '.bmp': 'image/bmp',
    '.ico': 'image/x-icon',
    '.pcx': 'image/x-pcx',
    '.tga': 'image/x-tga',
    '.icns': 'image/icns',
    # Professional / Scientific imaging
    '.tif': 'image/tiff',
    '.tiff': 'image/tiff',
    '.eps': 'application/postscript',
    '.dds': 'image/vnd-ms.dds',
    '.dib': 'image/dib',
    '.sgi': 'image/sgi',
    # Portable Map formats (PPM/PGM/PBM)
    '.pbm': 'image/x-portable-bitmap',
    '.pgm': 'image/x-portable-graymap',
    '.ppm': 'image/x-portable-pixmap',
    # Miscellaneous / Older formats
    '.xbm': 'image/x-xbitmap',
    '.mpo': 'image/mpo',
    '.msp': 'image/msp',
    '.im': 'image/x-pillow-im',
    '.qoi': 'image/qoi',
}
def image_to_base64_data_uri(
    file_path: str,
    *,
    fallback_mime: str = "application/octet-stream"
) -> str:
    """
    Convert a local image file to a base64-encoded data URI with the correct MIME type.
    Supports 20+ image formats (PNG, JPEG, WebP, AVIF, BMP, ICO, TIFF, etc.).

    Args:
        file_path: Path to the image file on disk.
        fallback_mime: MIME type used when the file extension is unknown.

    Returns:
        A valid data URI string (e.g., data:image/webp;base64,...).

    Raises:
        FileNotFoundError: If the file does not exist.
        OSError: If reading the file fails.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"Image file not found: {file_path}")

    extension = os.path.splitext(file_path)[1].lower()
    mime_type = _IMAGE_MIME_TYPES.get(extension, fallback_mime)
    if mime_type == fallback_mime:
        print(f"Warning: Unknown extension '{extension}' for '{file_path}'. "
              f"Using fallback MIME type: {fallback_mime}")

    try:
        with open(file_path, "rb") as img_file:
            encoded_data = base64.b64encode(img_file.read()).decode("utf-8")
    except OSError as e:
        raise OSError(f"Failed to read image file '{file_path}': {e}") from e

    return f"data:{mime_type};base64,{encoded_data}"
class PresencePenaltyProcessor(LogitsProcessor):
    """
    Apply a presence penalty: discourage generating tokens that have already appeared
    in the generated sequence (not frequency-based, but presence-based).
    This mimics OpenAI-style presence_penalty in a simple way by subtracting a fixed
    penalty from logits of any token present at least once in the generated tokens.
    """
    def __init__(self, presence_penalty: float):
        super().__init__()
        if presence_penalty < 0:
            raise ValueError("presence_penalty must be >= 0.")
        self.presence_penalty = presence_penalty

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        # input_ids shape: (batch, cur_len)
        # scores shape: (batch, vocab_size)
        batch_size = input_ids.shape[0]
        for b in range(batch_size):
            seen = set(input_ids[b].tolist())
            if len(seen) == 0:
                continue
            # Subtract penalty from logits of seen tokens
            # Note: scores[b] is (vocab_size,)
            # Efficient masking
            indices = torch.tensor(list(seen), device=scores.device, dtype=torch.long)
            # Clamp indices to valid range just in case
            indices = indices[(indices >= 0) & (indices < scores.shape[-1])]
            if indices.numel() > 0:
                scores[b, indices] -= self.presence_penalty
        return scores
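For context, this kind of processor is meant to be attached to a Hugging Face transformers generate() call through a LogitsProcessorList. A minimal usage sketch (some_hf_model and input_ids are placeholders, not objects defined in this script):

# Hypothetical usage: attach the presence penalty to a transformers generate() call
processors = LogitsProcessorList([PresencePenaltyProcessor(presence_penalty=0.5)])
output_ids = some_hf_model.generate(input_ids, logits_processor=processors, max_new_tokens=128)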
# ---------- Qdrant ----------
class QdrantVectorStore:
    def __init__(
        self,
        collection_name: str = collection_name,
        timeout: float = 120.0,
        url=URL,
    ):
        #.................................code continues.................................

As you can tell, transformers, colpali, Gradio and Qdrant are used in this RAG WEB-UI system.
It takes time, but the code itself works fine and gives reasonable answers on the CPU.
Can you guess why the GGUF model is never loaded on the GPU with the second code?
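One way to confirm from inside the script whether the GGUF weights actually land in VRAM is to watch free device memory around the Llama(...) call. A minimal sketch using only standard torch.cuda calls (torch is already imported in this script):

# Hypothetical check: compare free VRAM before and after the Llama(...) call
free_before, total = torch.cuda.mem_get_info()
# ... construct llm here ...
free_after, _ = torch.cuda.mem_get_info()
print(f"VRAM used by the load: {(free_before - free_after) / 1024**3:.2f} GiB of {total / 1024**3:.2f} GiB")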
Environment:
Windows 11 Pro 24H2
GPU: NVIDIA RTX 3090
Python 3.11
torch 2.5.1+cu124
llama_cpp_python 0.3.23

Thanks in advance.