From 1fccb783f279923bdffe85f166fdb853d8c45673 Mon Sep 17 00:00:00 2001 From: Carlos Fernandez Date: Sun, 28 Dec 2025 10:02:19 +0100 Subject: [PATCH 1/3] feat(matroska): Add VOBSUB subtitle extraction support for MKV files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, CCExtractor would only print "Error: VOBSUB not supported" when encountering VOBSUB (S_VOBSUB) subtitle tracks in Matroska files. This left users without any usable output. This commit adds full VOBSUB extraction support: - Generate proper .idx index files with timestamps and file positions - Generate proper .sub files with PS-wrapped SPU data - Correct PS Pack header with SCR derived from timestamps - Correct PES header with PTS for each subtitle - 2048-byte block alignment (standard VOBSUB format) The output is compatible with VLC, FFmpeg, and other players that support VobSub subtitle format. Tested with sample from issue #1371 - output validates correctly with FFprobe and produces identical subtitle data to mkvextract. 
Fixes #1371

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5
---
 src/lib_ccx/matroska.c | 251 +++++++++++++++++++++++++++++++++++++++--
 src/lib_ccx/matroska.h |   5 +
 2 files changed, 247 insertions(+), 9 deletions(-)

diff --git a/src/lib_ccx/matroska.c b/src/lib_ccx/matroska.c
index 077aeef0d..f08ed6143 100644
--- a/src/lib_ccx/matroska.c
+++ b/src/lib_ccx/matroska.c
@@ -1334,11 +1334,253 @@ char *ass_ssa_sentence_erase_read_order(char *text)
 	return buf;
 }
 
+/* VOBSUB support: Generate PS Pack header
+ * The PS Pack header is 14 bytes:
+ * - 4 bytes: start code (00 00 01 ba)
+ * - 6 bytes: SCR (System Clock Reference) in MPEG-2 format
+ * - 3 bytes: mux rate
+ * - 1 byte: stuffing length (0)
+ */
+static void generate_ps_pack_header(unsigned char *buf, ULLONG pts_90khz)
+{
+	// PS Pack start code
+	buf[0] = 0x00;
+	buf[1] = 0x00;
+	buf[2] = 0x01;
+	buf[3] = 0xBA;
+
+	// SCR (System Clock Reference) - use PTS as SCR base, SCR extension = 0
+	// MPEG-2 format: 01 SCR[32:30] 1 SCR[29:15] 1 SCR[14:0] 1 SCR_ext[8:0] 1
+	ULLONG scr = pts_90khz;
+	ULLONG scr_base = scr;
+	int scr_ext = 0;
+
+	buf[4] = 0x44 | ((scr_base >> 27) & 0x38) | ((scr_base >> 28) & 0x03);
+	buf[5] = (scr_base >> 20) & 0xFF;
+	buf[6] = 0x04 | ((scr_base >> 12) & 0xF8) | ((scr_base >> 13) & 0x03);
+	buf[7] = (scr_base >> 5) & 0xFF;
+	buf[8] = 0x04 | ((scr_base << 3) & 0xF8) | ((scr_ext >> 7) & 0x03);
+	buf[9] = ((scr_ext << 1) & 0xFE) | 0x01;
+
+	// Mux rate (10080 = standard DVD rate)
+	int mux_rate = 10080;
+	buf[10] = (mux_rate >> 14) & 0xFF;
+	buf[11] = (mux_rate >> 6) & 0xFF;
+	buf[12] = ((mux_rate << 2) & 0xFC) | 0x03;
+
+	// Stuffing length = 0, with marker bits
+	buf[13] = 0xF8;
+}
+
+/* VOBSUB support: Generate PES header for private stream 1
+ * Returns the header size in bytes (always 15); payload_size must be <= 0xFFFF - 9
+ */
+static int generate_pes_header(unsigned char *buf, ULLONG pts_90khz, int payload_size, int stream_id)
+{
+	// PES start code for private stream 1
+	buf[0] = 0x00;
+	buf[1] = 0x00;
+	buf[2] = 0x01;
+	buf[3] = 0xBD; // Private stream 1
+
+	// PES packet length = header data (3 + 5 for PTS) + 1 (substream ID) + payload
+	int pes_header_data_len = 5; // PTS only
+	int pes_packet_len = 3 + pes_header_data_len + 1 + payload_size;
+	buf[4] = (pes_packet_len >> 8) & 0xFF;
+	buf[5] = pes_packet_len & 0xFF;
+
+	// PES flags: MPEG-2, original
+	buf[6] = 0x81;
+	// PTS_DTS_flags = 10 (PTS only)
+	buf[7] = 0x80;
+	// PES header data length
+	buf[8] = pes_header_data_len;
+
+	// PTS (5 bytes): '0010' | PTS[32:30] | '1' | PTS[29:15] | '1' | PTS[14:0] | '1'
+	buf[9] = 0x21 | ((pts_90khz >> 29) & 0x0E);
+	buf[10] = (pts_90khz >> 22) & 0xFF;
+	buf[11] = 0x01 | ((pts_90khz >> 14) & 0xFE);
+	buf[12] = (pts_90khz >> 7) & 0xFF;
+	buf[13] = 0x01 | ((pts_90khz << 1) & 0xFE);
+
+	// Substream ID (0x20 = first VOBSUB stream)
+	buf[14] = 0x20 + stream_id;
+
+	return 15; // Total PES header size
+}
+
+/* VOBSUB support: Generate timestamp string for .idx file
+ * Format: HH:MM:SS:mmm (where mmm is milliseconds)
+ */
+static void generate_vobsub_timestamp(char *buf, size_t bufsize, ULLONG milliseconds)
+{
+	ULLONG ms = milliseconds % 1000;
+	milliseconds /= 1000;
+	ULLONG seconds = milliseconds % 60;
+	milliseconds /= 60;
+	ULLONG minutes = milliseconds % 60;
+	milliseconds /= 60;
+	ULLONG hours = milliseconds;
+
+	snprintf(buf, bufsize, "%02" LLU_M ":%02" LLU_M ":%02" LLU_M ":%03" LLU_M,
+		 hours, minutes, seconds, ms);
+}
+
+/* VOBSUB support: Save VOBSUB track to .idx and .sub files */
+static void save_vobsub_track(struct matroska_ctx *mkv_ctx, struct matroska_sub_track *track)
+{
+	if (track->sentence_count == 0)
+	{
+		mprint("\nNo VOBSUB subtitles to write");
+		return;
+	}
+
+	// Generate base filename (without extension)
+	const char *lang_to_use = track->lang_ietf ? track->lang_ietf : track->lang;
+	const char *basename = get_basename(mkv_ctx->filename);
+	size_t needed = strlen(basename) + strlen(lang_to_use) + 32;
+	char *base_filename = malloc(needed);
+	if (base_filename == NULL)
+		fatal(EXIT_NOT_ENOUGH_MEMORY, "In save_vobsub_track: Out of memory.");
+
+	if (track->lang_index == 0)
+		snprintf(base_filename, needed, "%s_%s", basename, lang_to_use);
+	else
+		snprintf(base_filename, needed, "%s_%s_" LLD, basename, lang_to_use, track->lang_index);
+
+	// Create .sub filename
+	char *sub_filename = malloc(needed + 5);
+	if (sub_filename == NULL)
+		fatal(EXIT_NOT_ENOUGH_MEMORY, "In save_vobsub_track: Out of memory.");
+	snprintf(sub_filename, needed + 5, "%s.sub", base_filename);
+
+	// Create .idx filename
+	char *idx_filename = malloc(needed + 5);
+	if (idx_filename == NULL)
+		fatal(EXIT_NOT_ENOUGH_MEMORY, "In save_vobsub_track: Out of memory.");
+	snprintf(idx_filename, needed + 5, "%s.idx", base_filename);
+
+	mprint("\nOutput files: %s, %s", idx_filename, sub_filename);
+
+	// Open .sub file
+	int sub_desc;
+#ifdef WIN32
+	sub_desc = open(sub_filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IREAD | S_IWRITE);
+#else
+	sub_desc = open(sub_filename, O_WRONLY | O_CREAT | O_TRUNC, S_IWUSR | S_IRUSR);
+#endif
+	if (sub_desc < 0)
+	{
+		mprint("\nError: Cannot create .sub file");
+		free(base_filename);
+		free(sub_filename);
+		free(idx_filename);
+		return;
+	}
+
+	// Open .idx file
+	int idx_desc;
+#ifdef WIN32
+	idx_desc = open(idx_filename, O_WRONLY | O_CREAT | O_TRUNC, S_IREAD | S_IWRITE);
+#else
+	idx_desc = open(idx_filename, O_WRONLY | O_CREAT | O_TRUNC, S_IWUSR | S_IRUSR);
+#endif
+	if (idx_desc < 0)
+	{
+		mprint("\nError: Cannot create .idx file");
+		close(sub_desc);
+		free(base_filename);
+		free(sub_filename);
+		free(idx_filename);
+		return;
+	}
+
+	// Write .idx header (from CodecPrivate)
+	if (track->header != NULL)
+		write_wrapped(idx_desc, track->header, strlen(track->header));
+
+	// Add language identifier line
+	char lang_line[128];
+	snprintf(lang_line, sizeof(lang_line), "\nid: %s, index: 0\n", lang_to_use);
+	write_wrapped(idx_desc, lang_line, strlen(lang_line));
+
+	// Block size for alignment (2048 bytes = 0x800)
+	const int VOBSUB_BLOCK_SIZE = 2048;
+
+	// Buffer for PS/PES headers and padding
+	unsigned char header_buf[32];
+	unsigned char zero_buf[VOBSUB_BLOCK_SIZE];
+	memset(zero_buf, 0, VOBSUB_BLOCK_SIZE);
+
+	ULLONG file_pos = 0;
+
+	// Write each subtitle
+	for (int i = 0; i < track->sentence_count; i++)
+	{
+		struct matroska_sub_sentence *sentence = track->sentences[i];
+
+		// PES_packet_length is a 16-bit field covering 9 header bytes plus the
+		// payload; a larger SPU would wrap the length and corrupt the stream
+		if (sentence->text_size > 0xFFFF - 9)
+		{
+			mprint("\nWarning: VOBSUB packet too large for one PES packet, skipping");
+			continue;
+		}
+		mkv_ctx->sentence_count++;
+
+		// Convert timestamp to 90kHz PTS
+		ULLONG pts_90khz = sentence->time_start * 90;
+
+		// Write timestamp entry to .idx
+		char timestamp[32];
+		generate_vobsub_timestamp(timestamp, sizeof(timestamp), sentence->time_start);
+		char idx_entry[128];
+		snprintf(idx_entry, sizeof(idx_entry), "timestamp: %s, filepos: %09" LLX_M "\n",
+			 timestamp, file_pos);
+		write_wrapped(idx_desc, idx_entry, strlen(idx_entry));
+
+		// Generate PS Pack header (14 bytes)
+		generate_ps_pack_header(header_buf, pts_90khz);
+		write_wrapped(sub_desc, (char *)header_buf, 14);
+
+		// Generate PES header (15 bytes)
+		int pes_header_len = generate_pes_header(header_buf, pts_90khz, sentence->text_size, 0);
+		write_wrapped(sub_desc, (char *)header_buf, pes_header_len);
+
+		// Write SPU data
+		write_wrapped(sub_desc, sentence->text, sentence->text_size);
+
+		// Calculate bytes written and pad to block boundary
+		ULLONG bytes_written = 14 + pes_header_len + sentence->text_size;
+		ULLONG padding_needed = VOBSUB_BLOCK_SIZE - (bytes_written % VOBSUB_BLOCK_SIZE);
+		if (padding_needed < VOBSUB_BLOCK_SIZE)
+		{
+			write_wrapped(sub_desc, (char *)zero_buf, padding_needed);
+			bytes_written += padding_needed;
+		}
+
+		file_pos += bytes_written;
+	}
+
+	close(sub_desc);
+	close(idx_desc);
+	free(base_filename);
+	free(sub_filename);
+	free(idx_filename);
+}
+
 void save_sub_track(struct matroska_ctx *mkv_ctx, struct matroska_sub_track *track)
 {
 	char *filename;
 	int desc;
 
+	// VOBSUB tracks need special handling - separate .idx and .sub files
+	if (track->codec_id == MATROSKA_TRACK_SUBTITLE_CODEC_ID_VOBSUB)
+	{
+		save_vobsub_track(mkv_ctx, track);
+		return;
+	}
+
 	if (mkv_ctx->ctx->cc_to_stdout == CCX_TRUE)
 	{
 		desc = 1; // file descriptor of stdout
@@ -1358,11 +1600,6 @@ void save_sub_track(struct matroska_ctx *mkv_ctx, struct matroska_sub_track *tra
 	if (track->header != NULL)
 		write_wrapped(desc, track->header, strlen(track->header));
 
-	if (track->codec_id == MATROSKA_TRACK_SUBTITLE_CODEC_ID_VOBSUB)
-	{
-		mprint("\nError: VOBSUB not supported");
-	}
-
 	for (int i = 0; i < track->sentence_count; i++)
 	{
 		struct matroska_sub_sentence *sentence = track->sentences[i];
@@ -1497,10 +1734,6 @@ void save_sub_track(struct matroska_ctx *mkv_ctx, struct matroska_sub_track *tra
 			free(timestamp_start);
 			free(timestamp_end);
 		}
-		else if (track->codec_id == MATROSKA_TRACK_SUBTITLE_CODEC_ID_VOBSUB)
-		{
-			// TODO: Add support for VOBSUB
-		}
 	}
 }
 
diff --git a/src/lib_ccx/matroska.h b/src/lib_ccx/matroska.h
index 7ea8b82a5..0a6dde7ab 100644
--- a/src/lib_ccx/matroska.h
+++ b/src/lib_ccx/matroska.h
@@ -5,26 +5,31 @@
 #if (defined(WIN32) || defined(_WIN32_WCE)) && (defined(__MINGW32__) || !defined(__GNUC__))
 #define LLD_M "I64d"
 #define LLU_M "I64u"
+#define LLX_M "I64x"
 #define LLD "%I64d"
 #define LLU "%I64u"
 #elif defined(__SYMBIAN32__)
 #define LLD_M "d"
 #define LLU_M "u"
+#define LLX_M "x"
 #define LLD "%d"
 #define LLU "%u"
 #elif defined(__DARWIN__) || defined(__APPLE__)
 #define LLD_M "lld"
 #define LLU_M "llu"
+#define LLX_M "llx"
 #define LLD "%lld"
 #define LLU "%llu"
 #elif defined(_LP64) /* Unix 64 bits */
 #define LLD_M "ld"
 #define LLU_M "lu"
+#define LLX_M "lx"
 #define LLD "%ld"
 #define LLU "%lu"
 #else /* Unix 32 bits */
 #define LLD_M "lld"
 #define LLU_M "llu"
+#define LLX_M "llx"
 #define LLD "%lld"
 #define LLU "%llu"
 #endif

From 6f2a73d706bf09a2f97289bdbbf14a25fb2f61af Mon Sep 17 00:00:00 2001
From: Carlos Fernandez Date: Sun, 28 Dec 2025 10:26:41 +0100 Subject: [PATCH 2/3] docs: Add VOBSUB extraction documentation and subtile-ocr Dockerfile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add docs/VOBSUB.md explaining the VOBSUB extraction workflow - Add tools/vobsubocr/Dockerfile for building subtile-ocr OCR tool - Document how to convert VOBSUB (.idx/.sub) to SRT using OCR The Dockerfile uses subtile-ocr (https://github.com/gwen-lg/subtile-ocr), an actively maintained fork of vobsubocr with better accuracy. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- docs/VOBSUB.md | 129 +++++++++++++++++++++++++++++++++++++ tools/vobsubocr/Dockerfile | 35 ++++++++++ 2 files changed, 164 insertions(+) create mode 100644 docs/VOBSUB.md create mode 100644 tools/vobsubocr/Dockerfile diff --git a/docs/VOBSUB.md b/docs/VOBSUB.md new file mode 100644 index 000000000..879c5614e --- /dev/null +++ b/docs/VOBSUB.md @@ -0,0 +1,129 @@ +# VOBSUB Subtitle Extraction from MKV Files + +CCExtractor supports extracting VOBSUB (S_VOBSUB) subtitles from Matroska (MKV) containers. VOBSUB is an image-based subtitle format originally from DVD video. + +## Overview + +VOBSUB subtitles consist of two files: +- `.idx` - Index file containing metadata, palette, and timestamp/position entries +- `.sub` - Binary file containing the actual subtitle bitmap data in MPEG Program Stream format + +## Basic Usage + +```bash +ccextractor movie.mkv +``` + +This will extract all VOBSUB tracks and create paired `.idx` and `.sub` files: +- `movie_eng.idx` + `movie_eng.sub` (first English track) +- `movie_eng_1.idx` + `movie_eng_1.sub` (second English track, if present) +- etc. + +## Converting VOBSUB to SRT (Text) + +Since VOBSUB subtitles are images, you need OCR (Optical Character Recognition) to convert them to text-based formats like SRT. 
+
+### Using subtile-ocr (Recommended)
+
+[subtile-ocr](https://github.com/gwen-lg/subtile-ocr) is an actively maintained Rust tool that provides accurate OCR conversion.
+
+#### Option 1: Docker (Easiest)
+
+We provide a Dockerfile that builds subtile-ocr with all dependencies:
+
+```bash
+# Build the Docker image (one-time)
+cd tools/vobsubocr
+docker build -t subtile-ocr .
+
+# Extract VOBSUB from MKV
+ccextractor movie.mkv
+
+# Convert to SRT using OCR
+docker run --rm -v $(pwd):/data subtile-ocr -l eng -o /data/movie_eng.srt /data/movie_eng.idx
+```
+
+#### Option 2: Install subtile-ocr Natively
+
+If you have Rust and Tesseract development libraries installed:
+
+```bash
+# Install dependencies (Ubuntu/Debian)
+sudo apt-get install libleptonica-dev libtesseract-dev tesseract-ocr tesseract-ocr-eng
+
+# Install subtile-ocr
+cargo install --git https://github.com/gwen-lg/subtile-ocr
+
+# Convert
+subtile-ocr -l eng -o movie_eng.srt movie_eng.idx
+```
+
+### subtile-ocr Options
+
+| Option | Description |
+|--------|-------------|
+| `-l, --lang <LANG>` | Tesseract language code (required). Examples: `eng`, `fra`, `deu`, `chi_sim` |
+| `-o, --output <FILE>` | Output SRT file (stdout if not specified) |
+| `-t, --threshold <0.0-1.0>` | Binarization threshold (default: 0.6) |
+| `-d, --dpi <DPI>` | Image DPI for OCR (default: 150) |
+| `--dump` | Save processed subtitle images as PNG files |
+
+### Language Codes
+
+Install additional Tesseract language packs as needed:
+
+```bash
+# Examples
+sudo apt-get install tesseract-ocr-fra  # French
+sudo apt-get install tesseract-ocr-deu  # German
+sudo apt-get install tesseract-ocr-spa  # Spanish
+sudo apt-get install tesseract-ocr-chi-sim  # Simplified Chinese
+```
+
+## Technical Details
+
+### .idx File Format
+
+The index file contains:
+1. Header with metadata (size, palette, alignment settings)
+2. Language identifier line
+3. Timestamp entries with file positions
+
+Example:
+```
+# VobSub index file, v7 (do not modify this line!)
+size: 720x576 +palette: 000000, 828282, ... + +id: eng, index: 0 +timestamp: 00:01:12:920, filepos: 000000000 +timestamp: 00:01:18:640, filepos: 000000800 +... +``` + +### .sub File Format + +The binary file contains MPEG Program Stream packets: +- Each subtitle is wrapped in a PS Pack header (14 bytes) + PES header (15 bytes) +- Subtitles are aligned to 2048-byte boundaries +- Contains raw SPU (SubPicture Unit) bitmap data + +## Troubleshooting + +### Empty output files +- Ensure the MKV file actually contains VOBSUB tracks (check with `mediainfo` or `ffprobe`) +- CCExtractor will report "No VOBSUB subtitles to write" if the track is empty + +### OCR quality issues +- Try adjusting the `-t` threshold parameter +- Ensure the correct language pack is installed +- Use `--dump` to inspect the processed images + +### Docker permission issues +- The output files may be owned by root; use `sudo chown` to fix ownership +- Or run Docker with `--user $(id -u):$(id -g)` + +## See Also + +- [OCR.md](OCR.md) - General OCR support in CCExtractor +- [subtile-ocr GitHub](https://github.com/gwen-lg/subtile-ocr) - OCR tool documentation diff --git a/tools/vobsubocr/Dockerfile b/tools/vobsubocr/Dockerfile new file mode 100644 index 000000000..b3ba06731 --- /dev/null +++ b/tools/vobsubocr/Dockerfile @@ -0,0 +1,35 @@ +# Dockerfile for subtile-ocr - VOBSUB to SRT converter +# Uses subtile-ocr, an actively maintained fork of vobsubocr +# https://github.com/gwen-lg/subtile-ocr + +FROM ubuntu:22.04 + +# Prevent interactive prompts during package installation +ENV DEBIAN_FRONTEND=noninteractive + +# Install build dependencies +RUN apt-get update && apt-get install -y \ + build-essential \ + clang \ + pkg-config \ + libleptonica-dev \ + libtesseract-dev \ + tesseract-ocr \ + tesseract-ocr-eng \ + curl \ + git \ + && rm -rf /var/lib/apt/lists/* + +# Install Rust +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y +ENV PATH="/root/.cargo/bin:${PATH}" + +# Install 
subtile-ocr from git +RUN cargo install --git https://github.com/gwen-lg/subtile-ocr + +# Create working directory +WORKDIR /data + +# Default command shows help +ENTRYPOINT ["subtile-ocr"] +CMD ["--help"] From 9d14766b0ddd0650a853dd5d053ee2cbb592eb7a Mon Sep 17 00:00:00 2001 From: Carlos Fernandez Date: Sun, 28 Dec 2025 11:32:48 +0100 Subject: [PATCH 3/3] fix: Use #define instead of const int for VOBSUB_BLOCK_SIZE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MSVC doesn't support variable-length arrays (VLAs). The const int declaration wasn't being treated as a compile-time constant, causing Windows build failure with errors C2057, C2466, C2133. Changed to #define which is a true compile-time constant. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/lib_ccx/matroska.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/lib_ccx/matroska.c b/src/lib_ccx/matroska.c index f08ed6143..1b5cfe912 100644 --- a/src/lib_ccx/matroska.c +++ b/src/lib_ccx/matroska.c @@ -1427,6 +1427,7 @@ static void generate_vobsub_timestamp(char *buf, size_t bufsize, ULLONG millisec } /* VOBSUB support: Save VOBSUB track to .idx and .sub files */ +#define VOBSUB_BLOCK_SIZE 2048 static void save_vobsub_track(struct matroska_ctx *mkv_ctx, struct matroska_sub_track *track) { if (track->sentence_count == 0) @@ -1504,9 +1505,6 @@ static void save_vobsub_track(struct matroska_ctx *mkv_ctx, struct matroska_sub_ snprintf(lang_line, sizeof(lang_line), "\nid: %s, index: 0\n", lang_to_use); write_wrapped(idx_desc, lang_line, strlen(lang_line)); - // Block size for alignment (2048 bytes = 0x800) - const int VOBSUB_BLOCK_SIZE = 2048; - // Buffer for PS/PES headers and padding unsigned char header_buf[32]; unsigned char zero_buf[VOBSUB_BLOCK_SIZE];