From da3dc52b45dee5388cd17971f05ee6aaa5ede5fd Mon Sep 17 00:00:00 2001 From: Carlos Fernandez Date: Sun, 21 Dec 2025 08:05:14 +0100 Subject: [PATCH 1/3] fix(708): Support Korean EUC-KR encoding in CEA-708 decoder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Korean broadcasts use EUC-KR encoding (variable-width) in CEA-708 captions, where ASCII is 1 byte and Korean characters are 2 bytes. The decoder was always writing 2 bytes per character (UTF-16BE style), causing NULL bytes to be inserted before every ASCII character. Changes: - Add is_utf16_charset() to detect fixed-width 16-bit encodings - Modify write_char() to accept use_utf16 flag: - true: Always 2 bytes (UTF-16BE for Japanese, issue #1451) - false: 1 byte for ASCII, 2 bytes for extended (EUC-KR for Korean) - Detect charset type in write_row() before building output buffer This fixes Korean subtitle extraction when using --service "1[EUC-KR]" while maintaining compatibility with Japanese UTF-16BE (issue #1451). Closes #1065 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/rust/src/decoder/output.rs | 98 ++++++++++++++++++++++++++----- src/rust/src/decoder/tv_screen.rs | 21 ++++++- 2 files changed, 102 insertions(+), 17 deletions(-) diff --git a/src/rust/src/decoder/output.rs b/src/rust/src/decoder/output.rs index 02e0356ba..20ee03133 100644 --- a/src/rust/src/decoder/output.rs +++ b/src/rust/src/decoder/output.rs @@ -85,14 +85,38 @@ impl<'a> Writer<'a> { /// Write the symbol to the provided buffer /// -/// Always writes 2 bytes for consistent UTF-16BE encoding. -/// Previously, this function wrote 1 byte for ASCII characters and 2 bytes -/// for non-ASCII, creating an invalid mix that encoding conversion couldn't -/// handle properly. This caused garbled output with Japanese/Chinese characters -/// (issue #1451). -pub fn write_char(sym: &dtvcc_symbol, buf: &mut Vec) { - buf.push((sym.sym >> 8) as u8); - buf.push((sym.sym & 0xff) as u8); +/// The `use_utf16` parameter controls the output format: +/// - `true`: Always writes 2 bytes (UTF-16BE format). Use for UTF-16/UCS-2 charsets. +/// - `false`: Writes 1 byte for ASCII (high byte == 0), 2 bytes for extended chars. +/// Use for variable-width encodings like EUC-KR, CP949, Shift-JIS, etc. +/// +/// Issue #1451: Japanese/Chinese with UTF-16BE need 2 bytes for all characters. +/// Issue #1065: Korean with EUC-KR needs 1 byte for ASCII, 2 bytes for Korean. +pub fn write_char(sym: &dtvcc_symbol, buf: &mut Vec, use_utf16: bool) { + let high = (sym.sym >> 8) as u8; + let low = (sym.sym & 0xff) as u8; + + if use_utf16 { + // UTF-16BE: Always write 2 bytes + buf.push(high); + buf.push(low); + } else { + // Variable-width: Only write high byte if non-zero + if high != 0 { + buf.push(high); + } + buf.push(low); + } +} + +/// Check if a charset name indicates UTF-16 or UCS-2 encoding +/// +/// These are fixed-width 16-bit encodings where even ASCII needs 2 bytes. +pub fn is_utf16_charset(charset: &str) -> bool { + let upper = charset.to_uppercase(); + upper.contains("UTF-16") || upper.contains("UTF16") || + upper.contains("UCS-2") || upper.contains("UCS2") || + upper.contains("UTF_16") || upper.contains("UCS_2") } /// Convert from CEA-708 color representation to hex code @@ -114,27 +138,71 @@ mod tests { use super::*; #[test] - fn test_write_char() { + fn test_write_char_utf16_mode() { let mut buf = Vec::new(); - // Write ASCII symbol - UTF-16BE always uses 2 bytes - // 'A' (0x41) becomes [0x00, 0x41] in UTF-16BE + // UTF-16 mode: ASCII symbol 'A' (0x41) becomes [0x00, 0x41] let sym = dtvcc_symbol { sym: 0x41, init: 0 }; - write_char(&sym, &mut buf); + write_char(&sym, &mut buf, true); assert_eq!(buf, vec![0x00, 0x41]); buf.clear(); - // Write non-ASCII symbol (e.g., Japanese character) - // Already 16-bit, writes as [high_byte, low_byte] + // UTF-16 mode: Non-ASCII symbol writes as [high_byte, low_byte] let sym = dtvcc_symbol { sym: 0x1234, init: 0, }; - write_char(&sym, &mut buf); + write_char(&sym, &mut buf, true); assert_eq!(buf, vec![0x12, 0x34]); } + #[test] + fn test_write_char_variable_width_mode() { + let mut buf = Vec::new(); + + // Variable-width mode: ASCII symbol 'A' (0x41) becomes [0x41] (1 byte) + let sym = dtvcc_symbol { sym: 0x41, init: 0 }; + write_char(&sym, &mut buf, false); + assert_eq!(buf, vec![0x41]); + + buf.clear(); + + // Variable-width mode: Korean EUC-KR char becomes [high, low] (2 bytes) + // Example: Korean '인' = 0xC0CE in EUC-KR + let sym = dtvcc_symbol { + sym: 0xC0CE, + init: 0, + }; + write_char(&sym, &mut buf, false); + assert_eq!(buf, vec![0xC0, 0xCE]); + + buf.clear(); + + // Variable-width mode: Space (0x20) becomes [0x20] (1 byte, no NUL) + let sym = dtvcc_symbol { sym: 0x20, init: 0 }; + write_char(&sym, &mut buf, false); + assert_eq!(buf, vec![0x20]); + } + + #[test] + fn test_is_utf16_charset() { + // Should return true for UTF-16 variants + assert!(is_utf16_charset("UTF-16BE")); + assert!(is_utf16_charset("UTF-16LE")); + assert!(is_utf16_charset("utf-16")); + assert!(is_utf16_charset("UTF16")); + assert!(is_utf16_charset("UCS-2")); + assert!(is_utf16_charset("UCS2")); + + // Should return false for variable-width encodings + assert!(!is_utf16_charset("EUC-KR")); + assert!(!is_utf16_charset("CP949")); + assert!(!is_utf16_charset("Shift-JIS")); + assert!(!is_utf16_charset("UTF-8")); + assert!(!is_utf16_charset("ISO-8859-1")); + } + #[test] fn test_color_to_hex() { assert_eq!(color_to_hex(0b00_00_00), (0, 0, 0)); // Black diff --git a/src/rust/src/decoder/tv_screen.rs b/src/rust/src/decoder/tv_screen.rs index aede8e608..7ee5ec532 100644 --- a/src/rust/src/decoder/tv_screen.rs +++ b/src/rust/src/decoder/tv_screen.rs @@ -13,7 +13,7 @@ use std::{ffi::CStr, fs::File}; #[cfg(windows)] use crate::bindings::_get_osfhandle; -use super::output::{color_to_hex, write_char, Writer}; +use super::output::{color_to_hex, is_utf16_charset, write_char, Writer}; use super::timing::{get_scc_time_str, get_time_str}; use super::{CCX_DTVCC_SCREENGRID_COLUMNS, CCX_DTVCC_SCREENGRID_ROWS}; use crate::{ @@ -177,6 +177,23 @@ impl dtvcc_tv_screen { let (first, last) = self.get_write_interval(row_index); debug!("First: {first}, Last: {last}"); + // Determine if we should use UTF-16 mode (2 bytes for all chars) or + // variable-width mode (1 byte for ASCII, 2 bytes for extended chars). + // UTF-16/UCS-2 encodings require 2 bytes even for ASCII. + // Variable-width encodings (EUC-KR, CP949, Shift-JIS, etc.) use 1 byte for ASCII. + let use_utf16 = if !writer.writer_ctx.charset.is_null() { + let charset = unsafe { + CStr::from_ptr(writer.writer_ctx.charset) + .to_str() + .unwrap_or("") + }; + is_utf16_charset(charset) + } else { + // No charset specified - default to variable-width for backward compatibility + // with raw byte output (no encoding conversion) + false + }; + for i in 0..last + 1 { if use_colors { self.change_pen_color( @@ -219,7 +236,7 @@ impl dtvcc_tv_screen { if i < first { buf.push(b' '); } else { - write_char(&self.chars[row_index][i], &mut buf) + write_char(&self.chars[row_index][i], &mut buf, use_utf16) } } // there can be unclosed tags or colors after the last symbol in a row From d0caf23a82af9d07c2f264860dc9d94273919e39 Mon Sep 17 00:00:00 2001 From: Carlos Fernandez Date: Sun, 21 Dec 2025 08:35:13 +0100 Subject: [PATCH 2/3] fix(timing): Use i64 instead of c_long for Windows compatibility MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Rust FFI functions were using c_long for PTS/FTS timestamps, but: - C code uses LLONG (int64_t, 64 bits on all platforms) - Rust c_long is 32 bits on Windows, 64 bits on Linux This caused timestamp truncation on Windows when PTS values exceeded 2^31 (~24 days at 90kHz), resulting in wrong subtitle timestamps. For example, a file with Min PTS of 23:50:45 (7,726,090,500 ticks) would have its PTS truncated, breaking the teletext delta calculation that normalizes timestamps to start at 0. Changes: - ccxr_add_current_pts: pts parameter i64 - ccxr_set_current_pts: pts parameter i64 - ccxr_get_fts: return type i64 - ccxr_get_visible_end: return type i64 - ccxr_get_visible_start: return type i64 - ccxr_get_fts_max: return type i64 - ccxr_print_mstime_static: mstime parameter i64 - fts_at_gop_start: extern static i64 Fixes tests 18 and 19 on Windows CI which showed raw PTS timestamps (23:50:46) instead of normalized timestamps (00:00:00). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/rust/src/decoder/output.rs | 9 ++++++--- src/rust/src/es/pic.rs | 6 +----- src/rust/src/lib.rs | 6 +++--- src/rust/src/libccxr_exports/time.rs | 28 ++++++++++++++-------------- 4 files changed, 24 insertions(+), 25 deletions(-) diff --git a/src/rust/src/decoder/output.rs b/src/rust/src/decoder/output.rs index 20ee03133..7cc2b6a71 100644 --- a/src/rust/src/decoder/output.rs +++ b/src/rust/src/decoder/output.rs @@ -114,9 +114,12 @@ pub fn write_char(sym: &dtvcc_symbol, buf: &mut Vec, use_utf16: bool) { /// These are fixed-width 16-bit encodings where even ASCII needs 2 bytes. pub fn is_utf16_charset(charset: &str) -> bool { let upper = charset.to_uppercase(); - upper.contains("UTF-16") || upper.contains("UTF16") || - upper.contains("UCS-2") || upper.contains("UCS2") || - upper.contains("UTF_16") || upper.contains("UCS_2") + upper.contains("UTF-16") + || upper.contains("UTF16") + || upper.contains("UCS-2") + || upper.contains("UCS2") + || upper.contains("UTF_16") + || upper.contains("UCS_2") } /// Convert from CEA-708 color representation to hex code diff --git a/src/rust/src/es/pic.rs b/src/rust/src/es/pic.rs index d9bfea66a..89c3344b9 100644 --- a/src/rust/src/es/pic.rs +++ b/src/rust/src/es/pic.rs @@ -277,12 +277,8 @@ pub unsafe fn read_pic_info( // set_fts() is not called for each picture when use_gop_as_pts == 1. if ccx_options.use_gop_as_pts == 1 { // Calculate current FTS based on GOP start time + frame offset - // Cast fts_at_gop_start to i64 for cross-platform compatibility (c_long is i32 on Windows) let frame_offset_ms = (dec_ctx.frames_since_last_gop as f64 * 1000.0 / current_fps) as i64; - #[allow(clippy::unnecessary_cast)] - { - (*dec_ctx.timing).fts_now = (fts_at_gop_start as i64) + frame_offset_ms; - } + (*dec_ctx.timing).fts_now = fts_at_gop_start + frame_offset_ms; // Update fts_max if needed if (*dec_ctx.timing).fts_now > (*dec_ctx.timing).fts_max { diff --git a/src/rust/src/lib.rs b/src/rust/src/lib.rs index 4e453f4e3..9f267c7b3 100644 --- a/src/rust/src/lib.rs +++ b/src/rust/src/lib.rs @@ -51,7 +51,7 @@ use std::os::raw::{c_uchar, c_void}; use std::{ ffi::CStr, io::Write, - os::raw::{c_char, c_double, c_int, c_long, c_uint}, + os::raw::{c_char, c_double, c_int, c_uint}, }; // Mock data for rust unit tests @@ -67,7 +67,7 @@ cfg_if! { static mut frames_since_ref_time: c_int = 0; static mut total_frames_count: c_uint = 0; - static mut fts_at_gop_start: c_long = 0; + static mut fts_at_gop_start: i64 = 0; static mut gop_rollover: c_int = 0; static mut pts_big_change: c_uint = 0; @@ -144,7 +144,7 @@ extern "C" { static mut total_frames_count: c_uint; static mut gop_time: gop_time_code; static mut first_gop_time: gop_time_code; - static mut fts_at_gop_start: c_long; + static mut fts_at_gop_start: i64; static mut gop_rollover: c_int; static mut ccx_common_timing_settings: ccx_common_timing_settings_t; static mut capitalization_list: word_list; diff --git a/src/rust/src/libccxr_exports/time.rs b/src/rust/src/libccxr_exports/time.rs index c0063916f..82df6838c 100644 --- a/src/rust/src/libccxr_exports/time.rs +++ b/src/rust/src/libccxr_exports/time.rs @@ -1,7 +1,7 @@ #![allow(clippy::useless_conversion)] use std::convert::TryInto; -use std::ffi::{c_char, c_int, c_long, CStr}; +use std::ffi::{c_char, c_int, CStr}; use crate::{ bindings::*, cb_708, cb_field1, cb_field2, ccx_common_timing_settings as timing_settings, @@ -330,7 +330,7 @@ unsafe fn write_back_from_timing_info() { .unwrap_or(0); gop_time = write_gop_time_code(timing_info.gop_time); first_gop_time = write_gop_time_code(timing_info.first_gop_time); - fts_at_gop_start = timing_info.fts_at_gop_start.millis() as c_long; + fts_at_gop_start = timing_info.fts_at_gop_start.millis(); gop_rollover = if timing_info.gop_rollover { 1 } else { 0 }; timing_settings.disable_sync_check = if timing_info.timing_settings.disable_sync_check { 1 @@ -412,7 +412,7 @@ pub unsafe fn write_gop_time_code(g: Option) -> gop_time_code { /// /// `ctx` must not be null. #[no_mangle] -pub unsafe extern "C" fn ccxr_add_current_pts(ctx: *mut ccx_common_timing_ctx, pts: c_long) { +pub unsafe extern "C" fn ccxr_add_current_pts(ctx: *mut ccx_common_timing_ctx, pts: i64) { apply_timing_info(); let mut context = generate_timing_context(ctx); @@ -428,7 +428,7 @@ pub unsafe extern "C" fn ccxr_add_current_pts(ctx: *mut ccx_common_timing_ctx, p /// /// `ctx` must not be null. #[no_mangle] -pub unsafe extern "C" fn ccxr_set_current_pts(ctx: *mut ccx_common_timing_ctx, pts: c_long) { +pub unsafe extern "C" fn ccxr_set_current_pts(ctx: *mut ccx_common_timing_ctx, pts: i64) { apply_timing_info(); let mut context = generate_timing_context(ctx); @@ -469,7 +469,7 @@ pub unsafe extern "C" fn ccxr_set_fts(ctx: *mut ccx_common_timing_ctx) -> c_int pub unsafe extern "C" fn ccxr_get_fts( ctx: *mut ccx_common_timing_ctx, current_field: c_int, -) -> c_long { +) -> i64 { apply_timing_info(); let mut context = generate_timing_context(ctx); @@ -485,7 +485,7 @@ pub unsafe extern "C" fn ccxr_get_fts( write_back_to_common_timing_ctx(ctx, &context); write_back_from_timing_info(); - ans.millis().try_into().unwrap_or(0) + ans.millis() } /// Rust equivalent for `get_visible_end` function in C. Uses C-native types as input and output. @@ -503,7 +503,7 @@ pub unsafe extern "C" fn ccxr_get_fts( pub unsafe extern "C" fn ccxr_get_visible_end( ctx: *mut ccx_common_timing_ctx, _current_field: c_int, -) -> c_long { +) -> i64 { apply_timing_info(); let mut context = generate_timing_context(ctx); @@ -518,7 +518,7 @@ pub unsafe extern "C" fn ccxr_get_visible_end( write_back_to_common_timing_ctx(ctx, &context); write_back_from_timing_info(); - fts as c_long + fts } /// Rust equivalent for `get_visible_start` function in C. Uses C-native types as input and output. @@ -537,7 +537,7 @@ pub unsafe extern "C" fn ccxr_get_visible_end( pub unsafe extern "C" fn ccxr_get_visible_start( ctx: *mut ccx_common_timing_ctx, _current_field: c_int, -) -> c_long { +) -> i64 { apply_timing_info(); let context = generate_timing_context(ctx); @@ -554,7 +554,7 @@ pub unsafe extern "C" fn ccxr_get_visible_start( write_back_to_common_timing_ctx(ctx, &context); write_back_from_timing_info(); - fts as c_long + fts } /// Rust equivalent for `get_fts_max` function in C. Uses C-native types as input and output. @@ -563,7 +563,7 @@ pub unsafe extern "C" fn ccxr_get_visible_start( /// /// `ctx` must not be null. #[no_mangle] -pub unsafe extern "C" fn ccxr_get_fts_max(ctx: *mut ccx_common_timing_ctx) -> c_long { +pub unsafe extern "C" fn ccxr_get_fts_max(ctx: *mut ccx_common_timing_ctx) -> i64 { apply_timing_info(); let mut context = generate_timing_context(ctx); @@ -572,7 +572,7 @@ pub unsafe extern "C" fn ccxr_get_fts_max(ctx: *mut ccx_common_timing_ctx) -> c_ write_back_to_common_timing_ctx(ctx, &context); write_back_from_timing_info(); - ans.millis().try_into().unwrap_or(0) + ans.millis() } /// Rust equivalent for `print_mstime_static` function in C. Uses C-native types as input and output. @@ -581,8 +581,8 @@ pub unsafe extern "C" fn ccxr_get_fts_max(ctx: *mut ccx_common_timing_ctx) -> c_ /// /// `buf` must not be null. It must have sufficient length to hold the time in string form. #[no_mangle] -pub unsafe extern "C" fn ccxr_print_mstime_static(mstime: c_long, buf: *mut c_char) -> *mut c_char { - let time = Timestamp::from_millis(mstime.into()); +pub unsafe extern "C" fn ccxr_print_mstime_static(mstime: i64, buf: *mut c_char) -> *mut c_char { + let time = Timestamp::from_millis(mstime); let ans = c::print_mstime_static(time, ':'); write_string_into_pointer(buf, &ans); buf From 73cd19f5d0ff3286718a5de296c54c33d727443a Mon Sep 17 00:00:00 2001 From: Carlos Fernandez Date: Sun, 21 Dec 2025 09:30:19 +0100 Subject: [PATCH 3/3] fix(rust): Use i64 instead of c_long for Windows compatibility MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On Windows, c_long is i32 while on Linux it's i64. The function ccxr_print_mstime_static expects i64, so casting to c_long caused a type mismatch error on Windows builds. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/rust/src/avc/nal.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rust/src/avc/nal.rs b/src/rust/src/avc/nal.rs index 4a2d2342b..767a9da0a 100644 --- a/src/rust/src/avc/nal.rs +++ b/src/rust/src/avc/nal.rs @@ -9,7 +9,7 @@ use crate::{ccx_options, current_fps, total_frames_count, MPEG_CLOCK_FREQ}; use lib_ccxr::common::{AvcNalType, BitStreamRust, BitstreamError, FRAMERATES_VALUES, SLICE_TYPES}; use lib_ccxr::util::log::DebugMessageFlag; use lib_ccxr::{debug, info}; -use std::os::raw::{c_char, c_long}; +use std::os::raw::c_char; /// Process sequence parameter set RBSP pub fn seq_parameter_set_rbsp( @@ -630,7 +630,7 @@ pub unsafe fn slice_header( msg_type = DebugMessageFlag::TIME; " sync_pts:{} ({:8})", std::ffi::CStr::from_ptr(ccxr_print_mstime_static( - ((*dec_ctx.timing).sync_pts / ((MPEG_CLOCK_FREQ as i64) / 1000i64)) as c_long, + (*dec_ctx.timing).sync_pts / ((MPEG_CLOCK_FREQ as i64) / 1000i64), buf.as_mut_ptr() )) .to_str()