From da3dc52b45dee5388cd17971f05ee6aaa5ede5fd Mon Sep 17 00:00:00 2001
From: Carlos Fernandez <carlos@ccextractor.org>
Date: Sun, 21 Dec 2025 08:05:14 +0100
Subject: [PATCH 1/3] fix(708): Support Korean EUC-KR encoding in CEA-708
 decoder
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Korean broadcasts use EUC-KR encoding (variable-width) in CEA-708
captions, where ASCII is 1 byte and Korean characters are 2 bytes.
The decoder was always writing 2 bytes per character (UTF-16BE style),
causing NULL bytes to be inserted before every ASCII character.

Changes:
- Add is_utf16_charset() to detect fixed-width 16-bit encodings
- Modify write_char() to accept use_utf16 flag:
  - true: Always 2 bytes (UTF-16BE for Japanese, issue #1451)
  - false: 1 byte for ASCII, 2 bytes for extended (EUC-KR for Korean)
- Detect charset type in write_row() before building output buffer

This fixes Korean subtitle extraction when using --service "1[EUC-KR]"
while maintaining compatibility with Japanese UTF-16BE (issue #1451).

Closes #1065

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 src/rust/src/decoder/output.rs    | 98 ++++++++++++++++++++++++++-----
 src/rust/src/decoder/tv_screen.rs | 21 ++++++-
 2 files changed, 102 insertions(+), 17 deletions(-)
diff --git a/src/rust/src/decoder/output.rs b/src/rust/src/decoder/output.rs
index 02e0356ba..20ee03133 100644
--- a/src/rust/src/decoder/output.rs
+++ b/src/rust/src/decoder/output.rs
@@ -85,14 +85,38 @@ impl<'a> Writer<'a> {
 
 /// Write the symbol to the provided buffer
 ///
-/// Always writes 2 bytes for consistent UTF-16BE encoding.
-/// Previously, this function wrote 1 byte for ASCII characters and 2 bytes
-/// for non-ASCII, creating an invalid mix that encoding conversion couldn't
-/// handle properly. This caused garbled output with Japanese/Chinese characters
-/// (issue #1451).
-pub fn write_char(sym: &dtvcc_symbol, buf: &mut Vec<u8>) {
-    buf.push((sym.sym >> 8) as u8);
-    buf.push((sym.sym & 0xff) as u8);
+/// The `use_utf16` parameter controls the output format:
+/// - `true`: Always writes 2 bytes (UTF-16BE format). Use for UTF-16/UCS-2 charsets.
+/// - `false`: Writes 1 byte for ASCII (high byte == 0), 2 bytes for extended chars.
+///   Use for variable-width encodings like EUC-KR, CP949, Shift-JIS, etc.
+///
+/// Issue #1451: Japanese/Chinese with UTF-16BE need 2 bytes for all characters.
+/// Issue #1065: Korean with EUC-KR needs 1 byte for ASCII, 2 bytes for Korean.
+pub fn write_char(sym: &dtvcc_symbol, buf: &mut Vec<u8>, use_utf16: bool) {
+    let high = (sym.sym >> 8) as u8;
+    let low = (sym.sym & 0xff) as u8;
+
+    if use_utf16 {
+        // UTF-16BE: Always write 2 bytes
+        buf.push(high);
+        buf.push(low);
+    } else {
+        // Variable-width: Only write high byte if non-zero
+        if high != 0 {
+            buf.push(high);
+        }
+        buf.push(low);
+    }
+}
+
+/// Check if a charset name indicates UTF-16 or UCS-2 encoding
+///
+/// These are fixed-width 16-bit encodings where even ASCII needs 2 bytes.
+pub fn is_utf16_charset(charset: &str) -> bool {
+    let upper = charset.to_uppercase();
+    upper.contains("UTF-16") || upper.contains("UTF16") ||
+    upper.contains("UCS-2") || upper.contains("UCS2") ||
+    upper.contains("UTF_16") || upper.contains("UCS_2")
 }
 
 /// Convert from CEA-708 color representation to hex code
@@ -114,27 +138,71 @@ mod tests {
     use super::*;
 
     #[test]
-    fn test_write_char() {
+    fn test_write_char_utf16_mode() {
         let mut buf = Vec::new();
 
-        // Write ASCII symbol - UTF-16BE always uses 2 bytes
-        // 'A' (0x41) becomes [0x00, 0x41] in UTF-16BE
+        // UTF-16 mode: ASCII symbol 'A' (0x41) becomes [0x00, 0x41]
         let sym = dtvcc_symbol { sym: 0x41, init: 0 };
-        write_char(&sym, &mut buf);
+        write_char(&sym, &mut buf, true);
         assert_eq!(buf, vec![0x00, 0x41]);
 
         buf.clear();
 
-        // Write non-ASCII symbol (e.g., Japanese character)
-        // Already 16-bit, writes as [high_byte, low_byte]
+        // UTF-16 mode: Non-ASCII symbol writes as [high_byte, low_byte]
         let sym = dtvcc_symbol {
             sym: 0x1234,
             init: 0,
         };
-        write_char(&sym, &mut buf);
+        write_char(&sym, &mut buf, true);
         assert_eq!(buf, vec![0x12, 0x34]);
     }
 
+    #[test]
+    fn test_write_char_variable_width_mode() {
+        let mut buf = Vec::new();
+
+        // Variable-width mode: ASCII symbol 'A' (0x41) becomes [0x41] (1 byte)
+        let sym = dtvcc_symbol { sym: 0x41, init: 0 };
+        write_char(&sym, &mut buf, false);
+        assert_eq!(buf, vec![0x41]);
+
+        buf.clear();
+
+        // Variable-width mode: Korean EUC-KR char becomes [high, low] (2 bytes)
+        // Example: Korean '인' = 0xC0CE in EUC-KR
+        let sym = dtvcc_symbol {
+            sym: 0xC0CE,
+            init: 0,
+        };
+        write_char(&sym, &mut buf, false);
+        assert_eq!(buf, vec![0xC0, 0xCE]);
+
+        buf.clear();
+
+        // Variable-width mode: Space (0x20) becomes [0x20] (1 byte, no NUL)
+        let sym = dtvcc_symbol { sym: 0x20, init: 0 };
+        write_char(&sym, &mut buf, false);
+        assert_eq!(buf, vec![0x20]);
+    }
+
+    #[test]
+    fn test_is_utf16_charset() {
+        // Should return true for UTF-16 variants
+        assert!(is_utf16_charset("UTF-16BE"));
+        assert!(is_utf16_charset("UTF-16LE"));
+        assert!(is_utf16_charset("utf-16"));
+        assert!(is_utf16_charset("UTF16"));
+        assert!(is_utf16_charset("UCS-2"));
+        assert!(is_utf16_charset("UCS2"));
+
+        // Should return false for variable-width encodings
+        assert!(!is_utf16_charset("EUC-KR"));
+        assert!(!is_utf16_charset("CP949"));
+        assert!(!is_utf16_charset("Shift-JIS"));
+        assert!(!is_utf16_charset("UTF-8"));
+        assert!(!is_utf16_charset("ISO-8859-1"));
+    }
+
     #[test]
     fn test_color_to_hex() {
         assert_eq!(color_to_hex(0b00_00_00), (0, 0, 0)); // Black
diff --git a/src/rust/src/decoder/tv_screen.rs b/src/rust/src/decoder/tv_screen.rs
index aede8e608..7ee5ec532 100644
--- a/src/rust/src/decoder/tv_screen.rs
+++ b/src/rust/src/decoder/tv_screen.rs
@@ -13,7 +13,7 @@ use std::{ffi::CStr, fs::File};
 #[cfg(windows)]
 use crate::bindings::_get_osfhandle;
 
-use super::output::{color_to_hex, write_char, Writer};
+use super::output::{color_to_hex, is_utf16_charset, write_char, Writer};
 use super::timing::{get_scc_time_str, get_time_str};
 use super::{CCX_DTVCC_SCREENGRID_COLUMNS, CCX_DTVCC_SCREENGRID_ROWS};
 use crate::{
@@ -177,6 +177,23 @@ impl dtvcc_tv_screen {
         let (first, last) = self.get_write_interval(row_index);
         debug!("First: {first}, Last: {last}");
 
+        // Determine if we should use UTF-16 mode (2 bytes for all chars) or
+        // variable-width mode (1 byte for ASCII, 2 bytes for extended chars).
+        // UTF-16/UCS-2 encodings require 2 bytes even for ASCII.
+        // Variable-width encodings (EUC-KR, CP949, Shift-JIS, etc.) use 1 byte for ASCII.
+        let use_utf16 = if !writer.writer_ctx.charset.is_null() {
+            let charset = unsafe {
+                CStr::from_ptr(writer.writer_ctx.charset)
+                    .to_str()
+                    .unwrap_or("")
+            };
+            is_utf16_charset(charset)
+        } else {
+            // No charset specified - default to variable-width for backward compatibility
+            // with raw byte output (no encoding conversion)
+            false
+        };
+
         for i in 0..last + 1 {
             if use_colors {
                 self.change_pen_color(
@@ -219,7 +236,7 @@ impl dtvcc_tv_screen {
             if i < first {
                 buf.push(b' ');
             } else {
-                write_char(&self.chars[row_index][i], &mut buf)
+                write_char(&self.chars[row_index][i], &mut buf, use_utf16)
             }
         }
         // there can be unclosed tags or colors after the last symbol in a row

From d0caf23a82af9d07c2f264860dc9d94273919e39 Mon Sep 17 00:00:00 2001
From: Carlos Fernandez <carlos@ccextractor.org>
Date: Sun, 21 Dec 2025 08:35:13 +0100
Subject: [PATCH 2/3] fix(timing): Use i64 instead of c_long for Windows
 compatibility
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Rust FFI functions were using c_long for PTS/FTS timestamps, but:
- C code uses LLONG (int64_t, 64 bits on all platforms)
- Rust c_long is 32 bits on Windows, 64 bits on Linux

This caused timestamp truncation on Windows when PTS values exceeded
2^31 (~24 days at 90kHz), resulting in wrong subtitle timestamps.

For example, a file with Min PTS of 23:50:45 (7,726,090,500 ticks)
would have its PTS truncated, breaking the teletext delta calculation
that normalizes timestamps to start at 0.

Changes:
- ccxr_add_current_pts: pts parameter i64
- ccxr_set_current_pts: pts parameter i64
- ccxr_get_fts: return type i64
- ccxr_get_visible_end: return type i64
- ccxr_get_visible_start: return type i64
- ccxr_get_fts_max: return type i64
- ccxr_print_mstime_static: mstime parameter i64
- fts_at_gop_start: extern static i64

Fixes tests 18 and 19 on Windows CI which showed raw PTS timestamps
(23:50:46) instead of normalized timestamps (00:00:00).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 src/rust/src/decoder/output.rs       |  9 ++++++---
 src/rust/src/es/pic.rs               |  6 +-----
 src/rust/src/lib.rs                  |  6 +++---
 src/rust/src/libccxr_exports/time.rs | 28 ++++++++++++++--------------
 4 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/src/rust/src/decoder/output.rs b/src/rust/src/decoder/output.rs
index 20ee03133..7cc2b6a71 100644
--- a/src/rust/src/decoder/output.rs
+++ b/src/rust/src/decoder/output.rs
@@ -114,9 +114,12 @@ pub fn write_char(sym: &dtvcc_symbol, buf: &mut Vec<u8>, use_utf16: bool) {
 /// These are fixed-width 16-bit encodings where even ASCII needs 2 bytes.
 pub fn is_utf16_charset(charset: &str) -> bool {
     let upper = charset.to_uppercase();
-    upper.contains("UTF-16") || upper.contains("UTF16") ||
-    upper.contains("UCS-2") || upper.contains("UCS2") ||
-    upper.contains("UTF_16") || upper.contains("UCS_2")
+    upper.contains("UTF-16")
+        || upper.contains("UTF16")
+        || upper.contains("UCS-2")
+        || upper.contains("UCS2")
+        || upper.contains("UTF_16")
+        || upper.contains("UCS_2")
 }
 
 /// Convert from CEA-708 color representation to hex code
diff --git a/src/rust/src/es/pic.rs b/src/rust/src/es/pic.rs
index d9bfea66a..89c3344b9 100644
--- a/src/rust/src/es/pic.rs
+++ b/src/rust/src/es/pic.rs
@@ -277,12 +277,8 @@ pub unsafe fn read_pic_info(
     // set_fts() is not called for each picture when use_gop_as_pts == 1.
     if ccx_options.use_gop_as_pts == 1 {
         // Calculate current FTS based on GOP start time + frame offset
-        // Cast fts_at_gop_start to i64 for cross-platform compatibility (c_long is i32 on Windows)
         let frame_offset_ms = (dec_ctx.frames_since_last_gop as f64 * 1000.0 / current_fps) as i64;
-        #[allow(clippy::unnecessary_cast)]
-        {
-            (*dec_ctx.timing).fts_now = (fts_at_gop_start as i64) + frame_offset_ms;
-        }
+        (*dec_ctx.timing).fts_now = fts_at_gop_start + frame_offset_ms;
 
         // Update fts_max if needed
         if (*dec_ctx.timing).fts_now > (*dec_ctx.timing).fts_max {
diff --git a/src/rust/src/lib.rs b/src/rust/src/lib.rs
index 4e453f4e3..9f267c7b3 100644
--- a/src/rust/src/lib.rs
+++ b/src/rust/src/lib.rs
@@ -51,7 +51,7 @@ use std::os::raw::{c_uchar, c_void};
 use std::{
     ffi::CStr,
     io::Write,
-    os::raw::{c_char, c_double, c_int, c_long, c_uint},
+    os::raw::{c_char, c_double, c_int, c_uint},
 };
 
 // Mock data for rust unit tests
@@ -67,7 +67,7 @@ cfg_if! {
 
         static mut frames_since_ref_time: c_int = 0;
         static mut total_frames_count: c_uint = 0;
-        static mut fts_at_gop_start: c_long = 0;
+        static mut fts_at_gop_start: i64 = 0;
         static mut gop_rollover: c_int = 0;
         static mut pts_big_change: c_uint = 0;
 
@@ -144,7 +144,7 @@ extern "C" {
     static mut total_frames_count: c_uint;
     static mut gop_time: gop_time_code;
     static mut first_gop_time: gop_time_code;
-    static mut fts_at_gop_start: c_long;
+    static mut fts_at_gop_start: i64;
     static mut gop_rollover: c_int;
     static mut ccx_common_timing_settings: ccx_common_timing_settings_t;
     static mut capitalization_list: word_list;
diff --git a/src/rust/src/libccxr_exports/time.rs b/src/rust/src/libccxr_exports/time.rs
index c0063916f..82df6838c 100644
--- a/src/rust/src/libccxr_exports/time.rs
+++ b/src/rust/src/libccxr_exports/time.rs
@@ -1,7 +1,7 @@
 #![allow(clippy::useless_conversion)]
 
 use std::convert::TryInto;
-use std::ffi::{c_char, c_int, c_long, CStr};
+use std::ffi::{c_char, c_int, CStr};
 
 use crate::{
     bindings::*, cb_708, cb_field1, cb_field2, ccx_common_timing_settings as timing_settings,
@@ -330,7 +330,7 @@ unsafe fn write_back_from_timing_info() {
         .unwrap_or(0);
     gop_time = write_gop_time_code(timing_info.gop_time);
     first_gop_time = write_gop_time_code(timing_info.first_gop_time);
-    fts_at_gop_start = timing_info.fts_at_gop_start.millis() as c_long;
+    fts_at_gop_start = timing_info.fts_at_gop_start.millis();
     gop_rollover = if timing_info.gop_rollover { 1 } else { 0 };
     timing_settings.disable_sync_check = if timing_info.timing_settings.disable_sync_check {
         1
@@ -412,7 +412,7 @@ pub unsafe fn write_gop_time_code(g: Option<GopTimeCode>) -> gop_time_code {
 ///
 /// `ctx` must not be null.
 #[no_mangle]
-pub unsafe extern "C" fn ccxr_add_current_pts(ctx: *mut ccx_common_timing_ctx, pts: c_long) {
+pub unsafe extern "C" fn ccxr_add_current_pts(ctx: *mut ccx_common_timing_ctx, pts: i64) {
     apply_timing_info();
     let mut context = generate_timing_context(ctx);
 
@@ -428,7 +428,7 @@ pub unsafe extern "C" fn ccxr_add_current_pts(ctx: *mut ccx_common_timing_ctx, p
 ///
 /// `ctx` must not be null.
 #[no_mangle]
-pub unsafe extern "C" fn ccxr_set_current_pts(ctx: *mut ccx_common_timing_ctx, pts: c_long) {
+pub unsafe extern "C" fn ccxr_set_current_pts(ctx: *mut ccx_common_timing_ctx, pts: i64) {
     apply_timing_info();
     let mut context = generate_timing_context(ctx);
 
@@ -469,7 +469,7 @@ pub unsafe extern "C" fn ccxr_set_fts(ctx: *mut ccx_common_timing_ctx) -> c_int
 pub unsafe extern "C" fn ccxr_get_fts(
     ctx: *mut ccx_common_timing_ctx,
     current_field: c_int,
-) -> c_long {
+) -> i64 {
     apply_timing_info();
     let mut context = generate_timing_context(ctx);
 
@@ -485,7 +485,7 @@ pub unsafe extern "C" fn ccxr_get_fts(
     write_back_to_common_timing_ctx(ctx, &context);
     write_back_from_timing_info();
 
-    ans.millis().try_into().unwrap_or(0)
+    ans.millis()
 }
 
 /// Rust equivalent for `get_visible_end` function in C. Uses C-native types as input and output.
@@ -503,7 +503,7 @@ pub unsafe extern "C" fn ccxr_get_fts(
 pub unsafe extern "C" fn ccxr_get_visible_end(
     ctx: *mut ccx_common_timing_ctx,
     _current_field: c_int,
-) -> c_long {
+) -> i64 {
     apply_timing_info();
     let mut context = generate_timing_context(ctx);
 
@@ -518,7 +518,7 @@ pub unsafe extern "C" fn ccxr_get_visible_end(
     write_back_to_common_timing_ctx(ctx, &context);
     write_back_from_timing_info();
 
-    fts as c_long
+    fts
 }
 
 /// Rust equivalent for `get_visible_start` function in C. Uses C-native types as input and output.
@@ -537,7 +537,7 @@ pub unsafe extern "C" fn ccxr_get_visible_end(
 pub unsafe extern "C" fn ccxr_get_visible_start(
     ctx: *mut ccx_common_timing_ctx,
     _current_field: c_int,
-) -> c_long {
+) -> i64 {
     apply_timing_info();
     let context = generate_timing_context(ctx);
 
@@ -554,7 +554,7 @@ pub unsafe extern "C" fn ccxr_get_visible_start(
     write_back_to_common_timing_ctx(ctx, &context);
     write_back_from_timing_info();
 
-    fts as c_long
+    fts
 }
 
 /// Rust equivalent for `get_fts_max` function in C. Uses C-native types as input and output.
@@ -563,7 +563,7 @@ pub unsafe extern "C" fn ccxr_get_visible_start(
 ///
 /// `ctx` must not be null.
 #[no_mangle]
-pub unsafe extern "C" fn ccxr_get_fts_max(ctx: *mut ccx_common_timing_ctx) -> c_long {
+pub unsafe extern "C" fn ccxr_get_fts_max(ctx: *mut ccx_common_timing_ctx) -> i64 {
     apply_timing_info();
     let mut context = generate_timing_context(ctx);
 
@@ -572,7 +572,7 @@ pub unsafe extern "C" fn ccxr_get_fts_max(ctx: *mut ccx_common_timing_ctx) -> c_
     write_back_to_common_timing_ctx(ctx, &context);
     write_back_from_timing_info();
 
-    ans.millis().try_into().unwrap_or(0)
+    ans.millis()
 }
 
 /// Rust equivalent for `print_mstime_static` function in C. Uses C-native types as input and output.
@@ -581,8 +581,8 @@ pub unsafe extern "C" fn ccxr_get_fts_max(ctx: *mut ccx_common_timing_ctx) -> c_
 ///
 /// `buf` must not be null. It must have sufficient length to hold the time in string form.
 #[no_mangle]
-pub unsafe extern "C" fn ccxr_print_mstime_static(mstime: c_long, buf: *mut c_char) -> *mut c_char {
-    let time = Timestamp::from_millis(mstime.into());
+pub unsafe extern "C" fn ccxr_print_mstime_static(mstime: i64, buf: *mut c_char) -> *mut c_char {
+    let time = Timestamp::from_millis(mstime);
     let ans = c::print_mstime_static(time, ':');
     write_string_into_pointer(buf, &ans);
     buf

From 73cd19f5d0ff3286718a5de296c54c33d727443a Mon Sep 17 00:00:00 2001
From: Carlos Fernandez <carlos@ccextractor.org>
Date: Sun, 21 Dec 2025 09:30:19 +0100
Subject: [PATCH 3/3] fix(rust): Use i64 instead of c_long for Windows
 compatibility
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On Windows, c_long is i32 while on Linux it's i64. The function
ccxr_print_mstime_static expects i64, so casting to c_long caused
a type mismatch error on Windows builds.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 src/rust/src/avc/nal.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/rust/src/avc/nal.rs b/src/rust/src/avc/nal.rs
index 4a2d2342b..767a9da0a 100644
--- a/src/rust/src/avc/nal.rs
+++ b/src/rust/src/avc/nal.rs
@@ -9,7 +9,7 @@ use crate::{ccx_options, current_fps, total_frames_count, MPEG_CLOCK_FREQ};
 use lib_ccxr::common::{AvcNalType, BitStreamRust, BitstreamError, FRAMERATES_VALUES, SLICE_TYPES};
 use lib_ccxr::util::log::DebugMessageFlag;
 use lib_ccxr::{debug, info};
-use std::os::raw::{c_char, c_long};
+use std::os::raw::c_char;
 
 /// Process sequence parameter set RBSP
 pub fn seq_parameter_set_rbsp(
@@ -630,7 +630,7 @@ pub unsafe fn slice_header(
         msg_type = DebugMessageFlag::TIME;
         "  sync_pts:{} ({:8})",
         std::ffi::CStr::from_ptr(ccxr_print_mstime_static(
-            ((*dec_ctx.timing).sync_pts / ((MPEG_CLOCK_FREQ as i64) / 1000i64)) as c_long,
+            (*dec_ctx.timing).sync_pts / ((MPEG_CLOCK_FREQ as i64) / 1000i64),
             buf.as_mut_ptr()
         ))
         .to_str()