Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/lib_ccx/ccx_common_option.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ void init_options(struct ccx_s_options *options)
options->ocr_oem = -1; // By default, OEM mode depends on the tesseract version
options->psm = 3; // Default PSM mode (3 is the default tesseract as well)
options->ocr_quantmode = 0; // No quantization (better OCR accuracy for DVB subtitles)
options->ocr_line_split = 0; // By default, don't split images into lines (pending testing)
options->ocr_blacklist = 1; // By default, use character blacklist to prevent common OCR errors (| vs I, etc.)
options->mkvlang = NULL; // By default, all the languages are extracted
options->ignore_pts_jumps = 1;
options->analyze_video_stream = 0;
Expand Down
2 changes: 2 additions & 0 deletions src/lib_ccx/ccx_common_option.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,8 @@ struct ccx_s_options // Options from user parameters
int ocr_oem; // The Tesseract OEM mode, could be 0 (default), 1 or 2
int psm; // The Tesseract PSM mode, could be between 0 and 13. 3 is tesseract default
int ocr_quantmode; // How to quantize the bitmap before passing to to tesseract (0=no quantization at all, 1=CCExtractor's internal)
int ocr_line_split; // If 1, split images into lines before OCR (uses PSM 7 for better accuracy)
int ocr_blacklist; // If 1, use character blacklist to prevent common OCR errors (default: enabled)
char *mkvlang; // The name of the language stream for MKV
int analyze_video_stream; // If 1, the video stream will be processed even if we're using a different one for subtitles.

Expand Down
279 changes: 279 additions & 0 deletions src/lib_ccx/ocr.c
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,13 @@ void *init_ocr(int lang_index)
// set PSM mode
TessBaseAPISetPageSegMode(ctx->api, ccx_options.psm);

// Set character blacklist to prevent common OCR errors (e.g. | vs I)
// These characters are rarely used in subtitles but often misrecognized
if (ccx_options.ocr_blacklist)
{
TessBaseAPISetVariable(ctx->api, "tessedit_char_blacklist", "|\\`_~");
}

free(pars_vec);
free(pars_values);

Expand Down Expand Up @@ -351,6 +358,176 @@ BOX *ignore_alpha_at_edge(png_byte *alpha, unsigned char *indata, int w, int h,
return cropWindow;
}

/**
 * Vertical extent of one detected text line inside a bitmap, expressed as a
 * pair of row indices. Both ends are inclusive, so a line that occupies a
 * single row has start_y == end_y.
 */
struct line_bounds
{
	int start_y; // Index of the first (topmost) row belonging to the line
	int end_y;   // Index of the last (bottommost) row belonging to the line
};

/**
 * Locates the vertical extent of each text line in a palette-indexed bitmap.
 *
 * A row counts as "content" when at least one of its pixels maps to a palette
 * entry with non-zero alpha. Consecutive content rows form a line; a line is
 * closed once `min_gap` blank rows in a row follow it. Blank runs shorter than
 * `min_gap` do not split a line, and blank rows trailing the final line are
 * trimmed from its reported extent.
 *
 * @param alpha     Palette alpha table, indexed by pixel value
 * @param indata    Pixel data as palette indices, w*h bytes in row-major order
 * @param w         Image width in pixels
 * @param h         Image height in pixels
 * @param lines     Output: heap-allocated array of line boundaries that the
 *                  caller releases with free(). Reset to NULL once the
 *                  arguments have been validated and left NULL on any later
 *                  failure.
 * @param num_lines Output: number of entries stored in *lines (reset to 0
 *                  together with *lines)
 * @param min_gap   Number of consecutive blank rows that terminates a line
 * @return 0 when at least one line was found; -1 on invalid arguments,
 *         allocation failure, or when no row has visible content
 */
static int detect_text_lines(png_byte *alpha, unsigned char *indata,
			     int w, int h,
			     struct line_bounds **lines, int *num_lines,
			     int min_gap)
{
	if (alpha == NULL || indata == NULL || lines == NULL || num_lines == NULL)
		return -1;
	if (w <= 0 || h <= 0)
		return -1;

	*lines = NULL;
	*num_lines = 0;

	// Pass 1: flag every row that holds at least one non-transparent pixel.
	int *visible = (int *)malloc(h * sizeof(int));
	if (visible == NULL)
		return -1;

	for (int y = 0; y < h; y++)
	{
		const unsigned char *row = indata + y * w;
		int x = 0;

		// Stop at the first visible pixel; the rest of the row is irrelevant.
		while (x < w && alpha[row[x]] == 0)
			x++;
		visible[y] = (x < w);
	}

	// Pass 2: group flagged rows into lines.
	int capacity = (h / 2) + 1; // Conservative upper bound on the line count
	struct line_bounds *found = (struct line_bounds *)malloc(capacity * sizeof(struct line_bounds));
	if (found == NULL)
	{
		free(visible);
		return -1;
	}

	int count = 0;
	int top = -1;	   // First row of the line being tracked; -1 while between lines
	int blank_run = 0; // Blank rows seen since the last content row of that line

	for (int y = 0; y < h; y++)
	{
		if (visible[y])
		{
			if (top < 0)
				top = y;
			blank_run = 0;
			continue;
		}

		// Blank row: only meaningful while a line is open.
		if (top < 0)
			continue;
		if (++blank_run < min_gap)
			continue;

		// The gap is wide enough: the line ended just before this blank run.
		if (count < capacity)
		{
			found[count].start_y = top;
			found[count].end_y = y - blank_run;
			count++;
		}
		top = -1;
		blank_run = 0;
	}

	// A line still open at the bottom edge ends at its last content row.
	if (top >= 0 && count < capacity)
	{
		int bottom = h - 1;
		while (bottom > top && !visible[bottom])
			bottom--;
		found[count].start_y = top;
		found[count].end_y = bottom;
		count++;
	}

	free(visible);

	if (count == 0)
	{
		free(found);
		return -1;
	}

	// Trim the array to the used size; keep the larger buffer if that fails.
	struct line_bounds *shrunk = (struct line_bounds *)realloc(found, count * sizeof(struct line_bounds));
	*lines = (shrunk != NULL) ? shrunk : found;
	*num_lines = count;

	return 0;
}

/**
 * Runs Tesseract on an image that holds exactly one line of text.
 *
 * The page segmentation mode is switched to 7 (single text line) for the
 * duration of the call and then set back to whatever mode the API reported
 * on entry, whether or not recognition succeeded.
 *
 * @param ctx      OCR context; its `api` member is the Tesseract handle used
 * @param line_pix Image of the single line to recognize
 * @return A strdup()'d copy of the recognized UTF-8 text that the caller must
 *         release with free(), or NULL if an argument is missing, recognition
 *         fails, or no text could be obtained
 */
static char *ocr_single_line(struct ocrCtx *ctx, PIX *line_pix)
{
	char *result = NULL;

	if (ctx == NULL || ctx->api == NULL || line_pix == NULL)
		return NULL;

	// Remember the caller's PSM so it can be reinstated afterwards.
	int previous_psm = TessBaseAPIGetPageSegMode(ctx->api);
	TessBaseAPISetPageSegMode(ctx->api, 7); // PSM_SINGLE_LINE

	TessBaseAPISetImage2(ctx->api, line_pix);

	// A zero return from the recognizer is treated as success.
	if (TessBaseAPIRecognize(ctx->api, NULL) == 0)
	{
		char *tess_text = TessBaseAPIGetUTF8Text(ctx->api);
		if (tess_text != NULL)
		{
			// Hand back a malloc-family copy so callers can use plain free().
			result = strdup(tess_text);
			TessDeleteText(tess_text);
		}
	}

	TessBaseAPISetPageSegMode(ctx->api, previous_psm);

	return result;
}

void debug_tesseract(struct ocrCtx *ctx, char *dump_path)
{
#ifdef OCR_DEBUG
Expand Down Expand Up @@ -397,6 +574,8 @@ char *ocr_bitmap(void *arg, png_color *palette, png_byte *alpha, unsigned char *
unsigned int *data, *ppixel;
BOOL tess_ret = FALSE;
struct ocrCtx *ctx = arg;
char *combined_text = NULL; // Used by line-split mode
size_t combined_len = 0; // Used by line-split mode
pix = pixCreate(w, h, 32);
color_pix = pixCreate(w, h, 32);
if (pix == NULL || color_pix == NULL)
Expand Down Expand Up @@ -476,6 +655,98 @@ char *ocr_bitmap(void *arg, png_color *palette, png_byte *alpha, unsigned char *
return NULL;
}

// Line splitting mode: detect lines and OCR each separately with PSM 7
if (ccx_options.ocr_line_split && h > 30)
{
struct line_bounds *lines = NULL;
int num_lines = 0;

// Use min_gap of 3 rows to detect line boundaries
if (detect_text_lines(alpha, indata, w, h, &lines, &num_lines, 3) == 0 && num_lines > 1)
{
// Multiple lines detected - process each separately with PSM 7
// (combined_text and combined_len are declared at function scope)

for (int line_idx = 0; line_idx < num_lines; line_idx++)
{
int line_h = lines[line_idx].end_y - lines[line_idx].start_y + 1;
if (line_h <= 0)
continue;

// Extract line region from the grayscale image
BOX *line_box = boxCreate(0, lines[line_idx].start_y,
pixGetWidth(cpix_gs), line_h);
PIX *line_pix_raw = pixClipRectangle(cpix_gs, line_box, NULL);
boxDestroy(&line_box);

if (line_pix_raw)
{
// Add white padding around the line (helps Tesseract with edge characters)
// The image is inverted (dark text on light bg), so add white (255) border
int padding = 10;
PIX *line_pix = pixAddBorderGeneral(line_pix_raw, padding, padding, padding, padding, 255);
pixDestroy(&line_pix_raw);
if (!line_pix)
continue;
char *line_text = ocr_single_line(ctx, line_pix);
pixDestroy(&line_pix);

if (line_text)
{
// Trim trailing whitespace from line
size_t line_len = strlen(line_text);
while (line_len > 0 && (line_text[line_len - 1] == '\n' ||
line_text[line_len - 1] == '\r' ||
line_text[line_len - 1] == ' '))
{
line_text[--line_len] = '\0';
}

if (line_len > 0)
{
// Append to combined result
size_t new_len = combined_len + line_len + 2; // +1 for newline, +1 for null
char *new_combined = (char *)realloc(combined_text, new_len);
if (new_combined)
{
combined_text = new_combined;
if (combined_len > 0)
{
combined_text[combined_len++] = '\n';
}
strcpy(combined_text + combined_len, line_text);
combined_len += line_len;
}
}
free(line_text);
}
}
}

free(lines);

if (combined_text && combined_len > 0)
{
// Successfully processed lines - skip whole-image OCR
// but continue to color detection below
goto line_split_color_detection;
}

// If we got here, line splitting didn't produce results
// Fall through to whole-image OCR
if (combined_text)
free(combined_text);
combined_text = NULL;
}
else
{
// Line detection failed or only 1 line - fall through to whole-image OCR
if (lines)
free(lines);
}
}

// Standard whole-image OCR path
TessBaseAPISetImage2(ctx->api, cpix_gs);
tess_ret = TessBaseAPIRecognize(ctx->api, NULL);
debug_tesseract(ctx, "./temp/");
Expand Down Expand Up @@ -518,6 +789,14 @@ char *ocr_bitmap(void *arg, png_color *palette, png_byte *alpha, unsigned char *
fatal(EXIT_NOT_ENOUGH_MEMORY, "In ocr_bitmap: Out of memory allocating text_out.");
}

// Jump target for line-split mode: use combined_text and continue with color detection
if (0)
{
line_split_color_detection:
text_out = combined_text;
combined_text = NULL; // Transfer ownership
}

// Begin color detection
// Using tlt_config.nofontcolor or ccx_options.nofontcolor (true when "--no-fontcolor" parameter used) to skip color detection if not required
// This is also skipped if --no-spupngocr is set since the OCR output won't be used anyway
Expand Down
7 changes: 7 additions & 0 deletions src/lib_ccx/params.c
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,13 @@ void print_usage(void)
mprint(" 12 Sparse text with OSD.\n");
mprint(" 13 Raw line. Treat the image as a single text line,\n");
mprint(" bypassing hacks that are Tesseract-specific.\n");
mprint(" --ocr-line-split: Split subtitle images into lines before OCR.\n");
mprint(" Uses PSM 7 (single text line mode) for each line,\n");
mprint(" which can improve accuracy for multi-line bitmap subtitles\n");
mprint(" (VOBSUB, DVD, DVB).\n");
mprint(" --no-ocr-blacklist: Disable the OCR character blacklist. By default,\n");
mprint(" CCExtractor blacklists characters like |, \\, `, _, ~\n");
mprint(" that are commonly misrecognized (e.g. 'I' as '|').\n");
mprint(" --mkvlang: For MKV subtitles, select which language's caption\n");
mprint(" stream will be processed. e.g. 'eng' for English.\n");
mprint(" Language codes can be either the 3 letters bibliographic\n");
Expand Down
6 changes: 6 additions & 0 deletions src/rust/lib_ccxr/src/common/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -462,6 +462,10 @@ pub struct Options {
/// (0 = no quantization at all, 1 = CCExtractor's internal,
/// 2 = reduce distinct color count in image for faster results.)
pub ocr_quantmode: u8,
/// If true, split images into lines before OCR (uses PSM 7 for better accuracy)
pub ocr_line_split: bool,
/// If true, use character blacklist to prevent common OCR errors (e.g. | vs I)
pub ocr_blacklist: bool,
/// The name of the language stream for MKV
pub mkvlang: Option<Language>,
/// If true, the video stream will be processed even if we're using a different one for subtitles.
Expand Down Expand Up @@ -584,6 +588,8 @@ impl Default for Options {
ocr_oem: -1,
psm: 3,
ocr_quantmode: 0, // No quantization - better OCR accuracy for DVB subtitles
ocr_line_split: false, // Don't split images into lines by default
ocr_blacklist: true, // Use character blacklist by default to prevent | vs I errors
mkvlang: Default::default(),
analyze_video_stream: Default::default(),
hardsubx_ocr_mode: Default::default(),
Expand Down
12 changes: 12 additions & 0 deletions src/rust/src/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -630,6 +630,18 @@ pub struct Args {
/// bypassing hacks that are Tesseract-specific.
#[arg(long, verbatim_doc_comment, value_name="mode", help_heading=OUTPUT_AFFECTING_OUTPUT_FILES)]
pub psm: Option<u8>,
/// Split subtitle images into lines before OCR.
/// Uses PSM 7 (single text line mode) for each line,
/// which can improve accuracy for multi-line bitmap subtitles
/// (VOBSUB, DVD, DVB).
#[arg(long, verbatim_doc_comment, help_heading=OUTPUT_AFFECTING_OUTPUT_FILES)]
pub ocr_line_split: bool,
/// Disable the OCR character blacklist.
/// By default, CCExtractor blacklists characters like |, \, `, _
/// that are commonly misrecognized (e.g. 'I' as '|').
/// Use this flag to disable the blacklist.
#[arg(long, verbatim_doc_comment, help_heading=OUTPUT_AFFECTING_OUTPUT_FILES)]
pub no_ocr_blacklist: bool,
/// For MKV subtitles, select which language's caption
/// stream will be processed. e.g. 'eng' for English.
/// Language codes can be either the 3 letters bibliographic
Expand Down
Loading
Loading