diff --git a/README.md b/README.md index 9a18f102f..6662f7f78 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,22 @@ The core functionality is written in C. Other languages used include C++ and Pyt Downloads for precompiled binaries and source code can be found [on our website](https://ccextractor.org/public/general/downloads/). +### WebVTT Output Options + +CCExtractor supports optional WebVTT-specific headers for advanced use cases +such as HTTP Live Streaming (HLS). + +#### `--timestamp-map` + +Enable writing the `X-TIMESTAMP-MAP` header in WebVTT output. + +This header is required for HLS workflows but is **disabled by default** +to preserve compatibility with standard WebVTT players. + +Example: +```bash +ccextractor input.ts --timestamp-map -o output.vtt + ### Windows Package Managers diff --git a/src/lib_ccx/ccx_common_option.c b/src/lib_ccx/ccx_common_option.c index ecfe17f7f..d1e6361f2 100644 --- a/src/lib_ccx/ccx_common_option.c +++ b/src/lib_ccx/ccx_common_option.c @@ -3,6 +3,8 @@ #include "ccx_decoders_708.h" #include "utility.h" + + extern ccx_encoders_transcript_format ccx_encoders_default_transcript_settings; /* Parameters */ void init_options(struct ccx_s_options *options) @@ -41,6 +43,7 @@ void init_options(struct ccx_s_options *options) options->live_stream = 0; // 0 -> A regular file options->messages_target = 1; // 1=stdout options->print_file_reports = 0; + options->report_format = REPORT_FORMAT_TEXT; options->timestamp_map = 0; // Disable X-TIMESTAMP-MAP header by default /* Levenshtein's parameters, for string comparison */ diff --git a/src/lib_ccx/ccx_common_option.h b/src/lib_ccx/ccx_common_option.h index aa7e14207..c1a54f847 100644 --- a/src/lib_ccx/ccx_common_option.h +++ b/src/lib_ccx/ccx_common_option.h @@ -90,8 +90,16 @@ struct encoder_cfg int extract_only_708; // 1 if only 708 subs extraction is enabled }; +typedef enum +{ + REPORT_FORMAT_TEXT = 0, + REPORT_FORMAT_JSON = 1 +} report_format_t; + + struct ccx_s_options // Options from user parameters { + report_format_t report_format; int extract; // Extract 1st, 2nd or both fields int no_rollup; // Disable roll-up emulation (no duplicate output in generated file) int noscte20; @@ -201,7 +209,8 @@ struct ccx_s_options // Options from user parameters int multiprogram; int out_interval; int segment_on_key_frames_only; - int scc_framerate; // SCC input framerate: 0=29.97 (default), 1=24, 2=25, 3=30 + int scc_framerate; + // SCC input framerate: 0=29.97 (default), 1=24, 2=25, 3=30 #ifdef WITH_LIBCURL char *curlposturl; #endif diff --git a/src/lib_ccx/ccx_decoders_608.c b/src/lib_ccx/ccx_decoders_608.c index 63d73ec01..b93c3a2d4 100644 --- a/src/lib_ccx/ccx_decoders_608.c +++ b/src/lib_ccx/ccx_decoders_608.c @@ -1292,7 +1292,8 @@ int process608(const unsigned char *data, int length, void *private_data, struct } handle_single(hi, context); - handle_single(lo, context); + if (lo >= 0x20) // ADD THIS CHECK - only write lo if it's a printable character + handle_single(lo, context); wrote_to_screen = 1; context->last_c1 = 0; context->last_c2 = 0; diff --git a/src/lib_ccx/lib_ccx.h b/src/lib_ccx/lib_ccx.h index 4a07a4f0d..f89fd6cc8 100644 --- a/src/lib_ccx/lib_ccx.h +++ b/src/lib_ccx/lib_ccx.h @@ -230,6 +230,7 @@ void process_hdcc(struct encoder_ctx *enc_ctx, struct lib_cc_decode *ctx, struct // params_dump.c void params_dump(struct lib_ccx_ctx *ctx); void print_file_report(struct lib_ccx_ctx *ctx); +void print_file_report_json(struct lib_ccx_ctx *ctx); // output.c void dinit_write(struct ccx_s_write *wb); diff --git a/src/lib_ccx/params_dump.c b/src/lib_ccx/params_dump.c index f679eea73..b81c03534 100644 --- a/src/lib_ccx/params_dump.c +++ b/src/lib_ccx/params_dump.c @@ -258,6 +258,10 @@ void print_cc_report(struct lib_ccx_ctx *ctx, struct cap_info *info) void print_file_report(struct lib_ccx_ctx *ctx) { + if (ccx_options.report_format == REPORT_FORMAT_JSON) { + print_file_report_json(ctx); + return; + } struct lib_cc_decode *dec_ctx = NULL; struct ccx_demuxer *demux_ctx = ctx->demux_ctx; @@ -426,3 +430,496 @@ void print_file_report(struct lib_ccx_ctx *ctx) memset(&ctx->freport, 0, sizeof(struct file_report)); #undef Y_N } + +// Helper function to escape JSON strings +static void json_escape_string(const char *str, char *out, size_t out_size) +{ + size_t pos = 0; + if (!str) + { + out[0] = '\0'; + return; + } + for (const char *p = str; *p && pos < out_size - 1; p++) + { + switch (*p) + { + case '"': + if (pos + 2 < out_size) + { + out[pos++] = '\\'; + out[pos++] = '"'; + } + break; + case '\\': + if (pos + 2 < out_size) + { + out[pos++] = '\\'; + out[pos++] = '\\'; + } + break; + case '\b': + if (pos + 2 < out_size) + { + out[pos++] = '\\'; + out[pos++] = 'b'; + } + break; + case '\f': + if (pos + 2 < out_size) + { + out[pos++] = '\\'; + out[pos++] = 'f'; + } + break; + case '\n': + if (pos + 2 < out_size) + { + out[pos++] = '\\'; + out[pos++] = 'n'; + } + break; + case '\r': + if (pos + 2 < out_size) + { + out[pos++] = '\\'; + out[pos++] = 'r'; + } + break; + case '\t': + if (pos + 2 < out_size) + { + out[pos++] = '\\'; + out[pos++] = 't'; + } + break; + default: + if ((unsigned char)*p < 0x20) + { + // Control character - escape as \uXXXX + if (pos + 6 < out_size) + { + snprintf(out + pos, out_size - pos, "\\u%04x", (unsigned char)*p); + pos += 6; + } + } + else + { + out[pos++] = *p; + } + break; + } + } + out[pos] = '\0'; +} + +// Helper function to get stream mode as string +static const char *get_stream_mode_string(enum ccx_stream_mode_enum mode) +{ + switch (mode) + { + case CCX_SM_TRANSPORT: + return "Transport Stream"; + case CCX_SM_PROGRAM: + return "Program Stream"; + case CCX_SM_ASF: + return "ASF"; + case CCX_SM_WTV: + return "WTV"; + case CCX_SM_ELEMENTARY_OR_NOT_FOUND: + return "Not Found"; + case CCX_SM_MP4: + return "MP4"; + case CCX_SM_MCPOODLESRAW: + return "McPoodle's raw"; + case CCX_SM_RCWT: + return "BIN"; +#ifdef WTV_DEBUG + case CCX_SM_HEX_DUMP: + return "Hex"; +#endif + default: + return "Unknown"; + } +} + +void print_file_report_json(struct lib_ccx_ctx *ctx) +{ + struct lib_cc_decode *dec_ctx = NULL; + struct ccx_demuxer *demux_ctx = ctx->demux_ctx; + char escaped_str[4096]; + int first_item = 1; + + // Early return if file not opened + if (ccx_options.input_source == CCX_DS_FILE) + { + if (ctx->current_file < 0 || ctx->current_file >= ctx->num_input_files) + { + printf("{\"error\":\"file is not opened yet\"}\n"); + return; + } + } + + printf("{\n"); + + // File/Source + printf(" \"file\": \""); + switch (ccx_options.input_source) + { + case CCX_DS_FILE: + json_escape_string(ctx->inputfile[ctx->current_file], escaped_str, sizeof(escaped_str)); + printf("%s", escaped_str); + break; + case CCX_DS_STDIN: + printf("stdin"); + break; + case CCX_DS_TCP: + case CCX_DS_NETWORK: + printf("network"); + break; + } + printf("\",\n"); + + // Stream Mode + printf(" \"stream_mode\": \"%s\"", get_stream_mode_string(demux_ctx->stream_mode)); + first_item = 0; + + // Transport Stream specific fields + if (demux_ctx->stream_mode == CCX_SM_TRANSPORT) + { + printf(",\n \"program_count\": %u", demux_ctx->freport.program_cnt); + + // Program Numbers + printf(",\n \"program_numbers\": ["); + for (int i = 0; i < demux_ctx->nb_program; i++) + { + if (i > 0) + printf(", "); + printf("%u", demux_ctx->pinfo[i].program_number); + } + printf("]"); + + // DVB Subtitle PIDs + printf(",\n \"dvb_subtitle_pids\": ["); + int first_dvb = 1; + for (int j = 0; j < SUB_STREAMS_CNT; j++) + { + if (demux_ctx->freport.dvb_sub_pid[j] != 0) + { + if (!first_dvb) + printf(", "); + printf("%u", demux_ctx->freport.dvb_sub_pid[j]); + first_dvb = 0; + } + } + printf("]"); + + // Teletext Subtitle PIDs + printf(",\n \"teletext_subtitle_pids\": ["); + int first_tlt = 1; + for (int j = 0; j < SUB_STREAMS_CNT; j++) + { + if (demux_ctx->freport.tlt_sub_pid[j] != 0) + { + if (!first_tlt) + printf(", "); + printf("%u", demux_ctx->freport.tlt_sub_pid[j]); + first_tlt = 0; + } + } + printf("]"); + + // PIDs list + printf(",\n \"pids\": ["); + int first_pid = 1; + for (int i = 0; i < 65536; i++) + { + if (demux_ctx->PIDs_programs[i] == 0) + continue; + + if (!first_pid) + printf(", "); + printf("\n {\"pid\": %u, \"program\": %u, \"stream_type\": \"", i, + demux_ctx->PIDs_programs[i]->program_number); + + int is_dvb = 0, is_tlt = 0; + for (int j = 0; j < SUB_STREAMS_CNT; j++) + { + if (demux_ctx->freport.dvb_sub_pid[j] == i) + { + printf("DVB Subtitles"); + is_dvb = 1; + break; + } + if (demux_ctx->freport.tlt_sub_pid[j] == i) + { + printf("Teletext Subtitles"); + is_tlt = 1; + break; + } + } + if (!is_dvb && !is_tlt) + { + const char *stream_desc = desc[demux_ctx->PIDs_programs[i]->printable_stream_type]; + if (stream_desc) + { + json_escape_string(stream_desc, escaped_str, sizeof(escaped_str)); + printf("%s", escaped_str); + } + else + { + printf("Unknown"); + } + } + printf("\"}"); + first_pid = 0; + } + printf("\n ]"); + } + + // Programs array + printf(",\n \"programs\": ["); + struct cap_info *program; + int first_program = 1; + if (list_empty(&demux_ctx->cinfo_tree.all_stream)) + { + // No programs, but we still need to output EIA-608/CEA-708 if available + dec_ctx = update_decoder_list_cinfo(ctx, NULL); + if (dec_ctx && (dec_ctx->cc_stats[0] > 0 || dec_ctx->cc_stats[1] > 0 || + dec_ctx->cc_stats[2] > 0 || dec_ctx->cc_stats[3] > 0)) + { + printf("\n {"); + printf("\n \"atsc_closed_caption\": {"); + if (dec_ctx->cc_stats[0] > 0 || dec_ctx->cc_stats[1] > 0) + { + printf("\n \"eia608\": {"); + printf("\n \"present\": true"); + if (ctx->freport.data_from_608) + { + printf(",\n \"xds\": %s", + ctx->freport.data_from_608->xds ? "true" : "false"); + printf(",\n \"cc1\": %s", + ctx->freport.data_from_608->cc_channels[0] ? "true" : "false"); + printf(",\n \"cc2\": %s", + ctx->freport.data_from_608->cc_channels[1] ? "true" : "false"); + printf(",\n \"cc3\": %s", + ctx->freport.data_from_608->cc_channels[2] ? "true" : "false"); + printf(",\n \"cc4\": %s", + ctx->freport.data_from_608->cc_channels[3] ? "true" : "false"); + } + printf("\n }"); + } + else + { + printf("\n \"eia608\": {\"present\": false}"); + } + if (dec_ctx->cc_stats[2] > 0 || dec_ctx->cc_stats[3] > 0) + { + printf(","); + printf("\n \"cea708\": {"); + printf("\n \"present\": true"); + if (ctx->freport.data_from_708) + { + printf(",\n \"services\": ["); + int first_svc = 1; + for (int i = 0; i < CCX_DTVCC_MAX_SERVICES; i++) + { + if (ctx->freport.data_from_708->services[i] != 0) + { + if (!first_svc) + printf(", "); + printf("%d", i); + first_svc = 0; + } + } + printf("]"); + printf(",\n \"primary_language_present\": %s", + ctx->freport.data_from_708->services[1] ? "true" : "false"); + printf(",\n \"secondary_language_present\": %s", + ctx->freport.data_from_708->services[2] ? "true" : "false"); + } + printf("\n }"); + } + else + { + printf(","); + printf("\n \"cea708\": {\"present\": false}"); + } + printf("\n }"); + printf("\n }"); + } + } + + list_for_each_entry(program, &demux_ctx->cinfo_tree.pg_stream, pg_stream, struct cap_info) + { + if (!first_program) + printf(","); + printf("\n {"); + printf("\n \"program_number\": %u", program->program_number); + + // DVB Subtitles + struct cap_info *info = get_sib_stream_by_type(program, CCX_CODEC_DVB); + printf(",\n \"dvb_subtitles\": %s", info ? "true" : "false"); + + // Teletext + info = get_sib_stream_by_type(program, CCX_CODEC_TELETEXT); + printf(",\n \"teletext\": {"); + if (info) + { + printf("\n \"present\": true"); + dec_ctx = update_decoder_list_cinfo(ctx, info); + if (dec_ctx && dec_ctx->codec == CCX_CODEC_TELETEXT) + { + struct TeletextCtx *tlt_ctx = dec_ctx->private_data; + if (tlt_ctx) + { + printf(",\n \"pages_with_subtitles\": ["); + int first_page = 1; + for (int i = 0; i < MAX_TLT_PAGES; i++) + { + if (tlt_ctx->seen_sub_page[i] != 0) + { + if (!first_page) + printf(", "); + printf("%d", i); + first_page = 0; + } + } + printf("]"); + } + } + } + else + { + printf("\n \"present\": false"); + } + printf("\n }"); + + // ATSC Closed Caption + info = get_sib_stream_by_type(program, CCX_CODEC_ATSC_CC); + printf(",\n \"atsc_closed_caption\": {"); + if (info) + { + printf("\n \"present\": true"); + dec_ctx = update_decoder_list_cinfo(ctx, info); + if (dec_ctx) + { + if (dec_ctx->cc_stats[0] > 0 || dec_ctx->cc_stats[1] > 0) + { + printf(","); + printf("\n \"eia608\": {"); + printf("\n \"present\": true"); + if (ctx->freport.data_from_608) + { + printf(",\n \"xds\": %s", + ctx->freport.data_from_608->xds ? "true" : "false"); + printf(",\n \"cc1\": %s", + ctx->freport.data_from_608->cc_channels[0] ? "true" : "false"); + printf(",\n \"cc2\": %s", + ctx->freport.data_from_608->cc_channels[1] ? "true" : "false"); + printf(",\n \"cc3\": %s", + ctx->freport.data_from_608->cc_channels[2] ? "true" : "false"); + printf(",\n \"cc4\": %s", + ctx->freport.data_from_608->cc_channels[3] ? "true" : "false"); + } + printf("\n }"); + } + else + { + printf(","); + printf("\n \"eia608\": {\"present\": false}"); + } + if (dec_ctx->cc_stats[2] > 0 || dec_ctx->cc_stats[3] > 0) + { + printf(","); + printf("\n \"cea708\": {"); + printf("\n \"present\": true"); + if (ctx->freport.data_from_708) + { + printf(",\n \"services\": ["); + int first_svc = 1; + for (int i = 0; i < CCX_DTVCC_MAX_SERVICES; i++) + { + if (ctx->freport.data_from_708->services[i] != 0) + { + if (!first_svc) + printf(", "); + printf("%d", i); + first_svc = 0; + } + } + printf("]"); + printf(",\n \"primary_language_present\": %s", + ctx->freport.data_from_708->services[1] ? "true" : "false"); + printf(",\n \"secondary_language_present\": %s", + ctx->freport.data_from_708->services[2] ? "true" : "false"); + } + printf("\n }"); + } + else + { + printf(","); + printf("\n \"cea708\": {\"present\": false}"); + } + } + } + else + { + printf("\n \"present\": false"); + } + printf("\n }"); + + // Video properties + info = get_best_sib_stream(program); + if (info) + { + dec_ctx = update_decoder_list_cinfo(ctx, info); + if (dec_ctx && dec_ctx->in_bufferdatatype == CCX_PES && + (demux_ctx->stream_mode == CCX_SM_TRANSPORT || + demux_ctx->stream_mode == CCX_SM_PROGRAM || + demux_ctx->stream_mode == CCX_SM_ASF || + demux_ctx->stream_mode == CCX_SM_WTV)) + { + printf(",\n \"video\": {"); + printf("\n \"width\": %u", dec_ctx->current_hor_size); + printf(",\n \"height\": %u", dec_ctx->current_vert_size); + const char *aspect_str = aspect_ratio_types[dec_ctx->current_aspect_ratio]; + if (aspect_str) + { + json_escape_string(aspect_str, escaped_str, sizeof(escaped_str)); + printf(",\n \"aspect_ratio\": \"%s\"", escaped_str); + } + else + { + printf(",\n \"aspect_ratio\": \"Unknown\""); + } + const char *fps_str = framerates_types[dec_ctx->current_frame_rate]; + if (fps_str) + { + json_escape_string(fps_str, escaped_str, sizeof(escaped_str)); + printf(",\n \"frame_rate\": \"%s\"", escaped_str); + } + else + { + printf(",\n \"frame_rate\": \"Unknown\""); + } + printf("\n }"); + } + } + + printf("\n }"); + first_program = 0; + } + printf("\n ]"); + + // MPEG-4 Timed Text + printf(",\n \"mpeg4_timed_text\": {"); + printf("\n \"present\": %s", ctx->freport.mp4_cc_track_cnt > 0 ? "true" : "false"); + if (ctx->freport.mp4_cc_track_cnt > 0) + { + printf(",\n \"track_count\": %u", ctx->freport.mp4_cc_track_cnt); + } + printf("\n }"); + + printf("\n}\n"); +} diff --git a/src/lib_ccx/telxcc.c b/src/lib_ccx/telxcc.c index bcb0a051f..a49cbfa21 100644 --- a/src/lib_ccx/telxcc.c +++ b/src/lib_ccx/telxcc.c @@ -884,11 +884,48 @@ void process_page(struct TeletextCtx *ctx, teletext_page_t *page, struct cc_subt } else { - // OK, the old and new buffer don't match. So write the old - telxcc_dump_prev_page(ctx, sub); - ctx->prev_hide_timestamp = page->hide_timestamp; - ctx->prev_show_timestamp = page->show_timestamp; + // Instead of dumping immediately, check if sentence ended + if (ctx->page_buffer_prev && ctx->page_buffer_cur) + { + int len = strlen(ctx->page_buffer_prev); + + // Skip trailing spaces/newlines + while (len > 0 && + (ctx->page_buffer_prev[len - 1] == ' ' || + ctx->page_buffer_prev[len - 1] == '\n' || + ctx->page_buffer_prev[len - 1] == '\r')) + { + len--; + } + + if (len > 0) + { + char last = ctx->page_buffer_prev[len - 1]; + + // Only flush if sentence looks complete + if (last == '.' || last == '?' || last == '!' || last == ':') + { + telxcc_dump_prev_page(ctx, sub); + ctx->prev_hide_timestamp = page->hide_timestamp; + ctx->prev_show_timestamp = page->show_timestamp; + } + else + { + // Merge current fragment into previous buffer + page_buffer_add_string(ctx, " "); + page_buffer_add_string(ctx, ctx->page_buffer_cur); + + ctx->prev_hide_timestamp = page->hide_timestamp; + ctx->prev_show_timestamp = page->show_timestamp; + + // Do not dump yet, just continue without flushing + break; + } + } + } } + + break; default: add_cc_sub_text(sub, ctx->page_buffer_cur, page->show_timestamp, diff --git a/src/rust/src/parser.rs b/src/rust/src/parser.rs index 47573a996..4e4873296 100644 --- a/src/rust/src/parser.rs +++ b/src/rust/src/parser.rs @@ -220,6 +220,17 @@ impl OptionsExt for Options { fn set_output_format(&mut self, args: &Args) { self.write_format_rewritten = true; + if let Some(ref raw_out) = args.raw_out { + if raw_out == "report=json" { + self.write_format = OutputFormat::Null; + self.messages_target = OutputTarget::Quiet; + self.print_file_reports = true; + self.report_format = ReportFormat::Json; + self.demux_cfg.ts_allprogram = true; + return; + } + } + if self.send_to_srv && args.out.unwrap_or(OutFormat::Null) != OutFormat::Bin { println!("Output format is changed to bin\n"); self.set_output_format_type(OutFormat::Bin);