commoncrawl · malteos · Jun 12, 2026 · Jun 12, 2026
diff --git a/cdx_toolkit/cli.py b/cdx_toolkit/cli.py
@@ -12,8 +12,8 @@
 from cdx_toolkit.filter_cdx.command import run_filter_cdx
 from cdx_toolkit.filter_cdx.args import add_filter_cdx_args
 
-from cdx_toolkit.filter_warc.command import run_warcer_by_cdx
-from cdx_toolkit.filter_warc.args import add_warcer_by_cdx_args
+from cdx_toolkit.filter_warc.command import run_repackage
+from cdx_toolkit.filter_warc.args import add_repackage_args
 
 
 LOGGER = logging.getLogger(__name__)
@@ -124,12 +124,12 @@ def main(args=None):
     warc.add_argument('url')
     warc.set_defaults(func=warcer)
 
-    warc_by_cdx = subparsers.add_parser(
-        'warc_by_cdx',
-        help='iterate over capture content based on an CDX index file, creating a warc'
+    repackage = subparsers.add_parser(
+        'repackage',
+        help='repackage WARC ranges from a CDX/SQL/CSV source into a new WARC'
     )
-    add_warcer_by_cdx_args(warc_by_cdx)
-    warc_by_cdx.set_defaults(func=run_warcer_by_cdx)
+    add_repackage_args(repackage)
+    repackage.set_defaults(func=run_repackage)
 
     filter_cdx = subparsers.add_parser('filter_cdx', help='Filter CDX files based on SURT prefixes whitelist')
     add_filter_cdx_args(filter_cdx)

diff --git a/cdx_toolkit/filter_warc/args.py b/cdx_toolkit/filter_warc/args.py
@@ -5,59 +5,109 @@
 logger = logging.getLogger(__name__)
 
 
-def add_warcer_by_cdx_args(parser: argparse.ArgumentParser):
+def add_repackage_args(parser: argparse.ArgumentParser):
+    # --- CDX source ---
     parser.add_argument(
         '--cdx-path',
         type=str,
         default=None,
-        help='Path to CDX index file (local or remote, e.g. S3). Required if target source is set to `cdx`.',
+        help='Path to CDX index file (local or remote, e.g. S3). Used when --target-source cdx.',
     )
     parser.add_argument(
         '--cdx-glob',
         type=str,
         default=None,
         help='a glob pattern for read from multiple CDX indices',
     )
+    # --- SQL source (--target-source sql --engine athena|duckdb) ---
     parser.add_argument(
-        '--athena-hostnames',
+        '--engine',
         type=str,
-        nargs="+",
         default=None,
-        help=('Hostnames to filter for via Athena (whitelist). Use this OR --athena-query/'
-              '--athena-query-file (mutually exclusive) when target source is `athena`.'),
+        choices=['athena', 'duckdb'],
+        help='SQL engine for the columnar index. Required when --target-source sql.',
     )
     parser.add_argument(
-        '--athena-query',
+        '--hostnames',
         type=str,
+        nargs='+',
         default=None,
-        help=('Raw Athena SQL to run instead of the hostname-based query (power users). The query '
-              'must SELECT the columns warc_filename, warc_record_offset, warc_record_length. '
-              'Mutually exclusive with --athena-hostnames and --athena-query-file.'),
+        help=('Exact hostnames (url_host_name, e.g. www.example.com) to filter for via the SQL '
+              'index. Combine with --domains; mutually exclusive with --query/--query-file. '
+              'Combine with the global --crawl to restrict the scan to specific crawls '
+              '(strongly recommended for cost).'),
     )
     parser.add_argument(
-        '--athena-query-file',
+        '--domains',
         type=str,
+        nargs='+',
         default=None,
-        help='Path to a file containing the raw Athena SQL (alternative to --athena-query).',
+        help=('Registered domains (url_host_registered_domain, e.g. example.com) to filter for via '
+              'the SQL index; also matches subdomains. Combine with --hostnames; mutually exclusive '
+              'with --query/--query-file.'),
+    )
+    parser.add_argument(
+        '--query',
+        type=str,
+        default=None,
+        help=('Raw SQL to run instead of the hostname/domain-based query (power users). Must SELECT '
+              'the columns warc_filename, warc_record_offset, warc_record_length. Engine-specific '
+              'dialect. Mutually exclusive with --hostnames, --domains and --query-file.'),
+    )
+    parser.add_argument(
+        '--query-file',
+        type=str,
+        default=None,
+        help='Path to a file containing the raw SQL (alternative to --query).',
     )
     parser.add_argument(
         '--athena-database',
         type=str,
         default=None,
-        help='Athena database. Required if target source is set to `athena`.',
+        help='Athena database (engine=athena). Defaults to `ccindex`.',
     )
     parser.add_argument(
         '--athena-s3-output',
         type=str,
         default=None,
-        help='Athena S3 output location. Required if target source is set to `athena`.',
+        help='Athena S3 output location (engine=athena). Required for engine=athena.',
+    )
+    parser.add_argument(
+        '--duckdb-index-path',
+        type=str,
+        default='s3://commoncrawl/cc-index/table/cc-main/warc/',
+        help='Base S3 path to the CC columnar index parquet (engine=duckdb).',
     )
     parser.add_argument(
-        '--confirm-athena-cost',
+        '--confirm-cost',
         action='store_true',
-        help=('Skip the Athena cost-confirmation prompt and run even unpartitioned / large-scan '
+        help=('Skip the cost-confirmation prompt and run even unpartitioned / large-scan SQL '
               'queries. Athena bills per TB scanned; restrict with --crawl to reduce cost.'),
     )
+    # --- CSV source ---
+    parser.add_argument(
+        '--csv-path',
+        type=str,
+        default=None,
+        help='Path to a range-jobs CSV/TSV (local or remote). Used when --target-source csv.',
+    )
+    # --- Range-jobs materialization (any source) ---
+    parser.add_argument(
+        '--range-jobs-output',
+        type=str,
+        default=None,
+        help='If set, write each generated RangeJob to this CSV (filename,offset,length by default).',
+    )
+    parser.add_argument(
+        '--no-fetch',
+        action='store_true',
+        help='Only generate range jobs (write --range-jobs-output); skip fetching/writing WARCs.',
+    )
+    parser.add_argument(
+        '--csv-self-contained',
+        action='store_true',
+        help='Write full URLs (url,offset,length) to --range-jobs-output instead of relative filenames.',
+    )
     parser.add_argument('--prefix', default='TEST', help='prefix for the output warc filename')
     parser.add_argument(
         '--subprefix',
@@ -126,8 +176,9 @@ def add_warcer_by_cdx_args(parser: argparse.ArgumentParser):
         '--target-source',
         action='store',
         default='cdx',
-        help=('Source from that the filter targets are loaded (available options: `cdx`, `athena`; '
-              'defaults to `cdx`). For `athena`, use the global --crawl to restrict the scan to '
-              'specific crawls (strongly recommended; Athena bills per TB scanned).'),
+        choices=['cdx', 'sql', 'csv'],
+        help=('Where range jobs come from: `cdx` (index files), `sql` (columnar index via '
+              '--engine athena|duckdb), or `csv` (a range-jobs CSV). Defaults to `cdx`. For `sql`, '
+              'use the global --crawl to restrict the scan to specific crawls (recommended for cost).'),
     )
     return parser