Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions cdx_toolkit/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
from cdx_toolkit.filter_cdx.command import run_filter_cdx
from cdx_toolkit.filter_cdx.args import add_filter_cdx_args

from cdx_toolkit.filter_warc.command import run_warcer_by_cdx
from cdx_toolkit.filter_warc.args import add_warcer_by_cdx_args
from cdx_toolkit.filter_warc.command import run_repackage
from cdx_toolkit.filter_warc.args import add_repackage_args


LOGGER = logging.getLogger(__name__)
Expand Down Expand Up @@ -124,12 +124,12 @@ def main(args=None):
warc.add_argument('url')
warc.set_defaults(func=warcer)

warc_by_cdx = subparsers.add_parser(
'warc_by_cdx',
help='iterate over capture content based on an CDX index file, creating a warc'
repackage = subparsers.add_parser(
'repackage',
help='repackage WARC ranges from a CDX/SQL/CSV source into a new WARC'
)
add_warcer_by_cdx_args(warc_by_cdx)
warc_by_cdx.set_defaults(func=run_warcer_by_cdx)
add_repackage_args(repackage)
repackage.set_defaults(func=run_repackage)

filter_cdx = subparsers.add_parser('filter_cdx', help='Filter CDX files based on SURT prefixes whitelist')
add_filter_cdx_args(filter_cdx)
Expand Down
89 changes: 70 additions & 19 deletions cdx_toolkit/filter_warc/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,59 +5,109 @@
logger = logging.getLogger(__name__)


def add_warcer_by_cdx_args(parser: argparse.ArgumentParser):
def add_repackage_args(parser: argparse.ArgumentParser):
# --- CDX source ---
parser.add_argument(
'--cdx-path',
type=str,
default=None,
help='Path to CDX index file (local or remote, e.g. S3). Required if target source is set to `cdx`.',
help='Path to CDX index file (local or remote, e.g. S3). Used when --target-source cdx.',
)
parser.add_argument(
'--cdx-glob',
type=str,
default=None,
help='a glob pattern for read from multiple CDX indices',
)
# --- SQL source (--target-source sql --engine athena|duckdb) ---
parser.add_argument(
'--athena-hostnames',
'--engine',
type=str,
nargs="+",
default=None,
help=('Hostnames to filter for via Athena (whitelist). Use this OR --athena-query/'
'--athena-query-file (mutually exclusive) when target source is `athena`.'),
choices=['athena', 'duckdb'],
help='SQL engine for the columnar index. Required when --target-source sql.',
)
parser.add_argument(
'--athena-query',
'--hostnames',
type=str,
nargs='+',
default=None,
help=('Raw Athena SQL to run instead of the hostname-based query (power users). The query '
'must SELECT the columns warc_filename, warc_record_offset, warc_record_length. '
'Mutually exclusive with --athena-hostnames and --athena-query-file.'),
help=('Exact hostnames (url_host_name, e.g. www.example.com) to filter for via the SQL '
'index. Combine with --domains; mutually exclusive with --query/--query-file. '
'Combine with the global --crawl to restrict the scan to specific crawls '
'(strongly recommended for cost).'),
)
parser.add_argument(
'--athena-query-file',
'--domains',
type=str,
nargs='+',
default=None,
help='Path to a file containing the raw Athena SQL (alternative to --athena-query).',
help=('Registered domains (url_host_registered_domain, e.g. example.com) to filter for via '
'the SQL index; also matches subdomains. Combine with --hostnames; mutually exclusive '
'with --query/--query-file.'),
)
parser.add_argument(
'--query',
type=str,
default=None,
help=('Raw SQL to run instead of the hostname-based query (power users). Must SELECT the '
'columns warc_filename, warc_record_offset, warc_record_length. Engine-specific '
'dialect. Mutually exclusive with --hostnames and --query-file.'),
)
parser.add_argument(
'--query-file',
type=str,
default=None,
help='Path to a file containing the raw SQL (alternative to --query).',
)
parser.add_argument(
'--athena-database',
type=str,
default=None,
help='Athena database. Required if target source is set to `athena`.',
help='Athena database (engine=athena). Defaults to `ccindex`.',
)
parser.add_argument(
'--athena-s3-output',
type=str,
default=None,
help='Athena S3 output location. Required if target source is set to `athena`.',
help='Athena S3 output location (engine=athena). Required for engine=athena.',
)
parser.add_argument(
'--duckdb-index-path',
type=str,
default='s3://commoncrawl/cc-index/table/cc-main/warc/',
help='Base S3 path to the CC columnar index parquet (engine=duckdb).',
)
parser.add_argument(
'--confirm-athena-cost',
'--confirm-cost',
action='store_true',
help=('Skip the Athena cost-confirmation prompt and run even unpartitioned / large-scan '
help=('Skip the cost-confirmation prompt and run even unpartitioned / large-scan SQL '
'queries. Athena bills per TB scanned; restrict with --crawl to reduce cost.'),
)
# --- CSV source ---
parser.add_argument(
'--csv-path',
type=str,
default=None,
help='Path to a range-jobs CSV/TSV (local or remote). Used when --target-source csv.',
)
# --- Range-jobs materialization (any source) ---
parser.add_argument(
'--range-jobs-output',
type=str,
default=None,
help='If set, write each generated RangeJob to this CSV (filename,offset,length by default).',
)
parser.add_argument(
'--no-fetch',
action='store_true',
help='Only generate range jobs (write --range-jobs-output); skip fetching/writing WARCs.',
)
parser.add_argument(
'--csv-self-contained',
action='store_true',
help='Write full URLs (url,offset,length) to --range-jobs-output instead of relative filenames.',
)
parser.add_argument('--prefix', default='TEST', help='prefix for the output warc filename')
parser.add_argument(
'--subprefix',
Expand Down Expand Up @@ -126,8 +176,9 @@ def add_warcer_by_cdx_args(parser: argparse.ArgumentParser):
'--target-source',
action='store',
default='cdx',
help=('Source from that the filter targets are loaded (available options: `cdx`, `athena`; '
'defaults to `cdx`). For `athena`, use the global --crawl to restrict the scan to '
'specific crawls (strongly recommended; Athena bills per TB scanned).'),
choices=['cdx', 'sql', 'csv'],
help=('Where range jobs come from: `cdx` (index files), `sql` (columnar index via '
'--engine athena|duckdb), or `csv` (a range-jobs CSV). Defaults to `cdx`. For `sql`, '
'use the global --crawl to restrict the scan to specific crawls (recommended for cost).'),
)
return parser
Loading