From 6f3028b92b346bbd5d0b954adb14e18c0e031928 Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Thu, 12 Feb 2026 18:03:58 +0100
Subject: [PATCH 1/3] fix: load the parquet files recursively and use the
 DuckDB query to select the crawl and the subset, as was done in the Java
 Tour

---
 Makefile |  7 +++++--
 duck.py  | 32 +++++++++++++++++---------------
 2 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/Makefile b/Makefile
index a97cee0..1e120e3 100644
--- a/Makefile
+++ b/Makefile
@@ -62,11 +62,14 @@ CC-MAIN-2024-22.warc.paths.gz:
 	aws s3 ls s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ | awk '{print $$4}' | gzip -9 > CC-MAIN-2024-22.warc.paths.gz
 
 duck_local_files:
+ifndef LOCAL_DIR
+	$(error LOCAL_DIR is required. Usage: make duck_local_files LOCAL_DIR=/path/to/data)
+endif
 	@echo "warning! 300 gigabyte download"
-	python duck.py local_files
+	python duck.py local_files $(LOCAL_DIR)
 
 duck_ccf_local_files:
-	@echo "warning! only works on Common Crawl Foundadtion's development machine"
+	@echo "warning! only works on Common Crawl Foundation's development machine"
 	python duck.py ccf_local_files
 
 duck_cloudfront:
diff --git a/duck.py b/duck.py
index af1c677..0da7560 100644
--- a/duck.py
+++ b/duck.py
@@ -1,3 +1,5 @@
+from pathlib import Path
+
 import time
 import glob
 import json
@@ -48,23 +50,15 @@ def print_row_as_kv_list(row):
 
 all_algos = ('s3_glob', 'local_files', 'ccf_local_files', 'cloudfront_glob', 'cloudfront')
 
-def get_files(algo, crawl):
+def get_files(algo, crawl, local_prefix=None):
     if algo == 's3_glob':
         # 403 errors with and without credentials. you have to be commoncrawl-pds
         files = f's3://commoncrawl/cc-index/table/cc-main/warc/crawl={crawl}/subset=warc/*.parquet'
         raise NotImplementedError('will cause a 403')
     elif algo == 'local_files':
-        files = os.path.expanduser(f'~/commmoncrawl/cc-index/table/cc-main/warc/crawl={crawl}/subset=warc/*.parquet')
-        files = glob.glob(files)
-        # did we already download? we expect 300 files of about a gigabyte
-        if len(files) < 250:
-            index_download_advice('~', crawl)
-            exit(1)
+        files = [str(f) for f in Path(local_prefix).expanduser().rglob('*.parquet')]
     elif algo == 'ccf_local_files':
-        files = glob.glob(f'/home/cc-pds/commoncrawl/cc-index/table/cc-main/warc/crawl={crawl}/subset=warc/*.parquet')
-        if len(files) < 250:
-            index_download_advice('/home/cc-pds', crawl)
-            exit(1)
+        files = [str(f) for f in Path('/home/cc-pds/commoncrawl/cc-index/table/cc-main/warc').rglob('*.parquet')]
     elif algo == 'cloudfront_glob':
         # duckdb can't glob this, same reason as s3_glob above
         files = f'https://data.commoncrawl.org/cc-index/table/cc-main/warc/crawl={crawl}/subset=warc/*.parquet'
@@ -82,12 +76,12 @@ def get_files(algo, crawl):
 
     return files
 
-def main(algo, crawl):
+def main(algo, crawl, local_prefix=None):
     windows = True if platform.system() == 'Windows' else False
     if windows:
         # windows stdout is often cp1252
         sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
-    files = get_files(algo, crawl)
+    files = get_files(algo, crawl, local_prefix)
 
     retries_left = 100
     while True:
@@ -113,7 +107,7 @@ def main(algo, crawl):
     retries_left = 100
     while True:
         try:
-            print(duckdb.sql('SELECT COUNT(*) FROM ccindex;'))
+            print(duckdb.sql(f"SELECT COUNT(*) FROM ccindex WHERE subset = 'warc' and crawl = '{crawl}';"))
             break
         except duckdb.InvalidInputException as e:
             # duckdb.duckdb.InvalidInputException: Invalid Input Error: No magic bytes found at end of file 'https://...'
@@ -176,13 +170,21 @@ def main(algo, crawl):
 
 if __name__ == '__main__':
     crawl = 'CC-MAIN-2024-22'
+    local_prefix = None
 
     if len(sys.argv) > 1:
         algo = sys.argv[1]
         if algo == 'help':
             print('possible algos:', all_algos)
             exit(1)
+        elif algo == 'local_files':
+            if len(sys.argv) < 3:
+                print('for the local_files algo, you must provide a local prefix as the second argument')
+                exit(1)
+            else:
+                local_prefix = sys.argv[2]
+                print(f'using local prefix {local_prefix}')
     else:
         algo = 'cloudfront'
     print('using algo: ', algo)
-    main(algo, crawl)
+    main(algo, crawl, local_prefix)

From 8adcbb0a35b16f8ee976a5540571d74b82799660 Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Thu, 12 Feb 2026 19:55:30 +0100
Subject: [PATCH 2/3] chore(docs): update documentation with instructions on
 how to download the crawl data with and without the AWS CLI

---
 README.md | 49 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 47 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 2754c6a..df1505c 100644
--- a/README.md
+++ b/README.md
@@ -549,9 +549,54 @@ The program then writes that one record into a local Parquet file, does a second
 
 ### Bonus: download a full crawl index and query with DuckDB
 
-If you want to run many of these queries, and you have a lot of disk space, you'll want to download the 300 gigabyte index and query it repeatedly. Run
+If you want to run many of these queries and you have a lot of disk space, you'll want to download the 300 gigabyte index and query it repeatedly.
 
-```make duck_local_files```
+> [!IMPORTANT]
+> If you are using the Common Crawl Foundation's development server, these files are already downloaded and you can run ```make duck_ccf_local_files```
+
+There are two ways to download the crawl index. If you can reach the `s3://commoncrawl` bucket with the AWS CLI, run:
+
+```shell
+mkdir -p 'crawl=CC-MAIN-2024-22/subset=warc'
+aws s3 sync s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ 'crawl=CC-MAIN-2024-22/subset=warc'
+```
+
+If you don't have access through the AWS CLI, download the files over HTTPS instead:
+
+```shell
+mkdir -p 'crawl=CC-MAIN-2024-22/subset=warc'
+cd 'crawl=CC-MAIN-2024-22/subset=warc'
+
+wget https://data.commoncrawl.org/crawl-data/CC-MAIN-2024-22/cc-index-table.paths.gz
+gunzip cc-index-table.paths.gz
+
+grep 'subset=warc' cc-index-table.paths | \
+  awk '{print "https://data.commoncrawl.org/" $1, $1}' | \
+  xargs -n 2 -P 10 sh -c '
+    echo "Downloading: $2"
+    wget -O "$(basename "$2")" "$1"
+  ' _
+
+rm cc-index-table.paths
+cd -
+```
+
+Either way, the resulting file structure should look something like this:
+
+```shell
+tree my_data
+my_data
+└── crawl=CC-MAIN-2024-22
+    └── subset=warc
+        ├── part-00000-4dd72944-e9c0-41a1-9026-dfd2d0615bf2.c000.gz.parquet
+        ├── part-00001-4dd72944-e9c0-41a1-9026-dfd2d0615bf2.c000.gz.parquet
+        ├── part-00002-4dd72944-e9c0-41a1-9026-dfd2d0615bf2.c000.gz.parquet
+```
+
+Then run `make duck_local_files LOCAL_DIR=/path/to/the/downloaded/data` to execute the same query as above, this time against your local copy of the index files. If `LOCAL_DIR` is not set, the Makefile target stops with a usage message.
From 457c8152bb082304235397b4108ec339c8271a7d Mon Sep 17 00:00:00 2001
From: Luca Foppiano
Date: Thu, 12 Feb 2026 20:01:07 +0100
Subject: [PATCH 3/3] fix: parametrize the crawl name

---
 duck.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/duck.py b/duck.py
index 0da7560..09e83d8 100644
--- a/duck.py
+++ b/duck.py
@@ -119,17 +119,17 @@ def main(algo, crawl, local_prefix=None):
             else:
                 raise
 
-    sq2 = f'''
+    sq2 = f"""
     select *
     from ccindex
     where subset = 'warc'
-      and crawl = 'CC-MAIN-2024-22'
+      and crawl = '{crawl}'
       and url_host_tld = 'org'  -- help the query optimizer
       and url_host_registered_domain = 'wikipedia.org'  -- ditto
      and url = 'https://an.wikipedia.org/wiki/Escopete'
     ;
-    '''
+    """
 
     row2 = duckdb.sql(sq2)
 
     print('our one row')
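For readers who want the gist of the three patches without applying them, here is a minimal sketch of the resulting flow, not the repository's actual duck.py. It assumes the `duckdb` Python package is installed and that the index was downloaded under a hypothetical `~/my_data` prefix as described in the README; `hive_partitioning=True` is an assumption here to expose the `crawl=...`/`subset=...` directory names as queryable columns, which duck.py presumably handles in its own way when it builds the `ccindex` view.

```python
# A minimal sketch, not the repository's duck.py: recursively collect the parquet
# files under a local prefix and let the DuckDB query filter on crawl and subset.
from pathlib import Path

import duckdb

# '~/my_data' is a hypothetical download location; point it at your LOCAL_DIR.
local_prefix = Path('~/my_data').expanduser()
files = [str(f) for f in local_prefix.rglob('*.parquet')]

# hive_partitioning exposes the crawl=... and subset=... directory names as
# columns, which is what the WHERE clause below relies on.
duckdb.read_parquet(files, hive_partitioning=True).create_view('ccindex')

crawl = 'CC-MAIN-2024-22'
print(duckdb.sql(
    f"SELECT COUNT(*) FROM ccindex WHERE subset = 'warc' AND crawl = '{crawl}'"
))
```

Because the filtering happens inside the query rather than in the file glob, the same view works whether the prefix holds one crawl or several, which is what lets the `ccf_local_files` path glob the whole `cc-main/warc` tree in patch 1.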