7 changes: 5 additions & 2 deletions Makefile
@@ -62,11 +62,14 @@ CC-MAIN-2024-22.warc.paths.gz:
aws s3 ls s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ | awk '{print $$4}' | gzip -9 > CC-MAIN-2024-22.warc.paths.gz

duck_local_files:
ifndef LOCAL_DIR
$(error LOCAL_DIR is required. Usage: make duck_local_files LOCAL_DIR=/path/to/data)
endif
@echo "warning! 300 gigabyte download"
python duck.py local_files
python duck.py local_files $(LOCAL_DIR)

duck_ccf_local_files:
@echo "warning! only works on Common Crawl Foundadtion's development machine"
@echo "warning! only works on Common Crawl Foundation's development machine"
python duck.py ccf_local_files

duck_cloudfront:
49 changes: 47 additions & 2 deletions README.md
@@ -549,9 +549,54 @@ The program then writes that one record into a local Parquet file, does a second

### Bonus: download a full crawl index and query with DuckDB

If you want to run many of these queries, and you have a lot of disk space, you'll want to download the 300 gigabyte index and query it repeatedly. Run
If you plan to run many of these queries, and you have a lot of disk space, you'll want to download the 300 gigabyte index and query it repeatedly.

```make duck_local_files```
> [!IMPORTANT]
> If you happen to be using the Common Crawl Foundation development server, we've already downloaded these files, and you can run ```make duck_ccf_local_files```

To download the crawl index, there are two options: if you have access to the CCF AWS buckets, run:

```shell
mkdir -p 'crawl=CC-MAIN-2024-22/subset=warc'
aws s3 sync s3://commoncrawl/cc-index/table/cc-main/warc/crawl=CC-MAIN-2024-22/subset=warc/ 'crawl=CC-MAIN-2024-22/subset=warc'
```

Otherwise, if you don't have access through the AWS CLI:

```shell
mkdir -p 'crawl=CC-MAIN-2024-22/subset=warc'
cd 'crawl=CC-MAIN-2024-22/subset=warc'

wget https://data.commoncrawl.org/crawl-data/CC-MAIN-2024-22/cc-index-table.paths.gz
gunzip cc-index-table.paths.gz

grep 'subset=warc' cc-index-table.paths | \
awk '{print "https://data.commoncrawl.org/" $1, $1}' | \
xargs -n 2 -P 10 sh -c '
echo "Downloading: $2"
mkdir -p "$(dirname "$2")" &&
wget -O "$2" "$1"
' _

rm cc-index-table.paths
cd -
```

Either way, the file structure should look something like this:
```shell
tree my_data
my_data
└── crawl=CC-MAIN-2024-22
└── subset=warc
├── part-00000-4dd72944-e9c0-41a1-9026-dfd2d0615bf2.c000.gz.parquet
├── part-00001-4dd72944-e9c0-41a1-9026-dfd2d0615bf2.c000.gz.parquet
├── part-00002-4dd72944-e9c0-41a1-9026-dfd2d0615bf2.c000.gz.parquet
```
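
If you want a quick sanity check that the download completed, here's a minimal sketch (assuming the files ended up under `my_data/`; a full crawl index is roughly 300 Parquet files of about a gigabyte each):

```python
from pathlib import Path

# Rough sanity check: a full crawl index is ~300 Parquet files, ~1 GB each.
files = list(Path('my_data').rglob('*.parquet'))
total_gb = sum(f.stat().st_size for f in files) / 1e9
print(f'{len(files)} parquet files, {total_gb:.0f} GB total')
```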


Then, you can run `make duck_local_files LOCAL_DIR=/path/to/the/downloaded/data` to run the same query as above, but this time using your local copy of the index files.

If the files aren't already downloaded, this command will give you
download instructions.
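
If you'd rather poke at the local shards directly from Python, here is a minimal sketch (assuming DuckDB is installed, the files live under `my_data/`, and reusing the `ccindex` view name from `duck.py`):

```python
import duckdb

# Build a view over the local index shards; hive_partitioning picks up
# the crawl= and subset= directory names as columns.
duckdb.sql("""
    CREATE VIEW ccindex AS
    SELECT * FROM read_parquet('my_data/**/*.parquet', hive_partitioning=1)
""")

print(duckdb.sql(
    "SELECT COUNT(*) FROM ccindex WHERE subset = 'warc' AND crawl = 'CC-MAIN-2024-22'"
))
```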
38 changes: 20 additions & 18 deletions duck.py
@@ -1,3 +1,5 @@
from pathlib import Path

import time
import glob
import json
@@ -48,23 +50,15 @@ def print_row_as_kv_list(row):
all_algos = ('s3_glob', 'local_files', 'ccf_local_files', 'cloudfront_glob', 'cloudfront')


def get_files(algo, crawl):
def get_files(algo, crawl, local_prefix=None):
if algo == 's3_glob':
# 403 errors with and without credentials. you have to be commoncrawl-pds
files = f's3://commoncrawl/cc-index/table/cc-main/warc/crawl={crawl}/subset=warc/*.parquet'
raise NotImplementedError('will cause a 403')
elif algo == 'local_files':
files = os.path.expanduser(f'~/commmoncrawl/cc-index/table/cc-main/warc/crawl={crawl}/subset=warc/*.parquet')
files = glob.glob(files)
# did we already download? we expect 300 files of about a gigabyte
if len(files) < 250:
index_download_advice('~', crawl)
exit(1)
files = [str(f) for f in Path(os.path.expanduser(local_prefix)).rglob('*.parquet')]
elif algo == 'ccf_local_files':
files = glob.glob(f'/home/cc-pds/commoncrawl/cc-index/table/cc-main/warc/crawl={crawl}/subset=warc/*.parquet')
if len(files) < 250:
index_download_advice('/home/cc-pds', crawl)
exit(1)
files = [str(f) for f in Path('/home/cc-pds/commoncrawl/cc-index/table/cc-main/warc').rglob('*.parquet')]
elif algo == 'cloudfront_glob':
# duckdb can't glob this, same reason as s3_glob above
files = f'https://data.commoncrawl.org/cc-index/table/cc-main/warc/crawl={crawl}/subset=warc/*.parquet'
@@ -82,12 +76,12 @@ def get_files(algo, crawl):
return files


def main(algo, crawl):
def main(algo, crawl, local_prefix=None):
windows = True if platform.system() == 'Windows' else False
if windows:
# windows stdout is often cp1252
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
files = get_files(algo, crawl)
files = get_files(algo, crawl, local_prefix)
retries_left = 100

while True:
@@ -113,7 +107,7 @@ def main(algo, crawl):
retries_left = 100
while True:
try:
print(duckdb.sql('SELECT COUNT(*) FROM ccindex;'))
print(duckdb.sql(f"SELECT COUNT(*) FROM ccindex WHERE subset = 'warc' and crawl = '{crawl}';"))
break
except duckdb.InvalidInputException as e:
# duckdb.duckdb.InvalidInputException: Invalid Input Error: No magic bytes found at end of file 'https://...'
@@ -125,17 +119,17 @@
else:
raise

sq2 = f'''
sq2 = f"""
select
*
from ccindex
where subset = 'warc'
and crawl = 'CC-MAIN-2024-22'
and crawl = '{crawl}'
and url_host_tld = 'org' -- help the query optimizer
and url_host_registered_domain = 'wikipedia.org' -- ditto
and url = 'https://an.wikipedia.org/wiki/Escopete'
;
'''
"""

row2 = duckdb.sql(sq2)
print('our one row')
@@ -176,13 +170,21 @@ def main(algo, crawl):

if __name__ == '__main__':
crawl = 'CC-MAIN-2024-22'
local_prefix = None
if len(sys.argv) > 1:
algo = sys.argv[1]
if algo == 'help':
print('possible algos:', all_algos)
exit(1)
elif algo == 'local_files':
if len(sys.argv) < 3:
print('for local_files algo, you must provide a local prefix as the second argument')
exit(1)
else:
local_prefix = sys.argv[2]
print(f"Using local prefix {local_prefix}")
else:
algo = 'cloudfront'
print('using algo: ', algo)

main(algo, crawl)
main(algo, crawl, local_prefix)