diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml
index 116a692b..30f83ff2 100644
--- a/.github/workflows/checks.yml
+++ b/.github/workflows/checks.yml
@@ -35,7 +35,7 @@ jobs:
       - name: Build with Jekyll
         run: bundle exec jekyll build
         working-directory: ./docs
-      - name: Run Lychee
+      - name: Check online links (lychee)
         uses: lycheeverse/lychee-action@v2
         with:
           args: >-
@@ -45,4 +45,26 @@ jobs:
             --root-dir ${{ github.workspace }}/docs/_site
             ./_site
           workingDirectory: ./docs
-          fail: true
\ No newline at end of file
+          fail: true
+      - name: Set up Python for offline link check
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.14'
+      - name: Install Python deps
+        run: pip install -r requirements.txt
+      - name: Check offline links (check_links.py)
+        run: >-
+          python scripts/check_links.py
+          --offline --include-fragments
+          --index-files index.html
+          --root-dir docs/_site-offline
+          docs/_site-offline
+      - name: Check for surviving live-site links in offline tree
+        # Flags any https://docs.twinbasic.com/ reference left in
+        # _site-offline/ HTML outside <code>/<pre> blocks. After offlinify
+        # strips the jekyll-seo-tag block, anything surviving is a source
+        # link that points at the live site instead of using a relative or
+        # /tB/... permalink that resolves locally. The bare root URL
+        # (https://docs.twinbasic.com[/]) is exempt -- intentional "go to
+        # the live site" links are allowed.
+        run: python scripts/check_offline_live_links.py
\ No newline at end of file
diff --git a/.github/workflows/jekyll-gh-pages.yml b/.github/workflows/jekyll-gh-pages.yml
index f0e35a02..346d02ad 100644
--- a/.github/workflows/jekyll-gh-pages.yml
+++ b/.github/workflows/jekyll-gh-pages.yml
@@ -57,7 +57,7 @@ jobs:
         env:
           JEKYLL_ENV: production
           PAGES_REPO_NWO: "${{ github.repository }}"
-      - name: Run Lychee against the online tree
+      - name: Check online links (lychee)
         uses: lycheeverse/lychee-action@v2
         with:
           # --remap matches the fully-resolved file URI (not the raw href), so the pattern
@@ -68,6 +68,11 @@ jobs:
           # `--fallback-extensions html` mirrors what GitHub Pages does at request time:
           # an extensionless URL like `/FAQ` is served as `/FAQ.html`. Without the flag
           # lychee would flag every pretty permalink on the site.
+          #
+          # Lychee, not the Python checker, handles the online tree here because the
+          # `--remap` flag isn't implemented by scripts/check_links.py; the offline tree
+          # below has all baseurl prefixes already stripped by the offlinify plugin and
+          # so doesn't need it.
           args: >-
             --offline --include-fragments
             --fallback-extensions html
@@ -77,22 +82,34 @@ jobs:
             ./_site
           workingDirectory: ./docs
           fail: true
-      - name: Run Lychee against the offline tree
-        uses: lycheeverse/lychee-action@v2
+      - name: Set up Python for offline link check
+        uses: actions/setup-python@v5
         with:
-          # Strict check on `_site-offline/`: every link must resolve to an actual file
-          # under `file://`, with no extension fallback. Catches relative links in
-          # markdown sources that point at a permalink that doesn't match the rendered
-          # filename (e.g. `[Foo](Foo/)` when Jekyll wrote `Foo.html`, not
-          # `Foo/index.html`) -- the kind of breakage the online check above hides
-          # behind `--fallback-extensions html`.
-          args: >-
-            --offline --include-fragments
-            --index-files 'index.html'
-            --root-dir ${{ github.workspace }}/docs/_site-offline
-            ./_site-offline
-          workingDirectory: ./docs
-          fail: true
+          python-version: '3.14'
+      - name: Install Python deps
+        run: pip install -r requirements.txt
+      - name: Check offline links (check_links.py)
+        # Strict check on `_site-offline/`: every link must resolve to an actual file
+        # under `file://`, with no extension fallback. Catches relative links in
+        # markdown sources that point at a permalink that doesn't match the rendered
+        # filename (e.g. `[Foo](Foo/)` when Jekyll wrote `Foo.html`, not
+        # `Foo/index.html`) -- the kind of breakage the online check above hides
+        # behind `--fallback-extensions html`.
+        run: >-
+          python scripts/check_links.py
+          --offline --include-fragments
+          --index-files index.html
+          --root-dir docs/_site-offline
+          docs/_site-offline
+      - name: Check for surviving live-site links in offline tree
+        # Flags any https://docs.twinbasic.com/ reference left in
+        # _site-offline/ HTML outside <code>/<pre> blocks. After offlinify
+        # strips the jekyll-seo-tag block, anything surviving is a source
+        # link that points at the live site instead of using a relative or
+        # /tB/... permalink that resolves locally. The bare root URL
+        # (https://docs.twinbasic.com[/]) is exempt -- intentional "go to
+        # the live site" links are allowed.
+        run: python scripts/check_offline_live_links.py
       - name: Upload Pages artifact
         uses: actions/upload-pages-artifact@v5
         with:
diff --git a/docs/Miscellaneous/Documentation Development.md b/docs/Miscellaneous/Documentation Development.md
index 6105eac1..9ea54887 100644
--- a/docs/Miscellaneous/Documentation Development.md	
+++ b/docs/Miscellaneous/Documentation Development.md	
@@ -201,7 +201,7 @@ To check that none of the internal links in the most recent documentation build
 
     check.bat
 
-This runs three checks: [Lychee](https://github.com/lycheeverse/lychee) in offline mode against `_site/` (the live tree), the same against `_site-offline/` (the file://-browsable mirror), and a small Python pass over `_site-offline/` that flags any surviving `https://docs.twinbasic.com/` link --- the offline mirror should not navigate back to the live docs site.
+This runs three checks: `scripts/check_links.py` against `_site/` (the live tree, in offline mode), the same against `_site-offline/` (the file://-browsable mirror), and `scripts/check_offline_live_links.py` over `_site-offline/`, which flags any surviving `https://docs.twinbasic.com/` link --- the offline mirror should not navigate back to the live docs site. CI runs the equivalent three checks on every pull request and on every push to `staging`, except that it uses Lychee rather than `scripts/check_links.py` for the `_site/` pass.
 
 ### Building and Local Serving
 
diff --git a/docs/_plugins/offlinify.md b/docs/_plugins/offlinify.md
index 979afe7b..549a8db9 100644
--- a/docs/_plugins/offlinify.md
+++ b/docs/_plugins/offlinify.md
@@ -300,22 +300,25 @@ The offline build touches the following files:
 | `docs/_config.yml` | `also_build_offline: true` (default-on) and `exclude: [_site-offline]` (keeps Jekyll's watcher from rebuilding on the plugin's own output). |
 | `docs/build.bat` | Plain `bundle exec jekyll build` — produces `_site/`, `_site-offline/`, and (via `pdfify.rb`) `_site-pdf/` in one run. |
 | `docs/serve.bat` | `bundle exec jekyll serve` — watcher-friendly thanks to the exclude. |
-| `docs/check.bat` | Local link check (dev-side only; CI runs the two lychee passes directly). Three steps: lychee permissive on `_site/`, lychee strict on `_site-offline/`, and `scripts/check_offline_live_links.py` against `_site-offline/`. Exits non-zero on any failure. |
-| `scripts/check_offline_live_links.py` | Flags any `https://docs.twinbasic.com/` reference that survived offlinify in `_site-offline/` HTML, outside `<code>` / `<pre>` blocks. Skips the bare root (`https://docs.twinbasic.com[/]`) since intentional "go to the live site" links are allowed. Caught locally by `check.bat`; not wired into CI. |
+| `docs/check.bat` | Local link check (CI runs the equivalent three passes via the workflows, using lychee instead of `scripts/check_links.py` for the `_site/` pass). Three steps: `scripts/check_links.py` permissive on `_site/`, `scripts/check_links.py` strict on `_site-offline/`, and `scripts/check_offline_live_links.py` against `_site-offline/`. Exits non-zero on any failure. |
+| `scripts/check_offline_live_links.py` | Flags any `https://docs.twinbasic.com/` reference that survived offlinify in `_site-offline/` HTML, outside `<code>` / `<pre>` blocks. Skips the bare root (`https://docs.twinbasic.com[/]`) since intentional "go to the live site" links are allowed. Run by `check.bat` locally and by both CI workflows after the offline link check. |
 | `docs/.gitignore` | `_site`, `_site-offline`, and `_site-pdf` all excluded from git. |
-| `.github/workflows/jekyll-gh-pages.yml` | CI workflow. Builds, runs lychee against both trees, deploys to Pages, and (on manual dispatch) packages `_site-offline/` as a release artifact. |
+| `.github/workflows/jekyll-gh-pages.yml` | Deploy workflow (push to `staging`, manual dispatch). Builds, runs lychee against `_site/`, runs `scripts/check_links.py` against `_site-offline/`, runs `scripts/check_offline_live_links.py` against `_site-offline/`, deploys to Pages, and (on manual dispatch) packages `_site-offline/` as a release artifact. |
+| `.github/workflows/checks.yml` | PR-gating workflow (pull-request to `main`, manual dispatch). Same three link-check steps as the deploy workflow; no deploy or release. |
 
 ## CI integration
 
 `bundle exec jekyll build` in CI passes `--baseurl "${{ steps.pages.outputs.base_path }}"` from `actions/configure-pages`. For a Pages site with a custom domain (CNAME), base_path is empty. For a project page without a custom domain, it's `/repo-name`. Offlinify handles both cases — `normalize_baseurl` in `setup` produces the right prefix to strip.
 
-The workflow has two lychee steps after the build:
+The workflow has three link-check steps after the build:
 
-1. **Against `_site/`**, with `--fallback-extensions html` and a `--remap` that strips the base_path prefix. This mirrors what GitHub Pages does at request time — extensionless URLs like `/FAQ` get served as `/FAQ.html`. Without `--fallback-extensions html`, every pretty permalink would appear broken in this check.
+1. **Lychee against `_site/`**, with `--fallback-extensions html` and a `--remap` that strips the base_path prefix. This mirrors what GitHub Pages does at request time — extensionless URLs like `/FAQ` get served as `/FAQ.html`. Without `--fallback-extensions html`, every pretty permalink would appear broken in this check. Lychee (not `scripts/check_links.py`) handles the online tree because `--remap` isn't implemented in the Python checker; the offline tree below has all baseurl prefixes already stripped by offlinify and doesn't need it.
 
-2. **Against `_site-offline/`**, strict — no extension fallback (`--index-files 'index.html'` only; the online check also accepts the bare directory via `,.`). Every link must resolve to a real file as written. This catches relative links in markdown sources whose permalink shape doesn't match the rendered filename (e.g. `[Foo](Foo/)` when Jekyll wrote `Foo.html`, not `Foo/index.html`) — the kind of breakage the online check above hides behind both the fallback and the bare-directory acceptance.
+2. **`scripts/check_links.py` against `_site-offline/`**, strict — no extension fallback (`--index-files index.html` only; the online check also accepts the bare directory via `,.`). Every link must resolve to a real file as written. This catches relative links in markdown sources whose permalink shape doesn't match the rendered filename (e.g. `[Foo](Foo/)` when Jekyll wrote `Foo.html`, not `Foo/index.html`) — the kind of breakage the online check above hides behind both the fallback and the bare-directory acceptance. The Python checker is roughly 25× faster than lychee on this workload and a bit stricter (catches missing `
+
+
+
+

Fixture

+ +

Plain hrefs

+OK file +BROKEN: missing file + +

Fallback extension (only good-fallback.html exists)

+OK via fallback .html +BROKEN: no missing-fallback.html + +

Trailing slash forces directory resolution

+BROKEN: good.html is a file, not a dir +BROKEN: good-fallback resolves to .html via fallback, not via dir + +

Directory with index.html (both forms should be OK)

+OK dir with index +OK dir with index (no slash) + +

Directory without index.html, but `.` in --index-files

+OK: accepts dir itself +OK: same, via dir fallback + +

Missing dirs

+BROKEN: dir does not exist +BROKEN: not a file, not a dir, no fallback + +

Fragments

+OK: same-page fragment +BROKEN: same-page bad fragment +OK: cross-page known fragment +BROKEN: cross-page bad fragment +OK: dir-link with fragment +BROKEN: dir-link bad fragment + +

Relative and absolute paths

+OK relative subdir +OK absolute via root-dir +BROKEN absolute + +

External and skipped schemes

+SKIP: http +SKIP: mailto +SKIP: js +SKIP: tel + +

Images

+OK +BROKEN + +

Image srcset

+srcset mixed + + diff --git a/experiments/lcheck-fixture/site/subdir/image.png b/experiments/lcheck-fixture/site/subdir/image.png new file mode 100644 index 00000000..7838a96f --- /dev/null +++ b/experiments/lcheck-fixture/site/subdir/image.png @@ -0,0 +1 @@ +fake png \ No newline at end of file diff --git a/experiments/lcheck-fixture/site/subdir/nested.html b/experiments/lcheck-fixture/site/subdir/nested.html new file mode 100644 index 00000000..1c22b823 --- /dev/null +++ b/experiments/lcheck-fixture/site/subdir/nested.html @@ -0,0 +1,4 @@ + +nested +up and over +BROKEN: missing from subdir diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..4eb0354a --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +selectolax>=0.4 diff --git a/scripts/check_links.py b/scripts/check_links.py new file mode 100644 index 00000000..cea09f4f --- /dev/null +++ b/scripts/check_links.py @@ -0,0 +1,401 @@ +""" +Offline link checker for static sites. + +CLI mirrors the subset of lychee flags used by docs/check.bat, so that an +invocation like + + python scripts/check_links.py --offline --include-fragments + --fallback-extensions html --index-files "index.html,." + --root-dir docs/_site docs/_site + +produces the same correctness verdict as the equivalent lychee call (only +faster and a bit stricter -- see "Differences from lychee" below). + +Why this exists: lychee's offline pipeline funnels every link occurrence +through an async channel before its dedup cache short-circuits the work. +On this site (~733k occurrences, ~12k unique targets) that fixed-per- +occurrence overhead is ~50s on Windows. This script dedupes (target, frag) +up front, so the filesystem and fragment checks run once per unique target. + +Online (network) link checking is not implemented. --offline is therefore +required; the script exits non-zero if it is absent. 
+ +Differences from lychee (correctness): + * Trailing slash on a file-shaped URL ('foo.html/') is reported broken, + where lychee normalises and accepts. Catches authoring mistakes. + *