diff --git a/.github/workflows/blog-syndication.yml b/.github/workflows/blog-syndication.yml new file mode 100644 index 0000000000..011d23355f --- /dev/null +++ b/.github/workflows/blog-syndication.yml @@ -0,0 +1,109 @@ +name: Syndicate Blog Posts + +on: + schedule: + # Daily at 13:00 UTC. Runs from the default branch only, per GitHub's cron rules. + - cron: '0 13 * * *' + workflow_dispatch: + inputs: + dry_run: + description: 'Skip API calls and only print what would happen.' + type: boolean + default: false + +permissions: + contents: write + +concurrency: + group: blog-syndication + cancel-in-progress: false + +jobs: + syndicate: + runs-on: ubuntu-latest + steps: + - name: Check out repository + uses: actions/checkout@v4 + with: + # Token with write scope so the post-run commit can push the state file. + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Run API syndication script + id: syndicate_api + env: + DEVTO_API_KEY: ${{ secrets.DEVTO_API_KEY }} + HASHNODE_TOKEN: ${{ secrets.HASHNODE_TOKEN }} + HASHNODE_PUBLICATION_ID: ${{ secrets.HASHNODE_PUBLICATION_ID }} + run: | + set -euo pipefail + if [ "${{ inputs.dry_run }}" = "true" ]; then + python3 scripts/website/syndicate_blog_posts.py --dry-run + else + python3 scripts/website/syndicate_blog_posts.py + fi + + - name: Detect browser-syndication credentials + id: browser_creds + env: + FOOJAY_USER: ${{ secrets.FOOJAY_USER }} + HACKERNOON_USER: ${{ secrets.HACKERNOON_USER }} + DZONE_STORAGE_STATE: ${{ secrets.DZONE_STORAGE_STATE }} + MEDIUM_STORAGE_STATE: ${{ secrets.MEDIUM_STORAGE_STATE }} + run: | + if [ -n "${FOOJAY_USER}" ] || [ -n "${HACKERNOON_USER}" ] || [ -n "${DZONE_STORAGE_STATE}" ] || [ -n "${MEDIUM_STORAGE_STATE}" ]; then + echo "any_configured=true" >> "${GITHUB_OUTPUT}" + else + echo "any_configured=false" >> "${GITHUB_OUTPUT}" + fi + + - name: Install Playwright dependencies + if: ${{ 
steps.browser_creds.outputs.any_configured == 'true' }} + run: | + set -euo pipefail + pip install playwright markdown + playwright install --with-deps chromium + + - name: Run browser syndication script + if: ${{ steps.browser_creds.outputs.any_configured == 'true' }} + env: + FOOJAY_USER: ${{ secrets.FOOJAY_USER }} + FOOJAY_PASSWORD: ${{ secrets.FOOJAY_PASSWORD }} + HACKERNOON_USER: ${{ secrets.HACKERNOON_USER }} + HACKERNOON_PASSWORD: ${{ secrets.HACKERNOON_PASSWORD }} + DZONE_STORAGE_STATE: ${{ secrets.DZONE_STORAGE_STATE }} + MEDIUM_STORAGE_STATE: ${{ secrets.MEDIUM_STORAGE_STATE }} + run: | + set -euo pipefail + if [ "${{ inputs.dry_run }}" = "true" ]; then + python3 scripts/website/syndicate_browser_posts.py --dry-run + else + python3 scripts/website/syndicate_browser_posts.py + fi + + - name: Upload syndication screenshots on failure + if: ${{ always() && hashFiles('docs/website/reports/syndication-screenshots/**/*.png') != '' }} + uses: actions/upload-artifact@v4 + with: + name: syndication-screenshots + path: docs/website/reports/syndication-screenshots/ + if-no-files-found: ignore + retention-days: 14 + + - name: Commit updated syndication state + if: ${{ inputs.dry_run != true }} + run: | + set -euo pipefail + if git diff --quiet -- scripts/website/syndication-state.json; then + echo "No state changes to commit." 
+ exit 0 + fi + git config user.name 'github-actions[bot]' + git config user.email 'github-actions[bot]@users.noreply.github.com' + git add scripts/website/syndication-state.json + git commit -m "ci: record blog syndication results" + git push diff --git a/.gitignore b/.gitignore index 82e5eba372..89ef591e4d 100644 --- a/.gitignore +++ b/.gitignore @@ -30,6 +30,8 @@ **/dist/* *.zip CodenameOneDesigner/src/version.properties +*-storage-state.json +*-storage-state.*.json /Ports/iOSPort/build/ /Ports/iOSPort/dist/ Ports/iOSPort/nbproject/private/private.xml diff --git a/docs/website/reports/syndication-screenshots/.gitignore b/docs/website/reports/syndication-screenshots/.gitignore new file mode 100644 index 0000000000..5b502f9bd1 --- /dev/null +++ b/docs/website/reports/syndication-screenshots/.gitignore @@ -0,0 +1,2 @@ +*.png +!.gitignore diff --git a/scripts/website/export_storage_state.py b/scripts/website/export_storage_state.py new file mode 100755 index 0000000000..9f0500dce4 --- /dev/null +++ b/scripts/website/export_storage_state.py @@ -0,0 +1,290 @@ +#!/usr/bin/env python3 +"""Export a logged-in browser session for syndication targets that block +password-based automation (Medium has no password login at all; DZone +gates its login form behind invisible reCAPTCHA). + +Two paths: + + --from-firefox-profile read cookies straight from your existing Firefox + profile's cookies.sqlite (no second login) + --browser {chrome,...} launch Playwright with the chosen browser, open + the site's signin page, poll for auth cookies + +Output is a Playwright storageState JSON written to disk and (unless +--no-base64) a base64 blob ready to paste as the {SITE}_STORAGE_STATE +repo secret consumed by syndicate_browser_posts.py. 
+ +Examples: + + python3 scripts/website/export_storage_state.py --site medium --from-firefox-profile + python3 scripts/website/export_storage_state.py --site dzone --browser firefox +""" + +from __future__ import annotations + +import argparse +import base64 +import glob +import json +import shutil +import sqlite3 +import sys +import tempfile +import time +from pathlib import Path + + +DEFAULT_OUTPUT = Path("medium-storage-state.json") +DEFAULT_TIMEOUT_SECONDS = 600 # 10 minutes for the user to complete login + +# Per-target site profile. Each entry knows where to land in a launched browser, +# which cookie domain to filter from a Firefox profile, and how to recognize +# a logged-in session (a function over the captured cookie list). +SITE_PROFILES: dict[str, dict] = { + "medium": { + "signin_url": "https://medium.com/m/signin", + "cookie_host_glob": "%medium.com", + # Medium assigns every visitor a `uid` cookie. Anonymous visitors get a + # value prefixed with `lo_`; a signed-in user gets one without it. + "is_logged_in": lambda cookies: any( + c.get("name") == "uid" and not (c.get("value") or "").startswith("lo_") + for c in cookies + ), + }, + "dzone": { + "signin_url": "https://dzone.com/users/login.html", + "cookie_host_glob": "%dzone.com", + # DZone uses Spring Security's `remember-me` cookie for long-lived auth + # plus a per-session `dz` cookie. Either one signals a logged-in + # session. 
+ "is_logged_in": lambda cookies: any( + c.get("name") == "remember-me" or (c.get("name") or "").startswith("dz") + and (c.get("name") or "") not in ("dzuuid",) # dzuuid is anonymous + for c in cookies + ), + }, +} + + +def parse_args(argv: list[str]) -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--site", choices=sorted(SITE_PROFILES), default="medium", + help="Which target site to capture a session for (default: medium).") + parser.add_argument("--output", default=None, + help="Path to write the storage state JSON (default: -storage-state.json)") + parser.add_argument("--no-base64", action="store_true", + help="Skip printing the base64 blob (just write the JSON file).") + parser.add_argument("--timeout", type=int, default=DEFAULT_TIMEOUT_SECONDS, + help="Maximum seconds to wait for login completion (default: 600).") + parser.add_argument("--interactive", action="store_true", + help="Wait for Enter on stdin instead of polling for auth cookies.") + parser.add_argument("--browser", default="chrome", choices=["chrome", "chromium", "firefox", "msedge"], + help="Which Playwright browser to launch (default: chrome).") + parser.add_argument("--from-firefox-profile", nargs="?", const="auto", default=None, + help=("Skip launching a browser and instead read medium.com cookies from an " + "existing Firefox profile's cookies.sqlite. Pass a path or omit for auto-detect.")) + return parser.parse_args(argv) + + +def _locate_firefox_profile(explicit: str | None) -> Path: + if explicit and explicit != "auto": + path = Path(explicit).expanduser() + if path.is_file(): + return path + if path.is_dir(): + candidate = path / "cookies.sqlite" + if candidate.is_file(): + return candidate + raise RuntimeError(f"Firefox cookies.sqlite not found at {path}") + # Auto-detect macOS Firefox profile. + base = Path.home() / "Library" / "Application Support" / "Firefox" / "Profiles" + if not base.exists(): + # Linux / other-OS fallbacks. 
+ for guess in (Path.home() / ".mozilla" / "firefox", Path.home() / "snap" / "firefox" / "common" / ".mozilla" / "firefox"): + if guess.exists(): + base = guess + break + if not base.exists(): + raise RuntimeError("Could not locate a Firefox profiles directory.") + candidates = sorted(glob.glob(str(base / "*default*" / "cookies.sqlite"))) or sorted( + glob.glob(str(base / "*" / "cookies.sqlite")) + ) + if not candidates: + raise RuntimeError(f"No cookies.sqlite found under {base}") + # Prefer the most recently modified profile. + return Path(max(candidates, key=lambda p: Path(p).stat().st_mtime)) + + +def _firefox_storage_state(cookies_db: Path, host_glob: str) -> dict: + # Copy to a temp file because Firefox holds a write lock on the live DB. + with tempfile.NamedTemporaryFile(suffix=".sqlite", delete=False) as tmp: + tmp_path = Path(tmp.name) + shutil.copy2(cookies_db, tmp_path) + try: + conn = sqlite3.connect(f"file:{tmp_path}?mode=ro", uri=True) + cur = conn.execute( + "SELECT name, value, host, path, expiry, isSecure, isHttpOnly, sameSite " + "FROM moz_cookies WHERE host LIKE ?", + (host_glob,), + ) + rows = cur.fetchall() + conn.close() + finally: + tmp_path.unlink(missing_ok=True) + samesite_map = {0: "None", 1: "Lax", 2: "Strict"} + cookies = [] + for name, value, host, path, expiry, is_secure, is_http_only, same_site in rows: + cookies.append({ + "name": name, + "value": value, + "domain": host if host.startswith(".") else "." + host, + "path": path or "/", + "expires": _normalize_expiry(expiry), + "httpOnly": bool(is_http_only), + "secure": bool(is_secure), + "sameSite": samesite_map.get(int(same_site or 0), "None"), + }) + return {"cookies": cookies, "origins": []} + + +def _normalize_expiry(raw: float | int | None) -> float: + """Coerce a Firefox cookies.sqlite expiry into a Playwright-acceptable value. + + Playwright wants seconds-since-epoch (positive number) or -1 for session. 
+ Firefox stores `expiry` in seconds in older code but in milliseconds in + newer entries. Anything past ~year 5138 must be milliseconds — divide. + """ + if not raw: + return -1.0 + value = float(raw) + if value > 1e11: # > ~year 5138 in seconds; treat as milliseconds. + value = value / 1000.0 + return value + + +def main(argv: list[str]) -> int: + args = parse_args(argv) + profile = SITE_PROFILES[args.site] + output_path = Path(args.output or f"{args.site}-storage-state.json").resolve() + secret_name = f"{args.site.upper()}_STORAGE_STATE" + + if args.from_firefox_profile is not None: + try: + cookies_db = _locate_firefox_profile(args.from_firefox_profile) + except RuntimeError as err: + print(f"Error: {err}", file=sys.stderr) + return 1 + print(f"Reading {args.site} cookies from Firefox profile: {cookies_db}") + state = _firefox_storage_state(cookies_db, profile["cookie_host_glob"]) + if not profile["is_logged_in"](state["cookies"]): + print(f"Error: this Firefox profile does not appear to be logged in to {args.site}.", + file=sys.stderr) + return 1 + output_path.write_text(json.dumps(state), encoding="utf-8") + print(f"Wrote storage state: {output_path}") + print(f" cookies captured: {len(state['cookies'])}") + if not args.no_base64: + encoded = base64.b64encode(output_path.read_bytes()).decode("ascii") + print() + print(f"Paste the following as the {secret_name} repository secret:") + print("-" * 72) + print(encoded) + print("-" * 72) + return 0 + + try: + from playwright.sync_api import sync_playwright + except ImportError: + print("Playwright is not installed. In a venv, run: pip install playwright && playwright install chromium", + file=sys.stderr) + return 1 + + with sync_playwright() as pw: + launch_kwargs: dict = {"headless": False} + # The args namespace renamed channel to browser to allow Firefox. 
+ browser_choice = args.browser + if browser_choice == "firefox": + try: + browser = pw.firefox.launch(headless=False) + except Exception as err: # noqa: BLE001 + print(f"Could not launch Playwright Firefox ({err}). " + "Run `playwright install firefox` and retry.", file=sys.stderr) + return 1 + else: + if browser_choice and browser_choice != "chromium": + launch_kwargs["channel"] = browser_choice + try: + browser = pw.chromium.launch(**launch_kwargs) + except Exception as err: # noqa: BLE001 — channel may not be installed + print(f"Could not launch with browser='{browser_choice}' ({err}); falling back to bundled Chromium.", + file=sys.stderr) + browser = pw.chromium.launch(headless=False) + + context = browser.new_context( + viewport={"width": 1280, "height": 900}, + user_agent=( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_0) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36" + ), + ) + page = context.new_page() + page.goto(profile["signin_url"]) + + print() + print("=" * 72) + print(f"A browser window has opened on {args.site}'s sign-in page.") + print("Log in (Google / email / whatever you normally use).") + if args.interactive: + print("When you can see your Medium home or profile, return here and press Enter.") + else: + print(f"The script will detect login automatically (waiting up to {args.timeout}s).") + print("=" * 72) + sys.stdout.flush() + + if args.interactive: + try: + input("Press Enter once you are logged in… ") + except (KeyboardInterrupt, EOFError): + print("Aborted.", file=sys.stderr) + browser.close() + return 1 + else: + deadline = time.time() + args.timeout + detected = False + while time.time() < deadline: + if profile["is_logged_in"](context.cookies(profile["signin_url"])): + detected = True + break + time.sleep(3) + if not detected: + print(f"Timed out waiting for {args.site} login — auth cookies not detected.", + file=sys.stderr) + browser.close() + return 1 + print("Logged-in cookies detected — capturing session 
state…") + # Give the site a couple seconds to finish setting localStorage. + time.sleep(3) + + state = context.storage_state() + output_path.write_text(json.dumps(state), encoding="utf-8") + browser.close() + + print() + print(f"Wrote storage state: {output_path}") + print(f" cookies captured: {len(state.get('cookies', []))}") + print(f" origins with localStorage: {len(state.get('origins', []))}") + + if args.no_base64: + return 0 + + encoded = base64.b64encode(output_path.read_bytes()).decode("ascii") + print() + print(f"Paste the following as the {secret_name} repository secret:") + print("-" * 72) + print(encoded) + print("-" * 72) + return 0 + + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) diff --git a/scripts/website/syndicate_blog_posts.py b/scripts/website/syndicate_blog_posts.py new file mode 100755 index 0000000000..f20acb7008 --- /dev/null +++ b/scripts/website/syndicate_blog_posts.py @@ -0,0 +1,516 @@ +#!/usr/bin/env python3 +"""Syndicate Codename One Hugo blog posts to dev.to and Hashnode. + +Selects the oldest blog post under ``docs/website/content/blog`` that: + +* has a ``date`` strictly after the eligibility floor (default: 2026-04-30), +* is at least ``--min-age-days`` old (default: 7), +* has not yet been syndicated to a given target platform. + +For each unsyndicated platform on the chosen post the script POSTs the +content with ``canonical_url`` pointing back at the original on +``www.codenameone.com`` and records the resulting URL / id in +``scripts/website/syndication-state.json``. + +Designed to run from a daily GitHub Action with only the Python standard +library available. 
+""" + +from __future__ import annotations + +import argparse +import datetime as dt +import json +import os +import re +import sys +import urllib.error +import urllib.request +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + + +REPO_ROOT = Path(__file__).resolve().parents[2] +BLOG_DIR = REPO_ROOT / "docs" / "website" / "content" / "blog" +STATE_FILE = REPO_ROOT / "scripts" / "website" / "syndication-state.json" +SITE_BASE_URL = "https://www.codenameone.com" + +ELIGIBILITY_FLOOR = dt.date(2026, 4, 30) # posts must be strictly newer than this +MIN_AGE_DAYS = 7 + +CN1_BLURB = ( + '' +) + +# Hugo-only tail blocks that should not be syndicated. +_HUGO_FOOTER_RE = re.compile(r"\n\s*---\s*\n+##\s*Discussion\b.*\Z", re.DOTALL | re.IGNORECASE) +_HUGO_SHORTCODE_RE = re.compile(r"\{\{<[^>]*>\}\}|\{\{%[^%]*%\}\}") + +DEVTO_TAGS = ["java", "mobile", "android", "ios"] +HASHNODE_TAGS = [ + {"slug": "java", "name": "Java"}, + {"slug": "mobile", "name": "Mobile"}, + {"slug": "android", "name": "Android"}, + {"slug": "ios", "name": "iOS"}, +] + +DEFAULT_PLATFORMS = "devto,hashnode" + + +@dataclass +class Post: + path: Path + slug: str + title: str + date: dt.date + front_matter: dict[str, Any] + body: str + + @property + def canonical_url(self) -> str: + url_field = self.front_matter.get("url") + if isinstance(url_field, str) and url_field.startswith("/"): + return f"{SITE_BASE_URL}{url_field}" + return f"{SITE_BASE_URL}/blog/{self.slug}/" + + @property + def cover_image(self) -> str | None: + match = re.search(r"!\[[^\]]*\]\((/blog/[^)\s]+)\)", self.body) + if match: + return f"{SITE_BASE_URL}{match.group(1)}" + return None + + +@dataclass +class State: + raw: dict[str, Any] = field(default_factory=dict) + + @classmethod + def load(cls, path: Path) -> "State": + if not path.exists(): + return cls(raw={"posts": {}}) + with path.open("r", encoding="utf-8") as handle: + data = json.load(handle) + if "posts" not in data or not 
isinstance(data["posts"], dict): + data["posts"] = {} + return cls(raw=data) + + def save(self, path: Path) -> None: + ordered = {key: self.raw[key] for key in ("_comment", "posts") if key in self.raw} + for key, value in self.raw.items(): + if key not in ordered: + ordered[key] = value + with path.open("w", encoding="utf-8") as handle: + json.dump(ordered, handle, indent=2, sort_keys=False) + handle.write("\n") + + def is_syndicated(self, slug: str, platform: str) -> bool: + post = self.raw["posts"].get(slug) + if not post: + return False + entry = post.get(platform) + return bool(entry and entry.get("url")) + + def record(self, slug: str, platform: str, payload: dict[str, Any]) -> None: + post = self.raw["posts"].setdefault(slug, {}) + post[platform] = payload + + +def parse_front_matter(text: str) -> tuple[dict[str, Any], str]: + """Parse the small subset of YAML front matter the blog uses. + + The site's posts use simple ``key: value`` pairs (no nesting, no lists), + so a hand-rolled parser keeps this script dependency-free. 
+ """ + if not text.startswith("---\n"): + raise ValueError("missing front matter") + end = text.find("\n---\n", 4) + if end == -1: + raise ValueError("unterminated front matter") + block = text[4:end] + body = text[end + len("\n---\n") :] + + fm: dict[str, Any] = {} + current_key: str | None = None + current_lines: list[str] | None = None + + for raw_line in block.splitlines(): + if current_key is not None and (raw_line.startswith(" ") or raw_line.startswith("\t") or raw_line == ""): + current_lines.append(raw_line) + continue + if current_lines is not None and current_key is not None: + fm[current_key] = _coerce_scalar("\n".join(current_lines).strip()) + current_key = None + current_lines = None + + match = re.match(r"^([A-Za-z0-9_]+):\s*(.*)$", raw_line) + if not match: + continue + key, value = match.group(1), match.group(2) + if value == "": + current_key = key + current_lines = [] + else: + fm[key] = _coerce_scalar(value) + + if current_lines is not None and current_key is not None: + fm[current_key] = _coerce_scalar("\n".join(current_lines).strip()) + + return fm, body + + +def _coerce_scalar(value: str) -> Any: + stripped = value.strip() + if len(stripped) >= 2 and stripped[0] == stripped[-1] and stripped[0] in ("'", '"'): + inner = stripped[1:-1] + if stripped[0] == "'": + inner = inner.replace("''", "'") + return inner + if stripped.lower() in ("true", "false"): + return stripped.lower() == "true" + return stripped + + +def parse_post(path: Path) -> Post | None: + text = path.read_text(encoding="utf-8") + try: + fm, body = parse_front_matter(text) + except ValueError: + return None + date_value = fm.get("date") + if not isinstance(date_value, str): + return None + try: + date = dt.date.fromisoformat(date_value[:10]) + except ValueError: + return None + slug = fm.get("slug") or path.stem + title = fm.get("title") or slug + return Post(path=path, slug=slug, title=str(title), date=date, front_matter=fm, body=body) + + +def discover_posts(blog_dir: Path) -> 
list[Post]: + posts: list[Post] = [] + for path in sorted(blog_dir.glob("*.md")): + if path.name.startswith("_"): + continue + post = parse_post(path) + if post is not None: + posts.append(post) + posts.sort(key=lambda p: p.date) + return posts + + +def select_candidate( + posts: list[Post], + state: State, + platforms: list[str], + today: dt.date, + floor: dt.date, + min_age_days: int, +) -> Post | None: + cutoff = today - dt.timedelta(days=min_age_days) + for post in posts: + if post.date <= floor: + continue + if post.date > cutoff: + continue + if all(state.is_syndicated(post.slug, p) for p in platforms): + continue + return post + return None + + +_RELATIVE_LINK_RE = re.compile(r"(\]\()(/[^)\s]+)(\))") +_RELATIVE_IMG_RE = re.compile(r'(<img[^>]*src=["\'])(/[^"\']+)(["\'])', re.IGNORECASE) + + +def absolutize_links(body: str) -> str: + body = _RELATIVE_LINK_RE.sub(lambda m: f"{m.group(1)}{SITE_BASE_URL}{m.group(2)}{m.group(3)}", body) + body = _RELATIVE_IMG_RE.sub(lambda m: f"{m.group(1)}{SITE_BASE_URL}{m.group(2)}{m.group(3)}", body) + return body + + +def insert_blurb(body: str, blurb: str) -> str: + """Insert ``blurb`` after the first non-image paragraph (i.e.
after the fold).""" + lines = body.split("\n") + n = len(lines) + i = 0 + # skip leading blank lines + while i < n and lines[i].strip() == "": + i += 1 + # skip a leading header image (a paragraph that is just a markdown image) + if i < n and re.match(r"^!\[[^\]]*\]\([^)]+\)\s*$", lines[i].strip()): + i += 1 + while i < n and lines[i].strip() == "": + i += 1 + # skip the first paragraph of body text + while i < n and lines[i].strip() != "": + i += 1 + # i now points at the blank line (or EOF) following the first text paragraph + insertion = ["", blurb, ""] + return "\n".join(lines[:i] + insertion + lines[i:]) + + +def render_syndicated_body(post: Post) -> str: + body = post.body.strip("\n") + body = _HUGO_FOOTER_RE.sub("", body) + body = _HUGO_SHORTCODE_RE.sub("", body).rstrip() + body = absolutize_links(body) + body = insert_blurb(body, CN1_BLURB) + return body + + +USER_AGENT = "CodenameOneBlogSyndicator/1.0 (+https://github.com/codenameone/CodenameOne)" + + +def http_post_json(url: str, headers: dict[str, str], payload: dict[str, Any]) -> dict[str, Any]: + data = json.dumps(payload).encode("utf-8") + request = urllib.request.Request(url, data=data, method="POST") + request.add_header("Content-Type", "application/json") + request.add_header("User-Agent", USER_AGENT) + request.add_header("Accept", "application/json") + for key, value in headers.items(): + request.add_header(key, value) + try: + with urllib.request.urlopen(request, timeout=60) as response: + body = response.read().decode("utf-8") + except urllib.error.HTTPError as err: + detail = err.read().decode("utf-8", errors="replace") + raise RuntimeError(f"{url} returned HTTP {err.code}: {detail}") from err + if not body: + return {} + return json.loads(body) + + +def publish_to_devto(post: Post, body_markdown: str, api_key: str, draft: bool = False) -> dict[str, Any]: + payload: dict[str, Any] = { + "article": { + "title": post.title, + "body_markdown": body_markdown, + "published": not draft, + 
"canonical_url": post.canonical_url, + "tags": DEVTO_TAGS, + "description": str(post.front_matter.get("description") or "")[:250] or None, + } + } + cover = post.cover_image + if cover: + payload["article"]["main_image"] = cover + payload["article"] = {k: v for k, v in payload["article"].items() if v is not None} + + response = http_post_json( + "https://dev.to/api/articles", + headers={"api-key": api_key, "Accept": "application/vnd.forem.api-v1+json"}, + payload=payload, + ) + article_id = response.get("id") + # The URL field on dev.to returns the public canonical URL of the article, + # but for unpublished drafts that URL 404s for anyone who is not the author. + # In draft mode point users at the dashboard, where the draft is editable. + if draft and article_id: + url = f"https://dev.to/dashboard/{article_id}/edit" + else: + url = response.get("url") or response.get("canonical_url") + return { + "id": article_id, + "url": url, + "syndicated_at": dt.datetime.now(dt.timezone.utc).isoformat(timespec="seconds"), + } + + +def publish_to_hashnode(post: Post, body_markdown: str, token: str, publication_id: str, + draft: bool = False) -> dict[str, Any]: + if draft: + mutation = """ + mutation CreateDraft($input: CreateDraftInput!) { + createDraft(input: $input) { + draft { id slug } + } + } + """.strip() + else: + mutation = """ + mutation PublishPost($input: PublishPostInput!) 
{ + publishPost(input: $input) { + post { id slug url } + } + } + """.strip() + + input_obj: dict[str, Any] = { + "title": post.title, + "contentMarkdown": body_markdown, + "publicationId": publication_id, + "tags": HASHNODE_TAGS, + "originalArticleURL": post.canonical_url, + } + cover = post.cover_image + if cover: + input_obj["coverImageOptions"] = {"coverImageURL": cover} + subtitle = str(post.front_matter.get("description") or "").strip() + if subtitle: + input_obj["subtitle"] = subtitle[:250] + + response = http_post_json( + "https://gql.hashnode.com", + headers={"Authorization": token}, + payload={"query": mutation, "variables": {"input": input_obj}}, + ) + if response.get("errors"): + raise RuntimeError(f"hashnode GraphQL errors: {response['errors']}") + data = response.get("data") or {} + if draft: + node = data.get("createDraft", {}).get("draft", {}) + slug = node.get("slug") + url = f"https://hashnode.com/draft/{node.get('id')}" if node.get("id") else None + else: + node = data.get("publishPost", {}).get("post", {}) + url = node.get("url") + return { + "id": node.get("id"), + "url": url, + "syndicated_at": dt.datetime.now(dt.timezone.utc).isoformat(timespec="seconds"), + } + + +def parse_args(argv: list[str]) -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--dry-run", action="store_true", help="Do not call any APIs; print what would happen.") + parser.add_argument("--draft-mode", action="store_true", + help="Create as draft (dev.to: published=false; Hashnode: createDraft) instead of publishing. Useful for verifying formatting without going live.") + parser.add_argument( + "--platforms", + default=DEFAULT_PLATFORMS, + help=f"Comma-separated subset of platforms to consider (default: {DEFAULT_PLATFORMS}).", + ) + parser.add_argument( + "--today", + default=None, + help="Override today's date (YYYY-MM-DD). 
Useful for testing.", + ) + parser.add_argument( + "--floor", + default=ELIGIBILITY_FLOOR.isoformat(), + help=f"Posts must be dated strictly after this date (default: {ELIGIBILITY_FLOOR.isoformat()}).", + ) + parser.add_argument( + "--min-age-days", + type=int, + default=MIN_AGE_DAYS, + help=f"Minimum post age in days before syndicating (default: {MIN_AGE_DAYS}).", + ) + parser.add_argument( + "--blog-dir", + default=str(BLOG_DIR), + help="Directory containing Hugo blog posts.", + ) + parser.add_argument( + "--state-file", + default=str(STATE_FILE), + help="Path to syndication state JSON.", + ) + return parser.parse_args(argv) + + +def is_platform_configured(platform: str) -> bool: + if platform == "devto": + return bool(os.environ.get("DEVTO_API_KEY")) + if platform == "hashnode": + return bool(os.environ.get("HASHNODE_TOKEN") and os.environ.get("HASHNODE_PUBLICATION_ID")) + return False + + +def main(argv: list[str]) -> int: + args = parse_args(argv) + today = dt.date.fromisoformat(args.today) if args.today else dt.date.today() + floor = dt.date.fromisoformat(args.floor) + requested_platforms = [p.strip() for p in args.platforms.split(",") if p.strip()] + blog_dir = Path(args.blog_dir) + state_file = Path(args.state_file) + + if args.dry_run: + platforms = requested_platforms + else: + platforms = [] + for platform in requested_platforms: + if is_platform_configured(platform): + platforms.append(platform) + else: + # Skipping an unconfigured platform here (instead of failing) keeps + # the candidate selector from getting stuck on a post that can never + # be fully syndicated. Once the missing creds appear, the next run + # picks up where this one left off. 
+ print(f"[{platform}] credentials not configured; skipping platform.") + + if not platforms: + print("No platforms are configured; nothing to do.") + return 0 + + posts = discover_posts(blog_dir) + state = State.load(state_file) + candidate = select_candidate(posts, state, platforms, today, floor, args.min_age_days) + if candidate is None: + print("No syndication candidate found today.") + return 0 + + print(f"Selected post: {candidate.slug} (date={candidate.date.isoformat()})") + body_markdown = render_syndicated_body(candidate) + + any_change = False + failures: list[str] = [] + + for platform in platforms: + if state.is_syndicated(candidate.slug, platform): + print(f" [{platform}] already syndicated; skipping.") + continue + if args.dry_run: + print(f" [{platform}] dry run — would publish {len(body_markdown)} chars, canonical {candidate.canonical_url}") + continue + try: + if platform == "devto": + result = publish_to_devto( + candidate, body_markdown, os.environ["DEVTO_API_KEY"], + draft=args.draft_mode, + ) + elif platform == "hashnode": + result = publish_to_hashnode( + candidate, + body_markdown, + os.environ["HASHNODE_TOKEN"], + os.environ["HASHNODE_PUBLICATION_ID"], + draft=args.draft_mode, + ) + else: + raise RuntimeError(f"unknown platform: {platform}") + except Exception as err: # noqa: BLE001 — surface any failure as per-platform + print(f" [{platform}] FAILED: {err}", file=sys.stderr) + failures.append(platform) + continue + + if not result.get("url"): + print(f" [{platform}] response missing URL: {result}", file=sys.stderr) + failures.append(platform) + continue + + state.record(candidate.slug, platform, result) + any_change = True + print(f" [{platform}] published: {result['url']}") + + if any_change: + state.save(state_file) + print(f"Updated state file: {state_file}") + + if failures: + return 2 + return 0 + + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) diff --git a/scripts/website/syndicate_browser_posts.py 
b/scripts/website/syndicate_browser_posts.py new file mode 100755 index 0000000000..a333b152ae --- /dev/null +++ b/scripts/website/syndicate_browser_posts.py @@ -0,0 +1,948 @@ +#!/usr/bin/env python3 +"""Syndicate Codename One Hugo blog posts to sites that have no usable API. + +Counterpart to ``syndicate_blog_posts.py``: instead of POSTing to a REST/ +GraphQL endpoint, this script drives a real (headless) browser via Playwright +and submits the post through the site's normal authoring UI as a draft for +editorial review. Shares ``Post`` discovery, body rendering, and the +``syndication-state.json`` state file with the API-based script. + +Adapters (one class per target site) live at the bottom of this file. Each +adapter exposes a ``login()`` and a ``submit_draft()`` step. Selectors are +kept as constants at the top of each adapter so they are easy to update when +the site changes its UI — which it will, so plan on it. + +Usage: + + # First-time setup, watch the browser, take screenshots of the editor: + python3 scripts/website/syndicate_browser_posts.py \ + --platforms foojay --validate-only --headed --today 2026-05-08 + + # Real syndication (headless, daily-cron style): + python3 scripts/website/syndicate_browser_posts.py --platforms foojay,hackernoon + +Required env vars per platform (script auto-skips a platform when its creds +are missing, just like the API script): + + foojay : FOOJAY_USER, FOOJAY_PASSWORD + hackernoon : HACKERNOON_USER, HACKERNOON_PASSWORD + dzone : DZONE_USER, DZONE_PASSWORD + medium : MEDIUM_STORAGE_STATE (base64-encoded Playwright storageState + JSON exported from a logged-in session + — Medium has no password login flow) +""" + +from __future__ import annotations + +import argparse +import base64 +import datetime as dt +import json +import os +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Callable + +# Reuse the API-based script's discovery, body rendering, and state machinery. 
+sys.path.insert(0, str(Path(__file__).resolve().parent)) +from syndicate_blog_posts import ( # noqa: E402 (intentional path injection) + BLOG_DIR, + ELIGIBILITY_FLOOR, + MIN_AGE_DAYS, + Post, + STATE_FILE, + State, + discover_posts, + render_syndicated_body, + select_candidate, +) + + +SCREENSHOT_DIR = Path(__file__).resolve().parents[2] / "docs" / "website" / "reports" / "syndication-screenshots" +DEFAULT_PLATFORMS = "foojay,hackernoon,dzone,medium" + +_UA_STR = ( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_0) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36" +) + + +@dataclass +class AdapterContext: + post: Post + body_markdown: str + headed: bool + validate_only: bool + + +# --------------------------------------------------------------------------- # +# Adapters # +# --------------------------------------------------------------------------- # + + +class AdapterError(RuntimeError): + """Raised when an adapter cannot complete its flow.""" + + +def _find_first(page, selectors: list[str], *, timeout: int = 15000): + """Try each selector in turn; return the first that becomes visible. + + Adapters list multiple plausible selectors per field so a small UI tweak + on the target site does not break the run. The first match wins. + """ + last_error: Exception | None = None + for selector in selectors: + try: + handle = page.wait_for_selector(selector, timeout=timeout, state="visible") + if handle: + return handle + except Exception as err: # noqa: BLE001 — Playwright TimeoutError, etc. 
+            last_error = err + continue + raise AdapterError(f"none of the selectors matched: {selectors}: {last_error}") + + + def _escape_html(text: str) -> str: + return (text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")) + + + def _download_to_temp(url: str) -> Path: + """Download a remote file to a tempfile and return the local path.""" + import tempfile + import urllib.request as _ur + req = _ur.Request(url, headers={"User-Agent": _UA_STR}) + with _ur.urlopen(req, timeout=120) as resp: + data = resp.read() + suffix = Path(url.split("?", 1)[0]).suffix or ".jpg" + fd, name = tempfile.mkstemp(suffix=suffix) + with os.fdopen(fd, "wb") as out: + out.write(data) + return Path(name) + + + def _markdown_to_html(text: str) -> str: + """Render a Hugo-flavoured markdown post to HTML for paste-into-editor. + + Falls back to ``<pre>``-wrapped escaped text if python-markdown is unavailable.
+    """
+    try:
+        import markdown as _md
+    except ImportError:
+        return f"<pre>{_escape_html(text)}</pre>"
+    return _md.markdown(text, extensions=["extra", "fenced_code", "sane_lists"], output_format="html5") + + + def _trim_for_meta_description(text: str, limit: int = 140) -> str: + """Trim a description to Yoast's preferred meta-description length, on a word boundary.""" + text = (text or "").strip() + if len(text) <= limit: + return text + truncated = text[:limit].rsplit(" ", 1)[0].rstrip(",.;:") + return truncated + "…" + + + def _load_base64_storage_state(env_var: str) -> Path: + """Decode a base64-encoded storage_state JSON from an env var to a temp file.""" + encoded = os.environ[env_var] + decoded = base64.b64decode(encoded) + path = Path(f"/tmp/{env_var.lower()}.json") + path.write_bytes(decoded) + return path + + + def _save_screenshot(page, slug: str, label: str) -> Path: + SCREENSHOT_DIR.mkdir(parents=True, exist_ok=True) + stamp = dt.datetime.now(dt.timezone.utc).strftime("%Y%m%dT%H%M%SZ") + path = SCREENSHOT_DIR / f"{slug}-{label}-{stamp}.png" + try: + page.screenshot(path=str(path), full_page=True) + except Exception: # noqa: BLE001 — never let a screenshot failure mask the real error + return path + return path + + + class FoojayAdapter: + """foojay.io — Playwright login + REST API draft creation. + + Pure UI submission to foojay does not work reliably: Cloudflare in front + of foojay challenges form POSTs to /wp-admin/post.php and drops the + form payload during the challenge, so the draft is never created. The + REST API is not subject to the same challenge, but Wordfence has + Application Passwords disabled, so token auth is also out. + + The working hybrid: drive wp-login.php with Playwright to obtain a real + user session (cookies), pull the WP REST nonce from /wp-admin/, then + POST the draft through /wp-json/wp/v2/posts with cookie + X-WP-Nonce + auth. Behaves "as a website user" end-to-end while sidestepping both + the app-password block and the Cloudflare POST challenge.
+ """ + + name = "foojay" + LOGIN_URL = "https://foojay.io/wp-login.php" + BASE_URL = "https://foojay.io" + REST_POSTS_ENDPOINT = "https://foojay.io/wp-json/wp/v2/posts" + REST_TAGS_ENDPOINT = "https://foojay.io/wp-json/wp/v2/tags" + REST_MEDIA_ENDPOINT = "https://foojay.io/wp-json/wp/v2/media" + XMLRPC_ENDPOINT = "https://foojay.io/xmlrpc.php" + + # Pre-resolved category and tag IDs (from /wp-json/wp/v2/categories?search=java + # and /wp-json/wp/v2/tags?slug=codenameone). The tag is created lazily on + # first use if it does not yet exist. + JAVA_CATEGORY_ID = 1722 + CODENAMEONE_TAG_SLUG = "codenameone" + CODENAMEONE_TAG_NAME = "Codename One" + + USER_SELECTORS = ["#user_login"] + PASSWORD_SELECTORS = ["#user_pass"] + SUBMIT_SELECTORS = ["#wp-submit"] + + @staticmethod + def is_configured() -> bool: + return bool(os.environ.get("FOOJAY_USER") and os.environ.get("FOOJAY_PASSWORD")) + + def login(self, page) -> None: + page.goto(self.LOGIN_URL, wait_until="domcontentloaded") + _find_first(page, self.USER_SELECTORS).fill(os.environ["FOOJAY_USER"]) + _find_first(page, self.PASSWORD_SELECTORS).fill(os.environ["FOOJAY_PASSWORD"]) + _find_first(page, self.SUBMIT_SELECTORS).click() + try: + page.wait_for_url("**/wp-admin/**", timeout=90000) + except Exception: # noqa: BLE001 + page.wait_for_selector("#wpadminbar", timeout=30000) + + def submit_draft(self, page, ctx: AdapterContext) -> dict[str, Any]: + # Land on wp-admin so wpApiSettings (which carries the nonce) is in scope. 
+ page.goto("https://foojay.io/wp-admin/", wait_until="domcontentloaded", timeout=60000) + nonce = page.evaluate( + "() => (window.wpApiSettings && window.wpApiSettings.nonce) || null" + ) + if not nonce: + raise AdapterError("could not extract wpApiSettings.nonce from /wp-admin/") + + if ctx.validate_only: + shot = _save_screenshot(page, ctx.post.slug, "foojay-editor") + return {"validated": True, "screenshot": str(shot), "nonce_acquired": True} + + cookies = page.context.cookies("https://foojay.io/") + cookie_header = "; ".join(f"{c['name']}={c['value']}" for c in cookies) + + # Resolve / create the codenameone tag. + tag_id = self._ensure_tag(cookie_header, nonce) + # Upload the cover image into the WP media library and use the + # returned media ID as the post's featured image. + featured_media_id: int | None = None + if ctx.post.cover_image: + try: + featured_media_id = self._upload_featured_media( + cookie_header, nonce, ctx.post.cover_image, ctx.post.title + ) + except Exception as err: # noqa: BLE001 — featured image is best-effort + print(f" [foojay] featured image upload failed (non-fatal): {err}", file=sys.stderr) + + # Yoast canonical (_yoast_wpseo_canonical) is not registered for REST + # writes on this Yoast install. We send it in `meta` regardless (it's + # silently ignored if rejected, accepted if registered) AND surface it + # as a hidden HTML comment at the top of the body so the editor can + # spot the original URL when filling Yoast's metabox. 
+        excerpt = str(ctx.post.front_matter.get("description") or "").strip() + canonical_prefix = f"<!-- canonical: {ctx.post.canonical_url} -->\n\n" + + payload: dict[str, Any] = { + "title": ctx.post.title, + "content": canonical_prefix + ctx.body_markdown, + "status": "draft", + "categories": [self.JAVA_CATEGORY_ID], + "tags": [tag_id] if tag_id else [], + "meta": { + "_yoast_wpseo_canonical": ctx.post.canonical_url, + "_yoast_wpseo_title": ctx.post.title, + "_yoast_wpseo_metadesc": excerpt[:155] if excerpt else "", + }, + } + if featured_media_id: + payload["featured_media"] = featured_media_id + if excerpt: + payload["excerpt"] = excerpt[:500] + + data = self._rest_post(self.REST_POSTS_ENDPOINT, cookie_header, nonce, payload) + post_id = data.get("id") + if not post_id: + raise AdapterError(f"REST response missing post id: {data}") + + # Yoast meta (canonical / SEO title / metadesc) is not REST-writable on + # foojay's Yoast install. wp-admin form-submit is blocked by Cloudflare. + # XML-RPC's wp.editPost with custom_fields bypasses both restrictions + # and successfully writes the underscore-prefixed meta keys.
+ yoast_set = False + try: + self._set_yoast_meta_via_xmlrpc( + post_id=post_id, + canonical=ctx.post.canonical_url, + seo_title=ctx.post.title, + metadesc=_trim_for_meta_description(excerpt), + ) + yoast_set = True + except Exception as err: # noqa: BLE001 — Yoast meta is best-effort + print(f" [foojay] XML-RPC Yoast meta write failed (non-fatal): {err}", file=sys.stderr) + + return { + "id": post_id, + "url": f"https://foojay.io/wp-admin/post.php?post={post_id}&action=edit", + "preview_url": data.get("link"), + "featured_media_id": featured_media_id, + "yoast_meta_set": yoast_set, + "syndicated_at": dt.datetime.now(dt.timezone.utc).isoformat(timespec="seconds"), + } + + # ----- helpers ----- + + def _ensure_tag(self, cookie_header: str, nonce: str) -> int | None: + """Return the WP tag id for `codenameone`, creating it if missing.""" + import urllib.parse as _up + try: + existing = self._rest_get( + f"{self.REST_TAGS_ENDPOINT}?slug={_up.quote(self.CODENAMEONE_TAG_SLUG)}", + cookie_header, + nonce, + ) + if isinstance(existing, list) and existing: + return existing[0].get("id") + created = self._rest_post( + self.REST_TAGS_ENDPOINT, + cookie_header, + nonce, + {"name": self.CODENAMEONE_TAG_NAME, "slug": self.CODENAMEONE_TAG_SLUG}, + ) + return created.get("id") + except Exception as err: # noqa: BLE001 — tag is best-effort + print(f" [foojay] tag resolve/create failed (non-fatal): {err}", file=sys.stderr) + return None + + def _upload_featured_media(self, cookie_header: str, nonce: str, + image_url: str, title: str) -> int: + """Download the cover image and POST it into WP's media library.""" + import urllib.request as _ur + # Download bytes + req = _ur.Request(image_url, headers={"User-Agent": _UA_STR}) + with _ur.urlopen(req, timeout=120) as resp: + image_bytes = resp.read() + content_type = resp.headers.get("Content-Type", "image/jpeg") + filename = image_url.rsplit("/", 1)[-1].split("?", 1)[0] or "cover.jpg" + + upload = _ur.Request(self.REST_MEDIA_ENDPOINT, 
data=image_bytes, method="POST") + upload.add_header("Content-Type", content_type) + upload.add_header("Content-Disposition", f'attachment; filename="{filename}"') + upload.add_header("X-WP-Nonce", nonce) + upload.add_header("Cookie", cookie_header) + upload.add_header("User-Agent", _UA_STR) + with _ur.urlopen(upload, timeout=120) as response: + data = json.loads(response.read().decode("utf-8")) + media_id = data.get("id") + if not media_id: + raise RuntimeError(f"media upload returned no id: {str(data)[:200]}") + # Set a friendlier title on the media item. + try: + self._rest_post( + f"{self.REST_MEDIA_ENDPOINT}/{media_id}", cookie_header, nonce, + {"title": title, "alt_text": title}, + ) + except Exception: # noqa: BLE001 — title is cosmetic + pass + return media_id + + def _set_yoast_meta_via_xmlrpc(self, post_id: int, canonical: str, + seo_title: str, metadesc: str) -> None: + """Update Yoast SEO post meta via XML-RPC's wp.editPost custom_fields. + + REST silently drops these meta keys (not registered for REST writes) + and the wp-admin form-submit path is challenged by Cloudflare. + XML-RPC accepts underscore-prefixed meta keys via custom_fields and + is not Cloudflare-protected on foojay.
+ """ + import urllib.error as _ue + import urllib.request as _ur + import xml.sax.saxutils as _su + + user = os.environ["FOOJAY_USER"] + pwd = os.environ["FOOJAY_PASSWORD"] + + def cf_member(key: str, value: str) -> str: + return ( + "" + f"key{_su.escape(key)}" + f"value{_su.escape(value)}" + "" + ) + + custom_fields_xml = "".join([ + cf_member("_yoast_wpseo_canonical", canonical), + cf_member("_yoast_wpseo_title", seo_title), + cf_member("_yoast_wpseo_metadesc", metadesc), + ]) + envelope = ( + '' + 'wp.editPost' + "1" + f"{_su.escape(user)}" + f"{_su.escape(pwd)}" + f"{int(post_id)}" + "" + f"custom_fields{custom_fields_xml}" + "" + "" + ) + req = _ur.Request( + self.XMLRPC_ENDPOINT, + data=envelope.encode("utf-8"), + method="POST", + ) + req.add_header("Content-Type", "text/xml") + req.add_header("User-Agent", _UA_STR) + try: + with _ur.urlopen(req, timeout=60) as response: + body = response.read().decode("utf-8", errors="replace") + except _ue.HTTPError as err: + detail = err.read().decode("utf-8", errors="replace") + raise RuntimeError(f"xmlrpc HTTP {err.code}: {detail}") from err + if "" in body: + raise RuntimeError(f"xmlrpc fault: {body[:500]}") + if "1" not in body: + raise RuntimeError(f"xmlrpc unexpected response: {body[:500]}") + + def _rest_get(self, url: str, cookie_header: str, nonce: str) -> Any: + import urllib.request as _ur + req = _ur.Request(url, method="GET") + req.add_header("Accept", "application/json") + req.add_header("X-WP-Nonce", nonce) + req.add_header("Cookie", cookie_header) + req.add_header("User-Agent", _UA_STR) + with _ur.urlopen(req, timeout=60) as response: + return json.loads(response.read().decode("utf-8")) + + def _rest_post(self, url: str, cookie_header: str, nonce: str, + payload: dict[str, Any]) -> dict[str, Any]: + import urllib.error as _ue + import urllib.request as _ur + req = _ur.Request(url, data=json.dumps(payload).encode("utf-8"), method="POST") + req.add_header("Content-Type", "application/json") + 
req.add_header("Accept", "application/json") + req.add_header("X-WP-Nonce", nonce) + req.add_header("Cookie", cookie_header) + req.add_header("User-Agent", _UA_STR) + try: + with _ur.urlopen(req, timeout=120) as response: + raw = response.read().decode("utf-8") + except _ue.HTTPError as err: + detail = err.read().decode("utf-8", errors="replace") + raise AdapterError(f"REST POST {url} failed HTTP {err.code}: {detail}") from err + return json.loads(raw) if raw else {} + + +class HackerNoonAdapter: + """HackerNoon — app.hackernoon.com email/password login + their own editor. + + Selectors below have NOT been validated against the live site. Run with + ``--validate-only --headed`` first and update them if the run fails. + """ + + name = "hackernoon" + HOME_URL = "https://hackernoon.com/" + NEW_DRAFT_URL = "https://hackernoon.com/new" + + # Login is via a drawer that opens when you click the header "Login" button + # on the public hackernoon.com pages — there is no standalone /login page + # that submits successfully (the visible /login form is decorative; the + # working form is in the drawer). + HEADER_LOGIN_BUTTON_SELECTORS = ["button:text-is('Login')"] + DRAWER_EMAIL_SELECTORS = ["input[type=email][placeholder='Email']"] + DRAWER_PASSWORD_SELECTORS = ["input[type=password][placeholder='Password']"] + DRAWER_SUBMIT_SELECTORS = ["button:text-is('Log In')"] + + # Editor — Quill-based. Reached via "Start Draft" button on /new which + # navigates to app.hackernoon.com/articles/new. + START_DRAFT_SELECTORS = ["button:text-is('Start Draft')"] + TITLE_SELECTORS = ["textarea[name='title'][placeholder='Title']"] + DESCRIPTION_SELECTORS = ["textarea[placeholder*='brief description' i]"] + BODY_QUILL_SELECTORS = ["div.ql-editor[contenteditable='true']"] + COVER_IMAGE_FILE_INPUT_SELECTORS = ["input[type=file][accept*='image']"] + # Story Settings drawer — for canonical / non-original-story flag. 
The + # `css-p9s3bq` class is shared by both Yes and No buttons in the drawer; + # the corresponding modal also has Yes/No buttons (class="negative") + # which would be ambiguous, so we restrict to the drawer styling. + NOT_ORIGINAL_NO_SELECTOR = "button.css-p9s3bq:text-is('No')" + CANONICAL_INPUT_SELECTORS = ["input.firstSeenAt", "input[placeholder='www.example.com/yourstory']"] + # Save creates a draft. Submit Story for Review! sends to editorial and + # only enables once additional fields (image, categories, tags) are set — + # the syndication script intentionally targets Save so the draft lands + # for the user to review, refine, then submit for editorial publish. + SAVE_DRAFT_SELECTORS = ["button:text-is('Save')"] + + @staticmethod + def is_configured() -> bool: + return bool(os.environ.get("HACKERNOON_USER") and os.environ.get("HACKERNOON_PASSWORD")) + + def login(self, page) -> None: + page.goto(self.HOME_URL, wait_until="domcontentloaded", timeout=30000) + # Dismiss the Iubenda cookie consent banner if it overlays the page. + try: + page.click(".iubenda-cs-accept-btn, .iubenda-cs-reject-btn", timeout=3000) + except Exception: # noqa: BLE001 + pass + # Open the login drawer via the header Login button. There may be + # multiple "Login" buttons on the page (header + footer); .first picks + # the visible header one. + page.locator(self.HEADER_LOGIN_BUTTON_SELECTORS[0]).first.click() + # Drawer is React-controlled; type per-character so React's onChange + # actually updates state instead of being silently ignored. 
+ email = page.locator(self.DRAWER_EMAIL_SELECTORS[0]) + email.wait_for(state="visible", timeout=15000) + email.click() + email.press_sequentially(os.environ["HACKERNOON_USER"], delay=10) + pwd = page.locator(self.DRAWER_PASSWORD_SELECTORS[0]) + pwd.click() + pwd.press_sequentially(os.environ["HACKERNOON_PASSWORD"], delay=10) + page.locator(self.DRAWER_SUBMIT_SELECTORS[0]).click() + # Successful login sets a `hasAuthCookie` cookie on .hackernoon.com + # and the drawer disappears. Wait for the cookie rather than a URL + # change because the page may stay on the homepage. + deadline = dt.datetime.now() + dt.timedelta(seconds=30) + while dt.datetime.now() < deadline: + cookies = page.context.cookies("https://hackernoon.com/") + if any(c.get("name") == "hasAuthCookie" for c in cookies): + return + page.wait_for_timeout(500) + raise AdapterError("hackernoon login: hasAuthCookie not set within 30s") + + def submit_draft(self, page, ctx: AdapterContext) -> dict[str, Any]: + page.goto(self.NEW_DRAFT_URL, wait_until="networkidle", timeout=60000) + # SPA needs additional time after networkidle to hydrate the buttons. + start_locator = page.locator(self.START_DRAFT_SELECTORS[0]).first + start_locator.wait_for(state="visible", timeout=30000) + start_locator.click() + # Lands on app.hackernoon.com/articles/new — Quill needs a moment to mount. + page.wait_for_url("**/articles/**", timeout=30000) + page.wait_for_timeout(5000) + + # Title is a React-controlled textarea. .fill() leaves it empty, and + # press_sequentially with a small delay (5-10ms) drops leading chars + # because HN's React onChange debounces faster than the keystrokes. + # An 80ms-per-key delay is slow enough that every key registers. + title_field = page.locator(self.TITLE_SELECTORS[0]) + title_field.wait_for(state="visible", timeout=15000) + title_field.click() + title_field.press_sequentially(ctx.post.title, delay=80) + + # Description (used by HackerNoon as the SEO description / preview). 
+ description = str(ctx.post.front_matter.get("description") or "").strip() + if description: + try: + desc = page.locator(self.DESCRIPTION_SELECTORS[0]) + desc.wait_for(state="visible", timeout=5000) + desc.click() + desc.press_sequentially(description[:300], delay=20) + except Exception: # noqa: BLE001 + pass + + # Body — Quill rich-text editor. Convert markdown to HTML so headings, + # images, links, and code fences render. Inject via Quill's clipboard + # API which translates HTML into Quill's Delta format. The visible + # canonical reference is set in Story Settings below, not in the body. + body_html = _markdown_to_html(ctx.body_markdown) + body = page.locator(self.BODY_QUILL_SELECTORS[0]) + body.wait_for(state="visible", timeout=15000) + body.click() + result = page.evaluate( + """(html) => { + const ce = document.querySelector("div.ql-editor[contenteditable='true']"); + if (!ce) return {via: 'none'}; + if (window.Quill && window.Quill.find) { + let container = ce.parentElement; + let q = null; + for (let i = 0; i < 5 && container; i++) { + q = window.Quill.find(container); + if (q) break; + container = container.parentElement; + } + if (q && q.clipboard && q.clipboard.dangerouslyPasteHTML) { + q.setText(''); + q.clipboard.dangerouslyPasteHTML(0, html, 'api'); + return {via: 'quill', length: q.getLength()}; + } + } + ce.innerHTML = html; + ce.dispatchEvent(new Event('input', {bubbles: true})); + return {via: 'fallback', length: ce.innerHTML.length}; + }""", + body_html, + ) + if result.get("via") == "none": + raise AdapterError("could not access Quill editor instance") + page.wait_for_timeout(1500) + + # Cover image — download from canonical to a temp file, upload via + # the file input that accepts image/*. set_input_files works without + # the file picker dialog actually opening. 
+ if ctx.post.cover_image: + try: + cover_path = _download_to_temp(ctx.post.cover_image) + page.locator(self.COVER_IMAGE_FILE_INPUT_SELECTORS[0]).first.set_input_files(str(cover_path)) + page.wait_for_timeout(4000) # let the upload complete + except Exception as err: # noqa: BLE001 + print(f" [hackernoon] cover image upload failed (non-fatal): {err}", file=sys.stderr) + + # Story Settings drawer: tell HN this story is not original on + # HackerNoon and provide the canonical URL. + try: + no_btn = page.locator(self.NOT_ORIGINAL_NO_SELECTOR).first + no_btn.scroll_into_view_if_needed(timeout=10000) + no_btn.click() + page.wait_for_timeout(1500) + canonical_input = page.locator(self.CANONICAL_INPUT_SELECTORS[0]).first + canonical_input.scroll_into_view_if_needed(timeout=5000) + canonical_input.fill(ctx.post.canonical_url) + page.wait_for_timeout(500) + except Exception as err: # noqa: BLE001 + print(f" [hackernoon] canonical setup failed (non-fatal): {err}", file=sys.stderr) + + if ctx.validate_only: + shot = _save_screenshot(page, ctx.post.slug, "hackernoon-editor") + return {"validated": True, "screenshot": str(shot)} + + page.locator(self.SAVE_DRAFT_SELECTORS[0]).first.click() + page.wait_for_timeout(5000) + return { + "url": page.url, + "syndicated_at": dt.datetime.now(dt.timezone.utc).isoformat(timespec="seconds"), + } + + +class DZoneAdapter: + """DZone — AngularJS login form gated by invisible reCAPTCHA, body editor + is Froala. Uses storage-state auth (DZONE_STORAGE_STATE) since password + login can't pass reCAPTCHA from headless Playwright. + + Live editor URL: /content/article/post.html (the create-form). The Save + Draft button is enabled once title + body have content. 
+ """ + + name = "dzone" + EDITOR_URL = "https://dzone.com/content/article/post.html" + + TITLE_SELECTORS = ["textarea[name='title'][placeholder='Enter Title Here']"] + SUBTITLE_SELECTORS = ["textarea[name='subtitle']"] + META_DESCRIPTION_SELECTORS = ["#meta-description-textarea"] + # Froala renders the editable area as a contenteditable div with class + # `fr-element`. Clicking puts the cursor in the body so keystrokes land. + BODY_FROALA_SELECTORS = ["div.fr-element[contenteditable='true']"] + SAVE_DRAFT_SELECTORS = ["button:has-text('Save draft')"] + + @staticmethod + def is_configured() -> bool: + return bool(os.environ.get("DZONE_STORAGE_STATE")) + + def login(self, page) -> None: + # Storage state is loaded into the browser context up-front; nothing + # to do here. If the cookies have expired the editor page will bounce + # back to login — at which point the user needs to refresh + # DZONE_STORAGE_STATE via export_storage_state.py. + return + + def submit_draft(self, page, ctx: AdapterContext) -> dict[str, Any]: + page.goto(self.EDITOR_URL, wait_until="domcontentloaded", timeout=60000) + page.wait_for_timeout(5000) + + title = page.locator(self.TITLE_SELECTORS[0]) + title.wait_for(state="visible", timeout=20000) + title.click() + title.press_sequentially(ctx.post.title, delay=5) + + # Subtitle / TL;DR — use the post's description if present. + description = str(ctx.post.front_matter.get("description") or "").strip() + if description: + try: + sub = page.locator(self.SUBTITLE_SELECTORS[0]) + sub.wait_for(state="visible", timeout=5000) + sub.click() + sub.press_sequentially(description[:300], delay=5) + except Exception: # noqa: BLE001 + pass + try: + meta = page.locator(self.META_DESCRIPTION_SELECTORS[0]) + meta.wait_for(state="visible", timeout=5000) + meta.click() + meta.press_sequentially(description[:155], delay=5) + except Exception: # noqa: BLE001 + pass + + # Body — Froala rich-text editor. 
Use Froala's JS API to set HTML + # directly; clipboard paste into the contenteditable is unreliable + # (Froala's paste handler often discards or transforms the input). + body_with_canonical = ( + f"

Originally published at " + f"{ctx.post.canonical_url}

\n\n" + f"
{_escape_html(ctx.body_markdown)}
" + ) + body = page.locator(self.BODY_FROALA_SELECTORS[0]).first + body.wait_for(state="visible", timeout=15000) + body.click() + result = page.evaluate( + """(html) => { + const fr = document.querySelector("div.fr-element[contenteditable='true']"); + if (window.FroalaEditor && window.FroalaEditor.INSTANCES && window.FroalaEditor.INSTANCES.length) { + const inst = window.FroalaEditor.INSTANCES[0]; + inst.html.set(html); + if (inst.events && inst.events.trigger) inst.events.trigger('contentChanged'); + return {via: 'froala-api', length: inst.html.get().length}; + } + if (fr) { fr.innerHTML = html; fr.dispatchEvent(new Event('input', {bubbles: true})); return {via: 'fallback', length: fr.innerHTML.length}; } + return {via: 'none'}; + }""", + body_with_canonical, + ) + if result.get("via") == "none": + raise AdapterError("could not access Froala editor instance") + page.wait_for_timeout(2000) + + if ctx.validate_only: + shot = _save_screenshot(page, ctx.post.slug, "dzone-editor") + return {"validated": True, "screenshot": str(shot)} + + page.locator(self.SAVE_DRAFT_SELECTORS[0]).first.click() + page.wait_for_timeout(5000) + return { + "url": page.url, + "syndicated_at": dt.datetime.now(dt.timezone.utc).isoformat(timespec="seconds"), + } + + +class MediumAdapter: + """Medium — no password login flow, so this adapter relies on a saved + Playwright storageState (cookies + localStorage) loaded from the + ``MEDIUM_STORAGE_STATE`` env var (base64-encoded JSON). 
+ + To produce one: + + $ python3 -c "from playwright.sync_api import sync_playwright as p; \\ + ctx=p().start().chromium.launch(headless=False).new_context(); \\ + page=ctx.new_page(); page.goto('https://medium.com/m/signin'); \\ + input('Log in then press Enter...'); \\ + ctx.storage_state(path='medium-state.json')" + $ base64 -i medium-state.json | pbcopy # paste as MEDIUM_STORAGE_STATE + """ + + name = "medium" + EDITOR_URL = "https://medium.com/new-story" + + # Medium's editor is a single contenteditable div with placeholder + # "Title\nTell your story…" — first line typed becomes the H3 title, + # everything after is body. Auto-saves a few seconds after typing pauses. + EDITOR_SELECTOR = "div.postArticle-content[contenteditable='true']" + SETTINGS_BUTTON_SELECTORS = [ + "button:has-text('Story settings')", + "button[aria-label*='settings' i]", + ] + CANONICAL_FIELD_SELECTORS = [ + "input[placeholder*='canonical' i]", + "input[placeholder*='URL of original' i]", + ] + + @staticmethod + def is_configured() -> bool: + return bool(os.environ.get("MEDIUM_STORAGE_STATE")) + + @staticmethod + def storage_state_path() -> Path: + return _load_base64_storage_state("MEDIUM_STORAGE_STATE") + + def login(self, page) -> None: + # No-op: storage state was loaded into the browser context already. + return + + def submit_draft(self, page, ctx: AdapterContext) -> dict[str, Any]: + page.goto(self.EDITOR_URL, wait_until="domcontentloaded", timeout=45000) + page.wait_for_timeout(5000) + editor = page.locator(self.EDITOR_SELECTOR).first + editor.wait_for(state="visible", timeout=20000) + editor.click() + # Type the title, press Enter to drop into the body section, then + # paste the body. Medium auto-saves a few seconds after typing pauses. 
+ page.keyboard.type(ctx.post.title, delay=5) + page.keyboard.press("Enter") + body_with_canonical = ( + f"Originally published at {ctx.post.canonical_url}\n\n" + + ctx.body_markdown + ) + page.evaluate("text => navigator.clipboard.writeText(text)", body_with_canonical) + page.keyboard.press("Meta+V" if sys.platform == "darwin" else "Control+V") + + if ctx.validate_only: + shot = _save_screenshot(page, ctx.post.slug, "medium-editor") + return {"validated": True, "screenshot": str(shot)} + + # Wait for Medium's auto-save to kick in. Medium redirects from + # /new-story to /p//edit once the first save completes. + try: + page.wait_for_url("**/p/*/edit", timeout=45000) + except Exception: # noqa: BLE001 + page.wait_for_timeout(8000) # fall back to a long wait + + return { + "url": page.url, + "syndicated_at": dt.datetime.now(dt.timezone.utc).isoformat(timespec="seconds"), + } + + +ADAPTERS: dict[str, Callable[[], Any]] = { + "foojay": FoojayAdapter, + "hackernoon": HackerNoonAdapter, + "dzone": DZoneAdapter, + "medium": MediumAdapter, +} + + +# --------------------------------------------------------------------------- # +# Driver # +# --------------------------------------------------------------------------- # + + +def run_adapter(adapter, post: Post, body_markdown: str, headed: bool, validate_only: bool) -> dict[str, Any]: + from playwright.sync_api import sync_playwright + + with sync_playwright() as pw: + launch_kwargs: dict[str, Any] = {"headless": not headed} + browser = pw.chromium.launch(**launch_kwargs) + context_kwargs: dict[str, Any] = { + "viewport": {"width": 1400, "height": 900}, + "user_agent": _UA_STR, + } + if isinstance(adapter, MediumAdapter): + context_kwargs["storage_state"] = str(MediumAdapter.storage_state_path()) + elif isinstance(adapter, DZoneAdapter): + context_kwargs["storage_state"] = str(_load_base64_storage_state("DZONE_STORAGE_STATE")) + + context = browser.new_context(**context_kwargs) + # Grant clipboard access so 
navigator.clipboard.writeText() succeeds. + try: + context.grant_permissions(["clipboard-read", "clipboard-write"]) + except Exception: # noqa: BLE001 — Firefox/WebKit don't support all permissions + pass + + page = context.new_page() + ctx = AdapterContext(post=post, body_markdown=body_markdown, headed=headed, validate_only=validate_only) + + try: + adapter.login(page) + result = adapter.submit_draft(page, ctx) + except Exception as err: # noqa: BLE001 + shot = _save_screenshot(page, post.slug, f"{adapter.name}-error") + raise AdapterError(f"{adapter.name} flow failed (screenshot: {shot}): {err}") from err + finally: + context.close() + browser.close() + return result + + +def parse_args(argv: list[str]) -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--platforms", default=DEFAULT_PLATFORMS, + help=f"Comma-separated platforms (default: {DEFAULT_PLATFORMS}).") + parser.add_argument("--dry-run", action="store_true", + help="No browser launched; just print what would happen.") + parser.add_argument("--headed", action="store_true", + help="Run with a visible browser (for local debugging).") + parser.add_argument("--validate-only", action="store_true", + help="Log in and open the editor, then screenshot and exit without submitting.") + parser.add_argument("--today", default=None, help="Override today's date (YYYY-MM-DD).") + parser.add_argument("--floor", default=ELIGIBILITY_FLOOR.isoformat(), + help=f"Posts must be dated strictly after this date (default: {ELIGIBILITY_FLOOR.isoformat()}).") + parser.add_argument("--min-age-days", type=int, default=MIN_AGE_DAYS, + help=f"Minimum post age in days (default: {MIN_AGE_DAYS}).") + parser.add_argument("--blog-dir", default=str(BLOG_DIR)) + parser.add_argument("--state-file", default=str(STATE_FILE)) + return parser.parse_args(argv) + + +def main(argv: list[str]) -> int: + args = parse_args(argv) + today = dt.date.fromisoformat(args.today) if args.today else 
dt.date.today() + floor = dt.date.fromisoformat(args.floor) + requested = [p.strip() for p in args.platforms.split(",") if p.strip()] + blog_dir = Path(args.blog_dir) + state_file = Path(args.state_file) + + unknown = [p for p in requested if p not in ADAPTERS] + if unknown: + print(f"Unknown platform(s): {unknown}. Known: {sorted(ADAPTERS)}", file=sys.stderr) + return 1 + + adapters: list[Any] = [] + for name in requested: + adapter = ADAPTERS[name]() + if args.dry_run or args.validate_only or adapter.is_configured(): + adapters.append(adapter) + else: + print(f"[{name}] credentials not configured; skipping platform.") + + if not adapters: + print("No browser platforms are configured; nothing to do.") + return 0 + + posts = discover_posts(blog_dir) + state = State.load(state_file) + platform_names = [a.name for a in adapters] + candidate = select_candidate(posts, state, platform_names, today, floor, args.min_age_days) + if candidate is None and not args.validate_only: + print("No syndication candidate found today.") + return 0 + if candidate is None and args.validate_only: + # In validate-only mode, fall back to the newest post so we can still + # verify selectors even when nothing is technically due. 
+ candidate = posts[-1] + print(f"validate-only: using newest post {candidate.slug} for selector verification.") + + print(f"Selected post: {candidate.slug} (date={candidate.date.isoformat()})") + body_markdown = render_syndicated_body(candidate) + + any_change = False + failures: list[str] = [] + + for adapter in adapters: + if state.is_syndicated(candidate.slug, adapter.name) and not args.validate_only: + print(f" [{adapter.name}] already syndicated; skipping.") + continue + if args.dry_run: + print(f" [{adapter.name}] dry run — would publish {len(body_markdown)} chars, " + f"canonical {candidate.canonical_url}") + continue + try: + result = run_adapter(adapter, candidate, body_markdown, args.headed, args.validate_only) + except Exception as err: # noqa: BLE001 + print(f" [{adapter.name}] FAILED: {err}", file=sys.stderr) + failures.append(adapter.name) + continue + + if args.validate_only: + print(f" [{adapter.name}] validated. {json.dumps(result)}") + continue + + if not result.get("url"): + print(f" [{adapter.name}] response missing URL: {result}", file=sys.stderr) + failures.append(adapter.name) + continue + + state.record(candidate.slug, adapter.name, result) + any_change = True + print(f" [{adapter.name}] published draft: {result['url']}") + + if any_change: + state.save(state_file) + print(f"Updated state file: {state_file}") + + if failures: + return 2 + return 0 + + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) diff --git a/scripts/website/syndication-state.json b/scripts/website/syndication-state.json new file mode 100644 index 0000000000..ac96b53ff4 --- /dev/null +++ b/scripts/website/syndication-state.json @@ -0,0 +1,17 @@ +{ + "_comment": "Tracks blog posts syndicated by scripts/website/syndicate_blog_posts.py and syndicate_browser_posts.py. Keyed by post slug. 
Each platform sub-object records the remote URL/id and ISO timestamp once syndication succeeds.", + "posts": { + "liquid-glass-material-3-modern-native-themes": { + "devto": { + "id": 3620800, + "url": "https://dev.to/codenameone/liquid-glass-material-3-and-a-lot-of-plumbing-2jkk", + "syndicated_at": "2026-05-06T13:08:18+00:00" + }, + "hashnode": { + "id": "69fb2f0263ebe40f84df66db", + "url": "https://debugagent.com/liquid-glass-material-3-and-a-lot-of-plumbing", + "syndicated_at": "2026-05-06T12:24:00+00:00" + } + } + } +}