Skip to content

Commit dc3df85

Browse files
committed
Add automatic social and structured SEO tags
1 parent 48514db commit dc3df85

1 file changed

Lines changed: 118 additions & 2 deletions

File tree

scripts/seo_postprocess.py

Lines changed: 118 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import argparse
22
import html
3+
import json
34
import re
45
from datetime import datetime, timezone
56
from pathlib import Path
@@ -71,6 +72,10 @@ def canonical_url(site_url, lang, rel_path):
7172
return f"{site_url.rstrip('/')}/{lang}/{rel_path.as_posix()}"
7273

7374

75+
def asset_url(site_url, lang, asset_path):
    """Return the absolute URL of a static asset under a language root.

    Trailing slashes on ``site_url`` and leading slashes on ``asset_path``
    are dropped so the three pieces are joined by exactly one ``/`` each.
    """
    base = site_url.rstrip("/")
    relative_asset = asset_path.lstrip("/")
    return f"{base}/{lang}/{relative_asset}"
77+
78+
7479
def clean_text(fragment):
7580
fragment = re.sub(r"<script\b[^>]*>.*?</script>", " ", fragment, flags=re.I | re.S)
7681
fragment = re.sub(r"<style\b[^>]*>.*?</style>", " ", fragment, flags=re.I | re.S)
@@ -104,8 +109,100 @@ def extract_description(document, fallback):
104109
return trim_description(clean_text(scope), fallback)
105110

106111

107-
def build_seo_block(site_url, lang, rel_path, languages, default_lang):
112+
def strip_index_suffix(path):
    """Return ``path`` as a POSIX string without a trailing ``index.html``.

    The slash in front of ``index.html`` is removed as well, so
    ``docs/index.html`` becomes ``docs`` and a bare ``index.html`` becomes
    the empty string. Any other path is returned unchanged.
    """
    posix_path = path.as_posix()
    found = re.search(r"(?:^|/)index\.html$", posix_path)
    if found is None:
        return posix_path
    # Cut the matched span out and keep whatever surrounds it.
    return posix_path[: found.start()] + posix_path[found.end():]
114+
115+
116+
def is_homepage(rel_path):
    """Tell whether ``rel_path`` is the top-level ``index.html`` of a language tree."""
    homepage_name = "index.html"
    posix_path = rel_path.as_posix()
    return posix_path == homepage_name
118+
119+
120+
def humanize_slug(value):
    """Turn a URL path segment into a title-cased, human-readable label.

    A trailing ``.html`` extension is dropped, hyphens and underscores become
    spaces, runs of whitespace collapse to a single space, and the result is
    title-cased. An input that is empty after this clean-up yields ``"Home"``.

    Only a *trailing* ``.html`` is removed; the text ``.html`` elsewhere in
    the segment is part of the name and is kept.
    """
    extension = ".html"
    value = value.strip()
    if value.endswith(extension):
        value = value[: -len(extension)]
    # split() with no argument both trims and collapses whitespace runs.
    words = value.replace("-", " ").replace("_", " ").split()
    if not words:
        return "Home"
    return " ".join(words).title()
124+
125+
126+
def breadcrumb_items(site_url, lang, rel_path):
    """Build the breadcrumb trail for a page as a list of name/url dicts.

    The trail always starts with a ``Home`` entry pointing at the language's
    ``index.html``. Each further path segment adds one entry whose name comes
    from ``humanize_slug``. Directory segments link to their ``index.html``.
    When the page itself is a non-index ``.html`` file, its own entry links
    to the page's canonical URL rather than to ``<page>.html/index.html``,
    which would not exist.

    Args:
        site_url: Site base URL, passed through to ``canonical_url``.
        lang: Language code used as the first URL path segment.
        rel_path: ``pathlib`` path of the page relative to the language root.

    Returns:
        A list of ``{"name": ..., "url": ...}`` dicts ordered from the
        homepage to the current page.
    """
    items = [{"name": "Home", "url": canonical_url(site_url, lang, Path("index.html"))}]
    bare_path = strip_index_suffix(rel_path)
    if not bare_path:
        return items

    parts = [part for part in bare_path.split("/") if part]
    # strip_index_suffix leaves the path untouched unless it ends in
    # index.html, so an unchanged path ending in .html is the page itself.
    leaf_is_page = bare_path == rel_path.as_posix() and bare_path.endswith(".html")
    last_index = len(parts) - 1

    for idx, part in enumerate(parts):
        if leaf_is_page and idx == last_index:
            crumb_url = canonical_url(site_url, lang, rel_path)
        else:
            crumb_url = canonical_url(site_url, lang, Path(*parts[: idx + 1], "index.html"))
        items.append({"name": humanize_slug(part), "url": crumb_url})
    return items
137+
138+
139+
def build_structured_data(site_url, lang, rel_path, title, description, site_name, image_url):
    """Assemble the schema.org nodes describing a page.

    Returns a list of four dicts, in this order: ``Organization``,
    ``WebSite``, ``WebPage`` and ``BreadcrumbList``. On the homepage the
    ``WebSite`` node also carries a ``SearchAction`` under
    ``potentialAction``. Key order inside each dict is kept stable because it
    determines the serialized output.
    """
    page_url = canonical_url(site_url, lang, rel_path)
    site_root = site_url.rstrip("/")
    search_base_url = canonical_url(site_url, "en", Path("index.html"))
    organization_id = f"{site_root}/#organization"
    website_id = f"{site_root}/#website"

    organization = {
        "@context": "https://schema.org",
        "@type": "Organization",
        "@id": organization_id,
        "name": site_name,
        "url": site_root,
        "logo": {"@type": "ImageObject", "url": image_url},
    }

    website = {
        "@context": "https://schema.org",
        "@type": "WebSite",
        "@id": website_id,
        "url": site_root,
        "name": site_name,
        "inLanguage": "en",
        "publisher": {"@id": organization_id},
    }

    webpage = {
        "@context": "https://schema.org",
        "@type": "WebPage",
        "@id": f"{page_url}#webpage",
        "url": page_url,
        "name": title,
        "description": description,
        "inLanguage": lang,
        "isPartOf": {"@id": website_id},
        "about": {"@id": organization_id},
        "primaryImageOfPage": {"@type": "ImageObject", "url": image_url},
    }

    # Positions are 1-based, following the order returned by breadcrumb_items.
    list_elements = []
    for position, crumb in enumerate(breadcrumb_items(site_url, lang, rel_path), start=1):
        list_elements.append(
            {
                "@type": "ListItem",
                "position": position,
                "name": crumb["name"],
                "item": crumb["url"],
            }
        )
    breadcrumb_list = {
        "@context": "https://schema.org",
        "@type": "BreadcrumbList",
        "itemListElement": list_elements,
    }

    if is_homepage(rel_path):
        # Added last so it stays the final key of the WebSite node.
        website["potentialAction"] = {
            "@type": "SearchAction",
            "target": f"{search_base_url}?search={{search_term_string}}",
            "query-input": "required name=search_term_string",
        }

    return [organization, website, webpage, breadcrumb_list]
196+
197+
198+
def build_seo_block(site_url, lang, rel_path, languages, default_lang, title, description, site_name):
108199
current_url = canonical_url(site_url, lang, rel_path)
200+
image_url = asset_url(site_url, default_lang, "favicon.png")
201+
structured_data = json.dumps(
202+
build_structured_data(site_url, lang, rel_path, title, description, site_name, image_url),
203+
ensure_ascii=False,
204+
separators=(",", ":"),
205+
)
109206
lines = [SEO_START, f'<link rel="canonical" href="{html.escape(current_url, quote=True)}">']
110207

111208
for alt_lang in languages:
@@ -116,6 +213,23 @@ def build_seo_block(site_url, lang, rel_path, languages, default_lang):
116213

117214
default_url = canonical_url(site_url, default_lang, rel_path)
118215
lines.append(f'<link rel="alternate" hreflang="x-default" href="{html.escape(default_url, quote=True)}">')
216+
lines.extend(
217+
[
218+
f'<meta property="og:site_name" content="{html.escape(site_name, quote=True)}">',
219+
f'<meta property="og:title" content="{html.escape(title, quote=True)}">',
220+
f'<meta property="og:description" content="{html.escape(description, quote=True)}">',
221+
f'<meta property="og:url" content="{html.escape(current_url, quote=True)}">',
222+
f'<meta property="og:type" content="website">',
223+
f'<meta property="og:image" content="{html.escape(image_url, quote=True)}">',
224+
f'<meta property="og:image:alt" content="{html.escape(site_name, quote=True)}">',
225+
f'<meta property="og:locale" content="{html.escape(lang, quote=True)}">',
226+
f'<meta name="twitter:card" content="summary">',
227+
f'<meta name="twitter:title" content="{html.escape(title, quote=True)}">',
228+
f'<meta name="twitter:description" content="{html.escape(description, quote=True)}">',
229+
f'<meta name="twitter:image" content="{html.escape(image_url, quote=True)}">',
230+
'<script type="application/ld+json">' + structured_data + "</script>",
231+
]
232+
)
119233
lines.append(SEO_END)
120234
return "\n ".join(lines)
121235

@@ -125,7 +239,9 @@ def update_document(document, site_url, lang, rel_path, languages, default_lang,
125239
page_title = clean_text(title_match.group(1)) if title_match else site_name
126240
fallback_description = f"{site_name}: {page_title}"
127241
description = extract_description(document, fallback_description)
128-
seo_block = build_seo_block(site_url, lang, rel_path, languages, default_lang)
242+
seo_block = build_seo_block(
243+
site_url, lang, rel_path, languages, default_lang, page_title, description, site_name
244+
)
129245

130246
document = re.sub(
131247
r"\s*<!-- HT_SEO_START -->.*?<!-- HT_SEO_END -->\s*",

0 commit comments

Comments
 (0)