11import argparse
22import html
3+ import json
34import re
45from datetime import datetime , timezone
56from pathlib import Path
@@ -71,6 +72,10 @@ def canonical_url(site_url, lang, rel_path):
7172 return f"{ site_url .rstrip ('/' )} /{ lang } /{ rel_path .as_posix ()} "
7273
7374
def asset_url(site_url, lang, asset_path):
    """Return the URL of an asset that lives under a language directory.

    Trailing slashes on ``site_url`` and leading slashes on ``asset_path``
    are dropped so the three pieces are joined by exactly one ``/`` each.
    """
    base = site_url.rstrip("/")
    relative_asset = asset_path.lstrip("/")
    return f"{base}/{lang}/{relative_asset}"
77+
78+
7479def clean_text (fragment ):
7580 fragment = re .sub (r"<script\b[^>]*>.*?</script>" , " " , fragment , flags = re .I | re .S )
7681 fragment = re .sub (r"<style\b[^>]*>.*?</style>" , " " , fragment , flags = re .I | re .S )
@@ -104,8 +109,100 @@ def extract_description(document, fallback):
104109 return trim_description (clean_text (scope ), fallback )
105110
106111
107- def build_seo_block (site_url , lang , rel_path , languages , default_lang ):
def strip_index_suffix(path):
    """Return the POSIX form of ``path`` without a trailing ``index.html``.

    The slash in front of ``index.html`` is removed as well, so
    ``docs/index.html`` becomes ``docs`` and a bare ``index.html`` becomes
    the empty string. Paths that do not end in ``index.html`` as a whole
    segment are returned unchanged.
    """
    posix_path = path.as_posix()
    trailing_index = re.compile(r"(?:^|/)index\.html$")
    return trailing_index.sub("", posix_path)
114+
115+
def is_homepage(rel_path):
    """Return True when ``rel_path`` is exactly the top-level ``index.html``."""
    posix_path = rel_path.as_posix()
    return posix_path == "index.html"
118+
119+
def humanize_slug(value):
    """Turn a path segment such as ``getting-started.html`` into a title.

    ``.html`` is removed, hyphens and underscores become spaces, runs of
    whitespace collapse to a single space, and the result is title-cased.
    A segment that is empty after cleaning yields ``"Home"``.
    """
    label = value.replace(".html", "")
    for separator in ("-", "_"):
        label = label.replace(separator, " ")
    label = re.sub(r"\s+", " ", label.strip())
    if not label:
        return "Home"
    return label.title()
124+
125+
def breadcrumb_items(site_url, lang, rel_path):
    """Return the breadcrumb trail for a page as ``{"name", "url"}`` dicts.

    The trail always starts with a "Home" entry pointing at the language's
    ``index.html``. Each path segment of ``rel_path`` (after a trailing
    ``index.html`` has been stripped) adds one entry whose name is the
    humanized segment.

    Args:
        site_url: Base URL of the site.
        lang: Language code used as the first URL segment.
        rel_path: Path of the page relative to the language directory.

    Returns:
        A list of dicts ordered from the home page down to the current page.
    """
    items = [{"name": "Home", "url": canonical_url(site_url, lang, Path("index.html"))}]
    bare_path = strip_index_suffix(rel_path)
    if not bare_path:
        return items

    parts = [part for part in bare_path.split("/") if part]
    last_idx = len(parts) - 1
    # A trailing "*.html" segment is the page file itself, not a directory, so
    # its crumb must link to that file instead of "<page>.html/index.html".
    leaf_is_page = parts[-1].endswith(".html")
    for idx, part in enumerate(parts):
        if idx == last_idx and leaf_is_page:
            crumb_rel = Path(*parts)
        else:
            # Intermediate segments are directories; link to their index page.
            crumb_rel = Path(*parts[: idx + 1], "index.html")
        items.append({"name": humanize_slug(part), "url": canonical_url(site_url, lang, crumb_rel)})
    return items
137+
138+
def build_structured_data(site_url, lang, rel_path, title, description, site_name, image_url):
    """Build the schema.org nodes embedded as JSON-LD for one page.

    Returns a list of four dicts, in this order: Organization, WebSite,
    WebPage and BreadcrumbList. On the homepage the WebSite node also gets a
    ``potentialAction`` SearchAction entry.

    Args:
        site_url: Base URL of the site.
        lang: Language code of the page being described.
        rel_path: Path of the page relative to the language directory.
        title: Page title used as the WebPage name.
        description: Page description used on the WebPage node.
        site_name: Name used for the Organization and WebSite nodes.
        image_url: Image URL used as organization logo and primary page image.
    """
    page_url = canonical_url(site_url, lang, rel_path)
    root = site_url.rstrip("/")
    # NOTE(review): the site-level language is fixed to "en" here and on the
    # WebSite node below, independent of `lang` - confirm this is intended.
    search_base_url = canonical_url(site_url, "en", Path("index.html"))

    organization = {
        "@context": "https://schema.org",
        "@type": "Organization",
        "@id": f"{root}/#organization",
        "name": site_name,
        "url": root,
        "logo": {"@type": "ImageObject", "url": image_url},
    }

    website = {
        "@context": "https://schema.org",
        "@type": "WebSite",
        "@id": f"{root}/#website",
        "url": root,
        "name": site_name,
        "inLanguage": "en",
        "publisher": {"@id": f"{root}/#organization"},
    }

    webpage = {
        "@context": "https://schema.org",
        "@type": "WebPage",
        "@id": f"{page_url}#webpage",
        "url": page_url,
        "name": title,
        "description": description,
        "inLanguage": lang,
        "isPartOf": {"@id": f"{root}/#website"},
        "about": {"@id": f"{root}/#organization"},
        "primaryImageOfPage": {"@type": "ImageObject", "url": image_url},
    }

    # Breadcrumb positions are 1-based.
    list_items = [
        {
            "@type": "ListItem",
            "position": position,
            "name": crumb["name"],
            "item": crumb["url"],
        }
        for position, crumb in enumerate(breadcrumb_items(site_url, lang, rel_path), start=1)
    ]
    breadcrumbs = {
        "@context": "https://schema.org",
        "@type": "BreadcrumbList",
        "itemListElement": list_items,
    }

    # Only the homepage advertises the site search action on the WebSite node.
    if is_homepage(rel_path):
        website["potentialAction"] = {
            "@type": "SearchAction",
            "target": f"{search_base_url}?search={{search_term_string}}",
            "query-input": "required name=search_term_string",
        }

    return [organization, website, webpage, breadcrumbs]
196+
197+
198+ def build_seo_block (site_url , lang , rel_path , languages , default_lang , title , description , site_name ):
108199 current_url = canonical_url (site_url , lang , rel_path )
200+ image_url = asset_url (site_url , default_lang , "favicon.png" )
201+ structured_data = json .dumps (
202+ build_structured_data (site_url , lang , rel_path , title , description , site_name , image_url ),
203+ ensure_ascii = False ,
204+ separators = ("," , ":" ),
205+ )
109206 lines = [SEO_START , f'<link rel="canonical" href="{ html .escape (current_url , quote = True )} ">' ]
110207
111208 for alt_lang in languages :
@@ -116,6 +213,23 @@ def build_seo_block(site_url, lang, rel_path, languages, default_lang):
116213
117214 default_url = canonical_url (site_url , default_lang , rel_path )
118215 lines .append (f'<link rel="alternate" hreflang="x-default" href="{ html .escape (default_url , quote = True )} ">' )
216+ lines .extend (
217+ [
218+ f'<meta property="og:site_name" content="{ html .escape (site_name , quote = True )} ">' ,
219+ f'<meta property="og:title" content="{ html .escape (title , quote = True )} ">' ,
220+ f'<meta property="og:description" content="{ html .escape (description , quote = True )} ">' ,
221+ f'<meta property="og:url" content="{ html .escape (current_url , quote = True )} ">' ,
222+ f'<meta property="og:type" content="website">' ,
223+ f'<meta property="og:image" content="{ html .escape (image_url , quote = True )} ">' ,
224+ f'<meta property="og:image:alt" content="{ html .escape (site_name , quote = True )} ">' ,
225+ f'<meta property="og:locale" content="{ html .escape (lang , quote = True )} ">' ,
226+ f'<meta name="twitter:card" content="summary">' ,
227+ f'<meta name="twitter:title" content="{ html .escape (title , quote = True )} ">' ,
228+ f'<meta name="twitter:description" content="{ html .escape (description , quote = True )} ">' ,
229+ f'<meta name="twitter:image" content="{ html .escape (image_url , quote = True )} ">' ,
230+ '<script type="application/ld+json">' + structured_data + "</script>" ,
231+ ]
232+ )
119233 lines .append (SEO_END )
120234 return "\n " .join (lines )
121235
@@ -125,7 +239,9 @@ def update_document(document, site_url, lang, rel_path, languages, default_lang,
125239 page_title = clean_text (title_match .group (1 )) if title_match else site_name
126240 fallback_description = f"{ site_name } : { page_title } "
127241 description = extract_description (document , fallback_description )
128- seo_block = build_seo_block (site_url , lang , rel_path , languages , default_lang )
242+ seo_block = build_seo_block (
243+ site_url , lang , rel_path , languages , default_lang , page_title , description , site_name
244+ )
129245
130246 document = re .sub (
131247 r"\s*<!-- HT_SEO_START -->.*?<!-- HT_SEO_END -->\s*" ,
0 commit comments