Skip to content

Commit d06fe66

Browse files
Merge pull request #8 from AtlasAnalyticsLab/fix/sitemap-robots-2026-02-07
fix: update robots.txt and sitemap.xml for improved crawler management
2 parents 39cdce6 + 6bb86d5 commit d06fe66

File tree

3 files changed

+42
-33
lines changed

3 files changed

+42
-33
lines changed

DEVELOPMENT.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -614,7 +614,7 @@ If you need to change the website URL:
614614
615615
#### Step 1: Update Critical Configuration
616616
- [ ] Update `_config.yml` → `url:` field
617-
- [ ] Update `robots.txt` → `Sitemap:` line
617+
- [ ] Verify `robots.txt` → `Sitemap:` line (generated from `{{ site.url }}{{ site.baseurl }}`)
618618
- [ ] Update or remove `CNAME` file if using custom domain
619619
620620
#### Step 2: Test Locally

robots.txt

Lines changed: 33 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -13,18 +13,39 @@ permalink: /robots.txt
1313
# - Malicious crawlers may ignore this file
1414
# - For GitHub Pages, this provides basic protection
1515

16-
# Allow major search engines with rate limiting
16+
# Allow major search engines.
17+
# Note: Googlebot ignores Crawl-delay directives, so we omit it to avoid Search Console warnings.
1718
User-agent: Googlebot
18-
Crawl-delay: 10
19-
Allow: /
19+
Disallow: /images/
20+
Disallow: /assets/
21+
Disallow: /_site/
22+
Disallow: /bin/
23+
Disallow: /CNAME
24+
Disallow: /README.md
25+
Disallow: /DEVELOPMENT.md
26+
Disallow: /.htaccess
2027

2128
User-agent: Bingbot
2229
Crawl-delay: 10
23-
Allow: /
30+
Disallow: /images/
31+
Disallow: /assets/
32+
Disallow: /_site/
33+
Disallow: /bin/
34+
Disallow: /CNAME
35+
Disallow: /README.md
36+
Disallow: /DEVELOPMENT.md
37+
Disallow: /.htaccess
2438

2539
User-agent: Slurp
2640
Crawl-delay: 10
27-
Allow: /
41+
Disallow: /images/
42+
Disallow: /assets/
43+
Disallow: /_site/
44+
Disallow: /bin/
45+
Disallow: /CNAME
46+
Disallow: /README.md
47+
Disallow: /DEVELOPMENT.md
48+
Disallow: /.htaccess
2849

2950
# Block aggressive/problematic crawlers
3051
User-agent: MJ12bot
@@ -64,18 +85,14 @@ Crawl-delay: 10
6485
Disallow: /images/
6586
Disallow: /assets/
6687
Disallow: /_site/
88+
Disallow: /bin/
89+
Disallow: /CNAME
90+
Disallow: /README.md
91+
Disallow: /DEVELOPMENT.md
92+
Disallow: /.htaccess
6793

68-
# Allow access to main pages
69-
Allow: /$
70-
Allow: /allnews
71-
Allow: /allnews.html
72-
Allow: /team
73-
Allow: /publications
74-
Allow: /contact
75-
Allow: /funding
76-
Allow: /gallery
77-
Allow: /openings
78-
Allow: /sitemap.xml
94+
# Allow all remaining paths (anything not disallowed above is crawlable by default; stated explicitly for clarity)
95+
Allow: /
7996

8097
# Sitemap location (helps good crawlers index efficiently)
8198
Sitemap: {{ site.url }}{{ site.baseurl }}/sitemap.xml

sitemap.xml

Lines changed: 8 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -4,25 +4,17 @@ permalink: /sitemap.xml
44
---
55
<?xml version="1.0" encoding="UTF-8"?>
66
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
7-
{% for page in site.pages %}
8-
{% if page.url == nil %}
9-
{% continue %}
10-
{% endif %}
11-
12-
{% if page.exclude_from_sitemap == true %}
13-
{% continue %}
14-
{% endif %}
15-
16-
{% if page.url == "/404.html" or page.url == "/sitemap.xml" or page.url == "/robots.txt" %}
17-
{% continue %}
18-
{% endif %}
19-
20-
{% if page.url contains ".css" or page.url contains ".js" or page.url contains ".xml" or page.url contains ".txt" %}
21-
{% continue %}
22-
{% endif %}
7+
{% assign pages_list = site.pages | where_exp: "p", "p.url != nil" %}
8+
{% for page in pages_list %}
9+
{% if page.exclude_from_sitemap == true %}{% continue %}{% endif %}
10+
{% if page.url == "/404.html" or page.url == "/sitemap.xml" or page.url == "/robots.txt" %}{% continue %}{% endif %}
11+
{% if page.url contains ".css" or page.url contains ".js" or page.url contains ".xml" or page.url contains ".txt" %}{% continue %}{% endif %}
2312

2413
<url>
2514
<loc>{{ site.url }}{{ site.baseurl }}{{ page.url | replace: "index.html", "" }}</loc>
15+
{% if page.last_modified_at %}
16+
<lastmod>{{ page.last_modified_at | date_to_xmlschema }}</lastmod>
17+
{% endif %}
2618
</url>
2719
{% endfor %}
2820
</urlset>

0 commit comments

Comments
 (0)