Skip to content

Commit a40dc53

Browse files
committed
improvement(seo): restore explicit AI/search bot allow-list and add link-preview rules
1 parent 79ffccc commit a40dc53

1 file changed

Lines changed: 113 additions & 16 deletions

File tree

apps/sim/app/robots.ts

Lines changed: 113 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,126 @@
11
import type { MetadataRoute } from 'next'
22
import { getBaseUrl } from '@/lib/core/utils/urls'
33

4+
/**
 * Baseline robots.txt `Disallow` entries.
 *
 * Shared by the wildcard (`*`) rule and the explicitly allow-listed search/AI
 * crawler rule. Per the original intent, this keeps crawlers away from
 * authenticated surfaces, internal endpoints, and one-time-use links.
 *
 * The last entry, `/blog*tag=`, is a wildcard pattern rather than a plain
 * path prefix.
 */
const DISALLOWED_PATHS: string[] = [
  '/api/',
  '/workspace/',
  '/chat/',
  '/playground/',
  '/resume/',
  '/invite/',
  '/unsubscribe/',
  '/w/',
  '/form/',
  '/credential-account/',
  '/_next/',
  '/private/',
  '/blog*tag=',
]
23+
24+
/**
 * Reduced `Disallow` set served to link-preview bots.
 *
 * Preview bots request individual URLs to build Open Graph cards instead of
 * crawling the site, so publicly-shareable surfaces have to stay reachable
 * for those cards to render. Relative to the default list this one leaves out
 * `/chat/`, `/playground/`, `/w/`, `/form/`, and the `/blog*tag=` pattern;
 * every other default entry is still blocked.
 */
const LINK_PREVIEW_DISALLOWED_PATHS: string[] = [
  '/api/',
  '/workspace/',
  '/resume/',
  '/invite/',
  '/unsubscribe/',
  '/credential-account/',
  '/_next/',
  '/private/',
]
39+
40+
/**
 * User-agent tokens for search engines and AI / answer-engine crawlers that
 * receive their own explicit rule (SEO / AEO / GEO).
 *
 * Listing them by name, rather than relying on the wildcard rule alone, is
 * meant to keep these bots from being accidentally suppressed by downstream
 * filters and to make the intent visible to operators that publish
 * allow-list audits (Profound, Scrunch, Otterly, etc.).
 */
const SEARCH_AND_AI_BOTS: string[] = [
  // --- Search engines: Western ---
  'Googlebot',
  'Bingbot',
  'DuckDuckBot',
  'Kagibot',
  'Bravebot',

  // --- Search engines: regional ---
  'YandexBot',
  'Baiduspider',
  'Sogou web spider',
  'Yeti',
  'SeznamBot',
  'PetalBot',

  // --- OpenAI ---
  'GPTBot',
  'OAI-SearchBot',
  'ChatGPT-User',

  // --- Anthropic ---
  'ClaudeBot',
  'Claude-SearchBot',
  'Claude-User',

  // --- Google AI ---
  'Google-Extended',

  // --- Perplexity ---
  'PerplexityBot',
  'Perplexity-User',

  // --- Apple ---
  'Applebot',
  'Applebot-Extended',

  // --- Meta ---
  'Meta-ExternalAgent',
  'Meta-ExternalFetcher',
  'FacebookBot',

  // --- Other major AI / answer engines ---
  'Amazonbot',
  'CCBot',
  'cohere-ai',
  'cohere-training-data-crawler',
  'Grok-web-crawl',
  'MistralAI-User',
  'DeepSeek-AI',
  'YouBot',
  'Diffbot',
  'AI2Bot',
  'Timpibot',
  'ImageSiftBot',
]
94+
95+
/**
 * User-agent tokens of social and messaging platforms that fetch a shared URL
 * to build its link preview (Open Graph / Twitter Card images).
 *
 * These agents get their own rule because they need to reach publicly-shared
 * surfaces such as `/chat/` and `/form/`, which the default disallow list
 * blocks.
 */
const LINK_PREVIEW_BOTS: string[] = [
  'Twitterbot',
  'LinkedInBot',
  'Slackbot',
  'Slack-ImgProxy',
  'Discordbot',
  'TelegramBot',
  'WhatsApp',
  'facebookexternalhit',
  'Pinterestbot',
  'redditbot',
]
112+
4113
export default function robots(): MetadataRoute.Robots {
5114
const baseUrl = getBaseUrl()
6115

7116
return {
8117
rules: [
118+
{ userAgent: '*', allow: '/', disallow: DISALLOWED_PATHS },
119+
{ userAgent: SEARCH_AND_AI_BOTS, allow: '/', disallow: DISALLOWED_PATHS },
9120
{
10-
userAgent: '*',
121+
userAgent: LINK_PREVIEW_BOTS,
11122
allow: '/',
12-
disallow: [
13-
'/api/',
14-
'/workspace/',
15-
'/chat/',
16-
'/playground/',
17-
'/resume/',
18-
'/invite/',
19-
'/unsubscribe/',
20-
'/w/',
21-
'/form/',
22-
'/credential-account/',
23-
'/_next/',
24-
'/private/',
25-
'/blog*tag=',
26-
],
123+
disallow: LINK_PREVIEW_DISALLOWED_PATHS,
27124
},
28125
],
29126
sitemap: [`${baseUrl}/sitemap.xml`, `${baseUrl}/blog/sitemap-images.xml`],

0 commit comments

Comments
 (0)