Skip to content

Commit a55b224

Browse files
committed
add datasets
1 parent cb2eb00 commit a55b224

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

48 files changed

+3438
-17
lines changed

apps/sim/blocks/blocks/brightdata.ts

Lines changed: 203 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,54 @@ import type { BlockConfig } from '@/blocks/types'
33
import { AuthMode } from '@/blocks/types'
44
import type { BrightDataResponse } from '@/tools/brightdata/types'
55

6+
/**
 * Maps each dataset operation id to the id of the Bright Data tool that serves
 * it. Every tool id is the operation id prefixed with `brightdata_`.
 *
 * The table is built on a prototype-less object so that a lookup can only match
 * the entries listed here. With a plain object literal, keys inherited from
 * `Object.prototype` (`constructor`, `toString`, ...) resolve to truthy
 * functions, which a truthiness check on the lookup result would mistake for a
 * valid tool id.
 */
const DATASET_TOOL_MAP: Record<string, string> = Object.assign(
  Object.create(null) as Record<string, string>,
  {
    // E-commerce
    dataset_amazon_product: 'brightdata_dataset_amazon_product',
    dataset_amazon_product_reviews: 'brightdata_dataset_amazon_product_reviews',
    dataset_amazon_product_search: 'brightdata_dataset_amazon_product_search',
    dataset_walmart_product: 'brightdata_dataset_walmart_product',
    dataset_walmart_seller: 'brightdata_dataset_walmart_seller',
    dataset_ebay_product: 'brightdata_dataset_ebay_product',
    dataset_homedepot_products: 'brightdata_dataset_homedepot_products',
    dataset_zara_products: 'brightdata_dataset_zara_products',
    dataset_etsy_products: 'brightdata_dataset_etsy_products',
    dataset_bestbuy_products: 'brightdata_dataset_bestbuy_products',
    // Professional / company data
    dataset_linkedin_person_profile: 'brightdata_dataset_linkedin_person_profile',
    dataset_linkedin_company_profile: 'brightdata_dataset_linkedin_company_profile',
    dataset_linkedin_job_listings: 'brightdata_dataset_linkedin_job_listings',
    dataset_linkedin_posts: 'brightdata_dataset_linkedin_posts',
    dataset_linkedin_people_search: 'brightdata_dataset_linkedin_people_search',
    dataset_crunchbase_company: 'brightdata_dataset_crunchbase_company',
    dataset_zoominfo_company_profile: 'brightdata_dataset_zoominfo_company_profile',
    // Social media
    dataset_instagram_profiles: 'brightdata_dataset_instagram_profiles',
    dataset_instagram_posts: 'brightdata_dataset_instagram_posts',
    dataset_instagram_reels: 'brightdata_dataset_instagram_reels',
    dataset_instagram_comments: 'brightdata_dataset_instagram_comments',
    dataset_facebook_posts: 'brightdata_dataset_facebook_posts',
    dataset_facebook_marketplace_listings: 'brightdata_dataset_facebook_marketplace_listings',
    dataset_facebook_company_reviews: 'brightdata_dataset_facebook_company_reviews',
    dataset_facebook_events: 'brightdata_dataset_facebook_events',
    dataset_tiktok_profiles: 'brightdata_dataset_tiktok_profiles',
    dataset_tiktok_posts: 'brightdata_dataset_tiktok_posts',
    dataset_tiktok_shop: 'brightdata_dataset_tiktok_shop',
    dataset_tiktok_comments: 'brightdata_dataset_tiktok_comments',
    // Search, stores, news, code, finance, travel, video, packages
    dataset_google_maps_reviews: 'brightdata_dataset_google_maps_reviews',
    dataset_google_shopping: 'brightdata_dataset_google_shopping',
    dataset_google_play_store: 'brightdata_dataset_google_play_store',
    dataset_apple_app_store: 'brightdata_dataset_apple_app_store',
    dataset_reuter_news: 'brightdata_dataset_reuter_news',
    dataset_github_repository_file: 'brightdata_dataset_github_repository_file',
    dataset_yahoo_finance_business: 'brightdata_dataset_yahoo_finance_business',
    dataset_x_posts: 'brightdata_dataset_x_posts',
    dataset_zillow_properties_listing: 'brightdata_dataset_zillow_properties_listing',
    dataset_booking_hotel_listings: 'brightdata_dataset_booking_hotel_listings',
    dataset_youtube_profiles: 'brightdata_dataset_youtube_profiles',
    dataset_youtube_comments: 'brightdata_dataset_youtube_comments',
    dataset_reddit_posts: 'brightdata_dataset_reddit_posts',
    dataset_youtube_videos: 'brightdata_dataset_youtube_videos',
    dataset_npm_package: 'brightdata_dataset_npm_package',
    dataset_pypi_package: 'brightdata_dataset_pypi_package',
  }
)
53+
654
export const BrightDataBlock: BlockConfig<BrightDataResponse> = {
755
type: 'brightdata',
856
name: 'Bright Data',
@@ -25,6 +73,50 @@ export const BrightDataBlock: BlockConfig<BrightDataResponse> = {
2573
{ label: 'Scrape as Markdown', id: 'scrape_markdown' },
2674
{ label: 'Search Engine', id: 'search_engine' },
2775
{ label: 'Amazon Product Dataset', id: 'dataset_amazon_product' },
76+
{ label: 'Amazon Product Reviews Dataset', id: 'dataset_amazon_product_reviews' },
77+
{ label: 'Amazon Product Search Dataset', id: 'dataset_amazon_product_search' },
78+
{ label: 'Walmart Product Dataset', id: 'dataset_walmart_product' },
79+
{ label: 'Walmart Seller Dataset', id: 'dataset_walmart_seller' },
80+
{ label: 'Ebay Product Dataset', id: 'dataset_ebay_product' },
81+
{ label: 'Homedepot Products Dataset', id: 'dataset_homedepot_products' },
82+
{ label: 'Zara Products Dataset', id: 'dataset_zara_products' },
83+
{ label: 'Etsy Products Dataset', id: 'dataset_etsy_products' },
84+
{ label: 'Bestbuy Products Dataset', id: 'dataset_bestbuy_products' },
85+
{ label: 'Linkedin Person Profile Dataset', id: 'dataset_linkedin_person_profile' },
86+
{ label: 'Linkedin Company Profile Dataset', id: 'dataset_linkedin_company_profile' },
87+
{ label: 'Linkedin Job Listings Dataset', id: 'dataset_linkedin_job_listings' },
88+
{ label: 'Linkedin Posts Dataset', id: 'dataset_linkedin_posts' },
89+
{ label: 'Linkedin People Search Dataset', id: 'dataset_linkedin_people_search' },
90+
{ label: 'Crunchbase Company Dataset', id: 'dataset_crunchbase_company' },
91+
{ label: 'Zoominfo Company Profile Dataset', id: 'dataset_zoominfo_company_profile' },
92+
{ label: 'Instagram Profiles Dataset', id: 'dataset_instagram_profiles' },
93+
{ label: 'Instagram Posts Dataset', id: 'dataset_instagram_posts' },
94+
{ label: 'Instagram Reels Dataset', id: 'dataset_instagram_reels' },
95+
{ label: 'Instagram Comments Dataset', id: 'dataset_instagram_comments' },
96+
{ label: 'Facebook Posts Dataset', id: 'dataset_facebook_posts' },
97+
{ label: 'Facebook Marketplace Listings Dataset', id: 'dataset_facebook_marketplace_listings' },
98+
{ label: 'Facebook Company Reviews Dataset', id: 'dataset_facebook_company_reviews' },
99+
{ label: 'Facebook Events Dataset', id: 'dataset_facebook_events' },
100+
{ label: 'Tiktok Profiles Dataset', id: 'dataset_tiktok_profiles' },
101+
{ label: 'Tiktok Posts Dataset', id: 'dataset_tiktok_posts' },
102+
{ label: 'Tiktok Shop Dataset', id: 'dataset_tiktok_shop' },
103+
{ label: 'Tiktok Comments Dataset', id: 'dataset_tiktok_comments' },
104+
{ label: 'Google Maps Reviews Dataset', id: 'dataset_google_maps_reviews' },
105+
{ label: 'Google Shopping Dataset', id: 'dataset_google_shopping' },
106+
{ label: 'Google Play Store Dataset', id: 'dataset_google_play_store' },
107+
{ label: 'Apple App Store Dataset', id: 'dataset_apple_app_store' },
108+
{ label: 'Reuter News Dataset', id: 'dataset_reuter_news' },
109+
{ label: 'Github Repository File Dataset', id: 'dataset_github_repository_file' },
110+
{ label: 'Yahoo Finance Business Dataset', id: 'dataset_yahoo_finance_business' },
111+
{ label: 'X Posts Dataset', id: 'dataset_x_posts' },
112+
{ label: 'Zillow Properties Listing Dataset', id: 'dataset_zillow_properties_listing' },
113+
{ label: 'Booking Hotel Listings Dataset', id: 'dataset_booking_hotel_listings' },
114+
{ label: 'Youtube Profiles Dataset', id: 'dataset_youtube_profiles' },
115+
{ label: 'Youtube Comments Dataset', id: 'dataset_youtube_comments' },
116+
{ label: 'Reddit Posts Dataset', id: 'dataset_reddit_posts' },
117+
{ label: 'Youtube Videos Dataset', id: 'dataset_youtube_videos' },
118+
{ label: 'Npm Package Dataset', id: 'dataset_npm_package' },
119+
{ label: 'Pypi Package Dataset', id: 'dataset_pypi_package' },
28120
],
29121
value: () => 'scrape_markdown',
30122
},
@@ -53,10 +145,64 @@ export const BrightDataBlock: BlockConfig<BrightDataResponse> = {
53145
},
54146
{
55147
id: 'url',
56-
title: 'Amazon Product URL',
148+
title: 'Dataset URL',
149+
type: 'short-input',
150+
placeholder: 'https://example.com',
151+
condition: { field: 'operation', value: ['dataset_amazon_product', 'dataset_amazon_product_reviews', 'dataset_amazon_product_search', 'dataset_walmart_product', 'dataset_walmart_seller', 'dataset_ebay_product', 'dataset_homedepot_products', 'dataset_zara_products', 'dataset_etsy_products', 'dataset_bestbuy_products', 'dataset_linkedin_person_profile', 'dataset_linkedin_company_profile', 'dataset_linkedin_job_listings', 'dataset_linkedin_posts', 'dataset_linkedin_people_search', 'dataset_crunchbase_company', 'dataset_zoominfo_company_profile', 'dataset_instagram_profiles', 'dataset_instagram_posts', 'dataset_instagram_reels', 'dataset_instagram_comments', 'dataset_facebook_posts', 'dataset_facebook_marketplace_listings', 'dataset_facebook_company_reviews', 'dataset_facebook_events', 'dataset_tiktok_profiles', 'dataset_tiktok_posts', 'dataset_tiktok_shop', 'dataset_tiktok_comments', 'dataset_google_maps_reviews', 'dataset_google_shopping', 'dataset_google_play_store', 'dataset_apple_app_store', 'dataset_reuter_news', 'dataset_github_repository_file', 'dataset_yahoo_finance_business', 'dataset_x_posts', 'dataset_zillow_properties_listing', 'dataset_booking_hotel_listings', 'dataset_youtube_profiles', 'dataset_youtube_comments', 'dataset_reddit_posts', 'dataset_youtube_videos'] },
152+
required: true,
153+
},
154+
{
155+
id: 'keyword',
156+
title: 'Keyword',
157+
type: 'short-input',
158+
placeholder: 'Enter keyword',
159+
condition: { field: 'operation', value: ['dataset_amazon_product_search'] },
160+
required: true,
161+
},
162+
{
163+
id: 'first_name',
164+
title: 'First Name',
165+
type: 'short-input',
166+
placeholder: 'First name',
167+
condition: { field: 'operation', value: ['dataset_linkedin_people_search'] },
168+
required: true,
169+
},
170+
{
171+
id: 'last_name',
172+
title: 'Last Name',
173+
type: 'short-input',
174+
placeholder: 'Last name',
175+
condition: { field: 'operation', value: ['dataset_linkedin_people_search'] },
176+
required: true,
177+
},
178+
{
179+
id: 'num_of_reviews',
180+
title: 'Number of Reviews',
181+
type: 'short-input',
182+
placeholder: '10',
183+
condition: { field: 'operation', value: ['dataset_facebook_company_reviews'] },
184+
required: true,
185+
},
186+
{
187+
id: 'days_limit',
188+
title: 'Days Limit',
189+
type: 'short-input',
190+
placeholder: '3',
191+
condition: { field: 'operation', value: ['dataset_google_maps_reviews'] },
192+
},
193+
{
194+
id: 'num_of_comments',
195+
title: 'Number of Comments',
196+
type: 'short-input',
197+
placeholder: '10',
198+
condition: { field: 'operation', value: ['dataset_youtube_comments'] },
199+
},
200+
{
201+
id: 'package_name',
202+
title: 'Package Name',
57203
type: 'short-input',
58-
placeholder: 'https://www.amazon.com/dp/...',
59-
condition: { field: 'operation', value: 'dataset_amazon_product' },
204+
placeholder: '@brightdata/sdk',
205+
condition: { field: 'operation', value: ['dataset_npm_package', 'dataset_pypi_package'] },
60206
required: true,
61207
},
62208
{
@@ -81,16 +227,60 @@ export const BrightDataBlock: BlockConfig<BrightDataResponse> = {
81227
'brightdata_scrape_markdown',
82228
'brightdata_search_engine',
83229
'brightdata_dataset_amazon_product',
230+
'brightdata_dataset_amazon_product_reviews',
231+
'brightdata_dataset_amazon_product_search',
232+
'brightdata_dataset_walmart_product',
233+
'brightdata_dataset_walmart_seller',
234+
'brightdata_dataset_ebay_product',
235+
'brightdata_dataset_homedepot_products',
236+
'brightdata_dataset_zara_products',
237+
'brightdata_dataset_etsy_products',
238+
'brightdata_dataset_bestbuy_products',
239+
'brightdata_dataset_linkedin_person_profile',
240+
'brightdata_dataset_linkedin_company_profile',
241+
'brightdata_dataset_linkedin_job_listings',
242+
'brightdata_dataset_linkedin_posts',
243+
'brightdata_dataset_linkedin_people_search',
244+
'brightdata_dataset_crunchbase_company',
245+
'brightdata_dataset_zoominfo_company_profile',
246+
'brightdata_dataset_instagram_profiles',
247+
'brightdata_dataset_instagram_posts',
248+
'brightdata_dataset_instagram_reels',
249+
'brightdata_dataset_instagram_comments',
250+
'brightdata_dataset_facebook_posts',
251+
'brightdata_dataset_facebook_marketplace_listings',
252+
'brightdata_dataset_facebook_company_reviews',
253+
'brightdata_dataset_facebook_events',
254+
'brightdata_dataset_tiktok_profiles',
255+
'brightdata_dataset_tiktok_posts',
256+
'brightdata_dataset_tiktok_shop',
257+
'brightdata_dataset_tiktok_comments',
258+
'brightdata_dataset_google_maps_reviews',
259+
'brightdata_dataset_google_shopping',
260+
'brightdata_dataset_google_play_store',
261+
'brightdata_dataset_apple_app_store',
262+
'brightdata_dataset_reuter_news',
263+
'brightdata_dataset_github_repository_file',
264+
'brightdata_dataset_yahoo_finance_business',
265+
'brightdata_dataset_x_posts',
266+
'brightdata_dataset_zillow_properties_listing',
267+
'brightdata_dataset_booking_hotel_listings',
268+
'brightdata_dataset_youtube_profiles',
269+
'brightdata_dataset_youtube_comments',
270+
'brightdata_dataset_reddit_posts',
271+
'brightdata_dataset_youtube_videos',
272+
'brightdata_dataset_npm_package',
273+
'brightdata_dataset_pypi_package',
84274
],
85275
config: {
86276
tool: (params: Record<string, unknown>) => {
277+
const datasetTool = DATASET_TOOL_MAP[String(params.operation)]
278+
if (datasetTool) return datasetTool
87279
switch (params.operation) {
88280
case 'scrape_markdown':
89281
return 'brightdata_scrape_markdown'
90282
case 'search_engine':
91283
return 'brightdata_search_engine'
92-
case 'dataset_amazon_product':
93-
return 'brightdata_dataset_amazon_product'
94284
default:
95285
throw new Error('Invalid operation selected')
96286
}
@@ -103,14 +293,21 @@ export const BrightDataBlock: BlockConfig<BrightDataResponse> = {
103293
url: { type: 'string', description: 'URL to scrape or dataset input' },
104294
query: { type: 'string', description: 'Search query' },
105295
maxResults: { type: 'number', description: 'Maximum search results' },
296+
keyword: { type: 'string', description: 'Dataset keyword input' },
297+
first_name: { type: 'string', description: 'Dataset first name input' },
298+
last_name: { type: 'string', description: 'Dataset last name input' },
299+
num_of_reviews: { type: 'string', description: 'Dataset number of reviews input' },
300+
days_limit: { type: 'string', description: 'Dataset days limit input' },
301+
num_of_comments: { type: 'string', description: 'Dataset number of comments input' },
302+
package_name: { type: 'string', description: 'Dataset package name input' },
106303
apiToken: { type: 'string', description: 'Bright Data API token' },
107304
unlockerZone: { type: 'string', description: 'Unlocker zone name' },
108305
},
109306

110307
outputs: {
111308
markdown: { type: 'string', description: 'Scraped markdown content' },
112309
results: { type: 'array', description: 'Search results' },
113-
data: { type: 'object', description: 'Dataset response' },
310+
data: { type: 'json', description: 'Dataset response' },
114311
url: { type: 'string', description: 'Current or scraped URL' },
115312
title: { type: 'string', description: 'Page title' },
116313
success: { type: 'boolean', description: 'Operation success status' },

apps/sim/tools/brightdata/dataset_amazon_product.ts

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,20 @@ import type { DatasetParams, DatasetResponse } from '@/tools/brightdata/types'
22
import type { ToolConfig } from '@/tools/types'
33

44
/**
5-
* Bright Data Amazon product dataset tool.
5+
* Bright Data Amazon Product dataset tool.
66
*/
77
export const datasetAmazonProductTool: ToolConfig<DatasetParams, DatasetResponse> = {
88
id: 'brightdata_dataset_amazon_product',
99
name: 'Bright Data Amazon Product Dataset',
10-
description: 'Get structured Amazon product data from Bright Data dataset',
10+
description: "Quickly read structured amazon product data.\nRequires a valid product URL with /dp/ in it.\nThis can be a cache lookup, so it can be more reliable than scraping",
1111
version: '1.0.0',
1212

1313
params: {
1414
url: {
1515
type: 'string',
1616
required: true,
1717
visibility: 'user-or-llm',
18-
description: 'Amazon product URL (must contain /dp/)',
18+
description: 'Dataset input URL',
1919
},
2020
apiToken: {
2121
type: 'string',
@@ -31,11 +31,15 @@ export const datasetAmazonProductTool: ToolConfig<DatasetParams, DatasetResponse
3131
headers: () => ({
3232
'Content-Type': 'application/json',
3333
}),
34-
body: (params) => ({
35-
datasetId: 'gd_l7q7dkf244hwjntr0',
36-
url: params.url,
37-
apiToken: params.apiToken,
38-
}),
34+
body: (params) => {
35+
const body: Record<string, unknown> = {
36+
datasetId: 'gd_l7q7dkf244hwjntr0',
37+
apiToken: params.apiToken,
38+
url: params.url,
39+
}
40+
41+
return body
42+
},
3943
},
4044

4145
transformResponse: async (response: Response) => {
@@ -54,7 +58,7 @@ export const datasetAmazonProductTool: ToolConfig<DatasetParams, DatasetResponse
5458
outputs: {
5559
data: {
5660
type: 'object',
57-
description: 'Structured Amazon product data',
61+
description: 'Structured dataset response',
5862
},
5963
snapshot_at: {
6064
type: 'string',
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
import type { DatasetParams, DatasetResponse } from '@/tools/brightdata/types'
2+
import type { ToolConfig } from '@/tools/types'
3+
4+
/**
 * Bright Data Amazon Product Reviews dataset tool.
 *
 * Sends the input URL and API token, together with this tool's fixed dataset
 * id, as a JSON POST to the `/api/tools/brightdata/dataset` route and returns
 * the parsed JSON reply as the tool output.
 */
export const datasetAmazonProductReviewsTool: ToolConfig<DatasetParams, DatasetResponse> = {
  id: 'brightdata_dataset_amazon_product_reviews',
  name: 'Bright Data Amazon Product Reviews Dataset',
  description:
    'Quickly read structured amazon product review data.\nRequires a valid product URL with /dp/ in it.\nThis can be a cache lookup, so it can be more reliable than scraping',
  version: '1.0.0',

  params: {
    url: {
      type: 'string',
      required: true,
      visibility: 'user-or-llm',
      description: 'Dataset input URL',
    },
    apiToken: {
      type: 'string',
      required: true,
      visibility: 'user-only',
      description: 'Bright Data API token',
    },
  },

  request: {
    method: 'POST',
    url: '/api/tools/brightdata/dataset',
    headers: () => ({
      'Content-Type': 'application/json',
    }),
    /** Builds the JSON payload: the fixed dataset id plus the caller's token and URL. */
    body: (params) => ({
      datasetId: 'gd_le8e811kzy4ggddlq',
      apiToken: params.apiToken,
      url: params.url,
    }),
  },

  /**
   * Converts the HTTP reply into the tool result.
   *
   * @param response - Reply from the dataset route.
   * @returns `{ success: true, output }` where `output` is the parsed JSON body.
   * @throws Error with the body's `error` value (or a generic message) when the
   *   status is not OK, and an explicit error when an OK reply is not valid JSON.
   */
  transformResponse: async (response: Response) => {
    // A failing reply is not guaranteed to carry JSON (e.g. an HTML error page).
    // Tolerate the parse failure here so it cannot hide the HTTP failure below.
    // JSON never parses to `undefined`, so that value marks "could not parse".
    const data = await response.json().catch(() => undefined)

    if (!response.ok) {
      throw new Error(data?.error || 'Bright Data dataset fetch failed')
    }

    if (data === undefined) {
      throw new Error('Bright Data dataset returned an invalid JSON response')
    }

    return {
      success: true,
      output: data,
    }
  },

  outputs: {
    data: {
      type: 'object',
      description: 'Structured dataset response',
    },
    snapshot_at: {
      type: 'string',
      description: 'Timestamp of data snapshot',
      optional: true,
    },
  },
}

0 commit comments

Comments
 (0)