-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcustom_scraper.js
More file actions
84 lines (77 loc) · 3.13 KB
/
custom_scraper.js
File metadata and controls
84 lines (77 loc) · 3.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
const { CrawlingAPI } = require('crawlbase');
const cheerio = require('cheerio');
const fs = require('fs');
// Crawlbase "Normal requests" token — replace with your own Crawlbase token.
const NORMAL_TOKEN = '<Normal requests token>';
// Target page to scrape. Named TARGET_URL to avoid shadowing the global URL class.
const TARGET_URL = 'https://stackoverflow.com/questions/tagged/javascript';

// Initialize CrawlingAPI with the provided token
const api = new CrawlingAPI({ token: NORMAL_TOKEN });

// Fetch the page, parse it, then print and persist the result as
// pretty-printed JSON (2-space indent).
api
  .get(TARGET_URL)
  .then((response) => {
    const parsedData = getParsedData(response.body);
    const prettyJson = JSON.stringify(parsedData, null, 2); // fixed: missing semicolon
    console.log(prettyJson);
    fs.writeFileSync('sample-custom-scraper-output.json', prettyJson);
  })
  // Handle errors if the request fails
  .catch(console.error);
// Function to parse the HTML content and extract relevant information
/**
 * Parse the HTML of a Stack Overflow tag page and extract page-level
 * metadata plus a summary of every question listed on it.
 *
 * @param {string} html - Raw HTML of the questions page.
 * @returns {{title: string, description: string, totalQuestions: (string|number),
 *            questions: Array<Object>, currentPage: (string|number)}}
 *          Parsed page data; numeric-looking fields are kept as display strings.
 */
function getParsedData(html) {
  // Load HTML content with Cheerio
  const $ = cheerio.load(html);

  // Collapse whitespace runs and trim — shared by nearly every extracted field.
  // The `|| ''` guard keeps a missing node/attribute from throwing a TypeError.
  const clean = (text) => (text || '').replace(/\s+/g, ' ').trim();

  // Accumulator for everything scraped off the page.
  const parsedData = {
    title: '',
    description: '',
    totalQuestions: 0,
    questions: [],
    currentPage: 0,
  };

  // Page-level metadata. `.attr()` returns undefined when the tag is absent;
  // previously that crashed on `.replace(...)` — now it yields ''.
  parsedData['title'] = clean($('meta[name="twitter:title"]').attr('content'));
  parsedData['description'] = $('meta[name="twitter:description"]').attr('content') || '';
  parsedData['totalQuestions'] = clean($('div[data-controller="se-uql"] .fs-body3').text());
  parsedData['currentPage'] = clean(
    $('.s-pagination.float-left .s-pagination--item.is-selected').text(),
  );

  // Extract data for each question summary on the page.
  $('#questions .js-post-summary').each((_, element) => {
    const $el = $(element);
    const question = clean($el.find('.s-post-summary--content-title').text());
    const authorName = clean($el.find('.s-user-card--link').text());
    const link = $el.find('.s-link').attr('href') || ''; // guard: href may be absent
    const authorReputation = clean($el.find('.s-user-card--rep').text());
    const questionDescription = clean($el.find('.s-post-summary--content-excerpt').text());
    const time = clean($el.find('.s-user-card--time').text());
    const votes = clean(
      $el.find('.js-post-summary-stats .s-post-summary--stats-item:first-child').text(),
    );
    const answers =
      clean($el.find('.js-post-summary-stats .has-answers').text()) || '0 answers';
    const views = clean(
      $el.find('.js-post-summary-stats .s-post-summary--stats-item:last-child').text(),
    );
    // Fixed: `.text()` on the whole tag list concatenated names with no
    // separator (e.g. "javascriptarrays"); collect each tag individually.
    const tags = $el
      .find('.js-post-tag-list-item')
      .map((_, tag) => clean($(tag).text()))
      .get();

    // Push question data to the parsedData array
    parsedData['questions'].push({
      question,
      authorName,
      // Relative hrefs become absolute; already-absolute links pass through.
      link: link.includes('https://') ? link : `https://stackoverflow.com${link}`,
      authorReputation,
      questionDescription,
      time,
      votes,
      answers,
      views,
      tags,
    });
  });

  // Return the parsed data object
  return parsedData;
}