-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcustom_scraper.js
More file actions
84 lines (77 loc) · 3.13 KB
/
custom_scraper.js
File metadata and controls
84 lines (77 loc) · 3.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
const { CrawlingAPI } = require('crawlbase');
const cheerio = require('cheerio');
const fs = require('fs');
// Crawlbase "Normal requests" token — replace with your own Crawlbase token.
const NORMAL_TOKEN = '<Normal requests token>';
// Target page to scrape. Named TARGET_URL to avoid shadowing the global URL class.
const TARGET_URL = 'https://stackoverflow.com/questions/tagged/javascript';

// Initialize CrawlingAPI with the provided token
const api = new CrawlingAPI({ token: NORMAL_TOKEN });

// Fetch the page, parse it, then print and persist the result as
// pretty-printed JSON (2-space indent).
api
  .get(TARGET_URL)
  .then((response) => {
    const parsedData = getParsedData(response.body);
    const prettyJson = JSON.stringify(parsedData, null, 2); // fixed: missing semicolon
    console.log(prettyJson);
    fs.writeFileSync('sample-custom-scraper-output.json', prettyJson);
  })
  // Handle errors if the request fails
  .catch(console.error);
// Function to parse the HTML content and extract relevant information
/**
 * Parse the HTML of a Stack Overflow tag page and extract page-level
 * metadata plus a summary of every question listed on it.
 *
 * @param {string} html - Raw HTML of the questions page.
 * @returns {{title: string, description: string, totalQuestions: (string|number),
 *            questions: Array<Object>, currentPage: (string|number)}}
 *          Parsed page data; numeric-looking fields are kept as display strings.
 */
function getParsedData(html) {
  // Load HTML content with Cheerio
  const $ = cheerio.load(html);

  // Collapse whitespace runs and trim — shared by nearly every extracted field.
  // The `|| ''` guard keeps a missing node/attribute from throwing a TypeError.
  const clean = (text) => (text || '').replace(/\s+/g, ' ').trim();

  // Accumulator for everything scraped off the page.
  const parsedData = {
    title: '',
    description: '',
    totalQuestions: 0,
    questions: [],
    currentPage: 0,
  };

  // Page-level metadata. `.attr()` returns undefined when the tag is absent;
  // previously that crashed on `.replace(...)` — now it yields ''.
  parsedData['title'] = clean($('meta[name="twitter:title"]').attr('content'));
  parsedData['description'] = $('meta[name="twitter:description"]').attr('content') || '';
  parsedData['totalQuestions'] = clean($('div[data-controller="se-uql"] .fs-body3').text());
  parsedData['currentPage'] = clean(
    $('.s-pagination.float-left .s-pagination--item.is-selected').text(),
  );

  // Extract data for each question summary on the page.
  $('#questions .js-post-summary').each((_, element) => {
    const $el = $(element);
    const question = clean($el.find('.s-post-summary--content-title').text());
    const authorName = clean($el.find('.s-user-card--link').text());
    const link = $el.find('.s-link').attr('href') || ''; // guard: href may be absent
    const authorReputation = clean($el.find('.s-user-card--rep').text());
    const questionDescription = clean($el.find('.s-post-summary--content-excerpt').text());
    const time = clean($el.find('.s-user-card--time').text());
    const votes = clean(
      $el.find('.js-post-summary-stats .s-post-summary--stats-item:first-child').text(),
    );
    const answers =
      clean($el.find('.js-post-summary-stats .has-answers').text()) || '0 answers';
    const views = clean(
      $el.find('.js-post-summary-stats .s-post-summary--stats-item:last-child').text(),
    );
    // Fixed: `.text()` on the whole tag list concatenated names with no
    // separator (e.g. "javascriptarrays"); collect each tag individually.
    const tags = $el
      .find('.js-post-tag-list-item')
      .map((_, tag) => clean($(tag).text()))
      .get();

    // Push question data to the parsedData array
    parsedData['questions'].push({
      question,
      authorName,
      // Relative hrefs become absolute; already-absolute links pass through.
      link: link.includes('https://') ? link : `https://stackoverflow.com${link}`,
      authorReputation,
      questionDescription,
      time,
      votes,
      answers,
      views,
      tags,
    });
  });

  // Return the parsed data object
  return parsedData;
}