"""
Purpose:
Scrape paid subscription articles from a Substack newsletter, saving both HTML and Markdown versions.
You must edit BASE_URL and SITEMAP_STRING below to match your target newsletter.
Instructions:
- Set BASE_URL to the newsletter's main URL (e.g., "https://newsletter.eng-leadership.com")
- Set SITEMAP_STRING to the sitemap path (e.g., "/sitemap.xml")
- Use --paid flag to enable scraping paid content (manual login required)
"""
import argparse
import json
import os
from time import sleep

import requests
import markdownify
from bs4 import BeautifulSoup  # the "xml" and "lxml" parsers below also require lxml to be installed
from selenium import webdriver
BASE_URL = "https://newsletter.eng-leadership.com" # Change to your newsletter base URL
SITEMAP_STRING = "/sitemap.xml" # Change if your sitemap path is different
SITEMAP_URL = BASE_URL + SITEMAP_STRING
OUTPUT_FILE = "articles.json"
def selenium_login():
    """Open a Chrome window so the user can log in to Substack manually, then return the driver."""
    driver = webdriver.Chrome()
    driver.get("https://substack.com/sign-in")
    input("After you have logged in and see your account, press Enter here to continue...")
    print("Continuing with scraping...")
    return driver
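# Optional sketch (an assumption, not part of the original flow): persist the
# session cookies after one manual login so repeat runs can skip the prompt.
# Substack may still expire the session, so treat this as a convenience only.
def selenium_login_with_cookies(cookie_path="substack_cookies.pkl"):
    import pickle
    driver = webdriver.Chrome()
    driver.get("https://substack.com")  # must be on the domain before add_cookie()
    if os.path.exists(cookie_path):
        with open(cookie_path, "rb") as f:
            for cookie in pickle.load(f):
                driver.add_cookie(cookie)  # replay the saved session cookies
        driver.get("https://substack.com")  # reload so the cookies take effect
    else:
        driver.get("https://substack.com/sign-in")
        input("After you have logged in, press Enter here to save your cookies...")
        with open(cookie_path, "wb") as f:
            pickle.dump(driver.get_cookies(), f)
    return driver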
def get_article_urls_and_lastmod(sitemap_url):
    """Parse the sitemap and return (list of URLs, {url: lastmod} map)."""
    resp = requests.get(sitemap_url, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content, "xml")
    url_to_lastmod = {}
    urls = []
    for url_tag in soup.find_all("url"):
        loc = url_tag.find("loc")
        lastmod = url_tag.find("lastmod")
        if loc:
            url_text = loc.text
            urls.append(url_text)
            # lastmod is optional in the sitemap spec; fall back to an empty string
            url_to_lastmod[url_text] = lastmod.text if lastmod else ""
    return urls, url_to_lastmod
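# Optional sketch: Substack sitemaps typically list non-post pages (the home
# page, /about, /archive) alongside posts, and posts live under /p/. That URL
# scheme is an assumption; verify it against your own sitemap before filtering.
def filter_post_urls(urls):
    return [u for u in urls if "/p/" in u]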
def extract_article_html_and_md(soup):
    """Return (html, markdown) for the article body, or (None, None) if it is missing."""
    # Substack renders the readable post body inside div.available-content
    article = soup.find("div", class_="available-content")
    if article is None:  # paywalled without login, or an unexpected page layout
        return None, None
    html_content = str(article)
    markdown_content = markdownify.markdownify(html_content, heading_style="ATX")
    return html_content, markdown_content
def scrape_article_selenium(driver, url):
    """Load the page in the logged-in browser so paid content is rendered."""
    driver.get(url)
    sleep(0.3)  # brief pause for client-side rendering; increase if pages come back empty
    soup = BeautifulSoup(driver.page_source, "lxml")
    return extract_article_html_and_md(soup)
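# Optional sketch: a fixed sleep is fragile on slow connections. Selenium's
# explicit waits block until the article container actually appears (the
# 10-second ceiling here is an assumed default), which is usually more
# reliable than sleep().
def scrape_article_selenium_waiting(driver, url, timeout=10):
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    driver.get(url)
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CLASS_NAME, "available-content"))
    )
    soup = BeautifulSoup(driver.page_source, "lxml")
    return extract_article_html_and_md(soup)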
def scrape_article_requests(url):
    """Fetch the page anonymously; only free (non-paywalled) content is visible."""
    resp = requests.get(url, timeout=30)
    soup = BeautifulSoup(resp.content, "lxml")
    return extract_article_html_and_md(soup)
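# Optional sketch: when scraping hundreds of posts it is polite (and less
# likely to trip rate limiting) to pause between requests and retry transient
# failures. The delay and retry counts are assumptions; tune them as needed.
def scrape_article_requests_politely(url, retries=3, delay=1.0):
    for attempt in range(retries):
        try:
            resp = requests.get(url, timeout=30)
            resp.raise_for_status()
            sleep(delay)  # spacing between successive requests
            return extract_article_html_and_md(BeautifulSoup(resp.content, "lxml"))
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            sleep(delay * (attempt + 1))  # simple linear backoff before retrying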
def main():
parser = argparse.ArgumentParser(description="Substack scraper")
parser.add_argument("--paid", action="store_true", help="Enable scraping paid content (manual login required)")
args = parser.parse_args()
driver = None
if args.paid:
print("Paid mode enabled. Manual login required.")
driver = selenium_login()
else:
print("Paid mode not enabled. Scraping free content only.")
print("Fetching sitemap...")
urls, url_to_lastmod = get_article_urls_and_lastmod(SITEMAP_URL)
print(f"Found {len(urls)} articles.")
with open("urls.txt", "w") as url_file:
for url in urls:
url_file.write(url + "\n")
print(f"Saved URLs to urls.txt")
    # Create folders for the HTML and Markdown output, clearing stale files first
    html_dir = "html_files"
    md_dir = "md_files"
    for folder in [html_dir, md_dir]:
        if os.path.exists(folder):
            for filename in os.listdir(folder):
                file_path = os.path.join(folder, filename)
                if os.path.isfile(file_path):
                    os.remove(file_path)
        else:
            os.makedirs(folder, exist_ok=True)
results = []
    # for url in urls[:5]:  # to test on fewer articles
for url in urls:
print(f"Scraping {url}")
if args.paid:
html, md = scrape_article_selenium(driver, url)
else:
html, md = scrape_article_requests(url)
        if html and md:
            # Prefix the filename with the sitemap lastmod date (YYYY-MM-DD) when available
            lastmod = url_to_lastmod.get(url, "")
            date_part = lastmod.split("T")[0] if lastmod else ""
            base_name = url.rstrip("/").split("/")[-1]
            if date_part:
                base_name = f"{date_part}_{base_name}"
            html_path = os.path.join(html_dir, base_name + ".html")
            md_path = os.path.join(md_dir, base_name + ".md")
            with open(html_path, "w", encoding="utf-8") as f_html:
                f_html.write(html)
            with open(md_path, "w", encoding="utf-8") as f_md:
                f_md.write(md)
            results.append({"url": url, "html_file": html_path, "md_file": md_path})
        else:
            print(f"  No article body found at {url}; skipping.")
with open(OUTPUT_FILE, "w") as f:
json.dump(results, f, indent=2)
print(f"Saved {len(results)} articles to {OUTPUT_FILE}")
if driver:
driver.quit()
if __name__ == "__main__":
main()
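
# Usage (free posts only):
#   python substack_scraper.py
# Usage (paid posts; opens a Chrome window for manual login):
#   python substack_scraper.py --paid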