Skip to content

Commit fc44122

Browse files
committed
add basic download/transform implementations
1 parent e7e46a0 commit fc44122

File tree

4 files changed

+249
-13
lines changed

4 files changed

+249
-13
lines changed

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,4 @@ COPY src/ ./src/
99
COPY Makefile .
1010

1111

12-
CMD ["make", "update"]
12+
CMD ["make", "all"]

Makefile

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,11 @@ deps/install:
1010

1111
install: deps/install
1212

13-
update:
14-
mkdir -p data/
15-
python src/save.py
16-
1713
download:
1814
cd src && python download.py
15+
16+
transform:
17+
cd src && python transform.py
18+
19+
20+
all: download transform

src/download.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,10 @@ class Config:
2121
base_url = f"https://pretalx.com/api/events/{Config.event}/"
2222

2323
resources = [
24-
"submissions",
25-
"speakers",
24+
# The "questions=all" parameter is needed so that answers are included in
25+
# the same response, which saves us from joining the answers in later.
26+
"submissions?questions=all",
27+
"speakers?questions=all",
2628
]
2729

2830
for resource in resources:
@@ -40,9 +42,9 @@ class Config:
4042
data = response.json()
4143
res0 += data["results"]
4244

43-
fnames = [
44-
f"../data/raw/{Config.event}/{resource}_latest.json",
45-
]
46-
for fname in fnames:
47-
with open(fname, "w") as fd:
48-
json.dump(res0, fd)
45+
filename = resource.split("?")[0] # To get rid of "?questions"
46+
filename = f"{filename}_latest.json"
47+
filepath = f"../data/raw/{Config.event}/{filename}"
48+
49+
with open(filepath, "w") as fd:
50+
json.dump(res0, fd)

src/transform.py

Lines changed: 232 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,232 @@
1+
import json
2+
import os
3+
from collections import defaultdict
4+
from datetime import date, datetime, time, timedelta
5+
6+
from pydantic import BaseModel
7+
from pydantic.class_validators import root_validator
8+
from slugify import slugify
9+
10+
11+
class SpeakerQuestion:
    """English texts of the speaker questions whose answers we extract."""

    # Answers are matched against these strings by plain equality, so each
    # value has to be spelled exactly like the question text in the raw data.
    affiliation = "Company / Organization / Educational Institution"
    homepage = "Social (Homepage)"
    twitter = "Social (X/Twitter)"
    mastodon = "Social (Mastodon)"
16+
17+
18+
class SubmissionQuestion:
    """English texts of the submission questions whose answers we extract."""

    # Compared by plain equality against the question text of each answer.
    outline = "Outline"
    tweet = "Abstract as a tweet / toot"
21+
22+
23+
class SubmissionState:
    """Submission state values that the ``state`` field is compared against."""

    accepted = "accepted"
    confirmed = "confirmed"
    withdrawn = "withdrawn"
27+
28+
29+
class PretalxAnswer(BaseModel):
    """One answer to a question, flattened from the nested raw structure."""

    question_text: str
    answer_text: str
    answer_file: str | None
    submission_id: str | None
    speaker_id: str | None

    @root_validator(pre=True)
    def extract(cls, values):
        """Copy the raw keys onto the flat field names declared above.

        Runs before field validation and modifies ``values`` in place.
        """
        # The question text is nested and localised; only English is used.
        values["question_text"] = values["question"]["question"]["en"]

        # model field -> key in the raw answer
        source_key_by_field = {
            "answer_text": "answer",
            "answer_file": "answer_file",
            "submission_id": "submission",
            "speaker_id": "person",
        }
        for field_name, source_key in source_key_by_field.items():
            values[field_name] = values[source_key]

        return values
44+
45+
46+
class PretalxSpeaker(BaseModel):
47+
code: str
48+
name: str
49+
biography: str | None
50+
avatar: str | None
51+
slug: str
52+
answers: list[PretalxAnswer]
53+
submissions: list[str]
54+
55+
# Extracted
56+
affiliation: str | None = None
57+
homepage: str | None = None
58+
twitter: str | None = None
59+
mastodon: str | None = None
60+
61+
@root_validator(pre=True)
62+
def extract(cls, values):
63+
values["slug"] = slugify(values["name"])
64+
65+
answers = [PretalxAnswer.parse_obj(ans) for ans in values["answers"]]
66+
67+
for answer in answers:
68+
if answer.question_text == SpeakerQuestion.affiliation:
69+
values["affiliation"] = answer.answer_text
70+
71+
if answer.question_text == SpeakerQuestion.homepage:
72+
values["homepage"] = answer.answer_text
73+
74+
# NOTE: in practice the format of the data here is different,
75+
# depending on the speaker. We could fix this here by parsing the
76+
# the answer_text to some standardised format (either @handle or
77+
# https://twitter.com/handle url, etc)
78+
if answer.question_text == SpeakerQuestion.twitter:
79+
values["twitter"] = answer.answer_text
80+
81+
if answer.question_text == SpeakerQuestion.mastodon:
82+
values["mastodon"] = answer.answer_text
83+
84+
# Remove all the other answers
85+
# This is important, because some answers might contain non-public
86+
# information
87+
values["answers"] = []
88+
89+
return values
90+
91+
92+
class PretalxSubmission(BaseModel):
    """A submission, reduced to the fields this project works with.

    The ``extract`` pre-validator flattens localised names, reduces the
    speakers to their codes, copies selected answers into dedicated fields,
    derives ``slug`` and ``website_url`` from the title and then drops all
    answers.
    """

    code: str
    title: str
    speakers: list[str]  # We only want the code, not the full info
    submission_type: str
    slug: str
    track: str | None
    state: str
    abstract: str
    answers: list[PretalxAnswer]
    tweet: str = ""
    # NOTE(review): unlike ``tweet`` this field has no default, so a
    # submission without an "Outline" answer presumably fails validation --
    # confirm that this is intended.
    outline: str
    duration: str

    level: str = ""
    delivery: str | None = ""

    # This is embedding a slot inside a submission for easier lookup later
    room: str | None = None
    start: datetime | None = None
    end: datetime | None = None

    # TODO: once we have schedule data then we can prefill those in the code
    # here
    talks_in_parallel: list[str] | None = None
    talks_after: list[str] | None = None
    next_talk_code: str | None = None
    prev_talk_code: str | None = None

    website_url: str | None = None

    @root_validator(pre=True)
    def extract(cls, values):
        """Fill the derived fields from the raw data before validation.

        Modifies ``values`` in place and returns it.
        """
        # SubmissionType and Track have localised names. For this project we
        # only care about their english versions, so we can extract them here.
        #
        # In 2024 some of those are localised, and some are not. Instead of
        # figuring out why and fixing the data, there's this hack: only
        # dicts are unpacked, while None and plain values are left untouched.
        for field in ("submission_type", "track"):
            if isinstance(values[field], dict):
                values[field] = values[field]["en"]

        # Keep only the speaker codes, de-duplicated but in their original
        # order. A set would lose that order, and the order of the resulting
        # list would not be stable between runs.
        values["speakers"] = list(
            dict.fromkeys(speaker["code"] for speaker in values["speakers"])
        )

        answers = [PretalxAnswer.parse_obj(ans) for ans in values["answers"]]

        for answer in answers:
            if answer.question_text == SubmissionQuestion.outline:
                values["outline"] = answer.answer_text
            if answer.question_text == SubmissionQuestion.tweet:
                values["tweet"] = answer.answer_text

            # TODO if we need any other questions

        slug = slugify(values["title"])
        values["slug"] = slug
        values["website_url"] = f"https://ep2024.europython.eu/session/{slug}"

        # Remove all the other answers
        # This is important, because some answers might contain non-public
        # information
        values["answers"] = []

        return values

    @property
    def is_accepted(self):
        """True when the state equals ``SubmissionState.accepted``."""
        return self.state == SubmissionState.accepted

    @property
    def is_confirmed(self):
        """True when the state equals ``SubmissionState.confirmed``."""
        return self.state == SubmissionState.confirmed

    @property
    def is_publishable(self):
        """True when the submission is either accepted or confirmed."""
        return self.is_accepted or self.is_confirmed
171+
172+
173+
def parse_submissions(
    path: str = "../data/raw/europython-2024/submissions_latest.json",
) -> list[PretalxSubmission]:
    """Load and parse every submission from the raw JSON file.

    No filtering by state happens here: all entries of the file are
    returned. Use ``publishable_submissions`` to get only the accepted or
    confirmed ones.

    Args:
        path: Location of the JSON file, which must contain a list of raw
            submission objects. Relative paths are resolved against the
            current working directory.

    Returns:
        One ``PretalxSubmission`` per entry, in file order.
    """
    with open(path, encoding="utf-8") as fd:
        raw_items = json.load(fd)

    return [PretalxSubmission.parse_obj(item) for item in raw_items]
185+
186+
187+
def parse_speakers(
    path: str = "../data/raw/europython-2024/speakers_latest.json",
) -> list[PretalxSpeaker]:
    """Load and parse every speaker from the raw JSON file.

    No filtering happens here: all entries of the file are returned. Use
    ``publishable_speakers`` to keep only speakers with publishable
    submissions.

    Args:
        path: Location of the JSON file, which must contain a list of raw
            speaker objects. Relative paths are resolved against the
            current working directory.

    Returns:
        One ``PretalxSpeaker`` per entry, in file order.
    """
    with open(path, encoding="utf-8") as fd:
        raw_items = json.load(fd)

    return [PretalxSpeaker.parse_obj(item) for item in raw_items]
199+
200+
201+
def publishable_submissions() -> dict[str, PretalxSubmission]:
    """Return the accepted or confirmed submissions, keyed by their code."""
    by_code = {}
    for submission in parse_submissions():
        if submission.is_publishable:
            by_code[submission.code] = submission
    return by_code
203+
204+
205+
def publishable_speakers(accepted_proposals: set[str]) -> dict[str, PretalxSpeaker]:
    """Return the speakers that have at least one accepted proposal.

    Args:
        accepted_proposals: Codes of the submissions that may be published.
            Only membership tests are performed on it, so a ``dict.keys()``
            view works as well as a set.

    Returns:
        Speakers keyed by their code. Each speaker's ``submissions`` is
        overwritten so that it lists only the accepted proposals.
    """
    output = {}
    for speaker in parse_speakers():
        # De-duplicate and filter while keeping the speaker's original
        # order; going through a set intersection would leave the codes in
        # an arbitrary order.
        accepted = [
            code
            for code in dict.fromkeys(speaker.submissions)
            if code in accepted_proposals
        ]
        if accepted:
            # Overwrite with only the accepted proposals
            speaker.submissions = accepted
            output[speaker.code] = speaker

    return output
216+
217+
218+
from pprint import pprint

# Ad-hoc report, executed whenever this module runs: total vs. publishable
# counts for submissions and speakers, followed by a dump of the data.
print(len(parse_submissions()))
accepted = publishable_submissions()
print(len(accepted))

print(len(parse_speakers()))
print(len(publishable_speakers(accepted.keys())))

print(publishable_speakers(accepted.keys()))

pprint(accepted)

# Check if all the slugs are unique
unique_slugs = {submission.slug for submission in accepted.values()}
assert len(unique_slugs) == len(accepted)

0 commit comments

Comments
 (0)