add basic download/transform implementations

artcz · artcz · commit fc44122072f6 · 2024-05-13T17:26:49.000+02:00
diff --git a/Dockerfile b/Dockerfile
@@ -9,4 +9,4 @@ COPY src/ ./src/
 COPY Makefile .
 
 
-CMD ["make", "update"]
+CMD ["make", "all"]
diff --git a/Makefile b/Makefile
@@ -10,9 +10,11 @@ deps/install:
 
 install: deps/install
 
-update:
-	mkdir -p data/
-	python src/save.py
-
 download:
 	cd src && python download.py
+
+transform:
+	cd src && python transform.py
+
+
+all: download transform
diff --git a/src/download.py b/src/download.py
@@ -21,8 +21,10 @@ class Config:
 base_url = f"https://pretalx.com/api/events/{Config.event}/"
 
 resources = [
-    "submissions",
-    "speakers",
+    # The `questions=all` parameter must be passed so that the answers are
+    # included in the same response, which saves us from joining them later.
+    "submissions?questions=all",
+    "speakers?questions=all",
 ]
 
 for resource in resources:
@@ -40,9 +42,9 @@ class Config:
         data = response.json()
         res0 += data["results"]
 
-    fnames = [
-        f"../data/raw/{Config.event}/{resource}_latest.json",
-    ]
-    for fname in fnames:
-        with open(fname, "w") as fd:
-            json.dump(res0, fd)
+    filename = resource.split("?")[0]  # To get rid of "?questions"
+    filename = f"{filename}_latest.json"
+    filepath = f"../data/raw/{Config.event}/{filename}"
+
+    with open(filepath, "w") as fd:
+        json.dump(res0, fd)
diff --git a/src/transform.py b/src/transform.py
@@ -0,0 +1,232 @@
+import json
+import os
+from collections import defaultdict
+from datetime import date, datetime, time, timedelta
+
+from pydantic import BaseModel
+from pydantic.class_validators import root_validator
+from slugify import slugify
+
+
+class SpeakerQuestion:
+    # Question texts that PretalxSpeaker.extract compares verbatim against
+    # PretalxAnswer.question_text to pick out the speaker's public answers.
+    affiliation = "Company / Organization / Educational Institution"
+    homepage = "Social (Homepage)"
+    twitter = "Social (X/Twitter)"
+    mastodon = "Social (Mastodon)"
+
+
+class SubmissionQuestion:
+    # Question texts that PretalxSubmission.extract compares verbatim against
+    # PretalxAnswer.question_text to fill the `outline` and `tweet` fields.
+    outline = "Outline"
+    tweet = "Abstract as a tweet / toot"
+
+
+class SubmissionState:
+    # Values compared against PretalxSubmission.state.
+    # NOTE(review): presumably these mirror Pretalx's own state strings --
+    # confirm against the API.
+    accepted = "accepted"
+    confirmed = "confirmed"
+    # Not referenced by any code visible in this file.
+    withdrawn = "withdrawn"
+
+
+class PretalxAnswer(BaseModel):
+    """A single answer to a Pretalx question, flattened from the raw payload.
+
+    The raw answer is expected to carry the keys ``question`` (holding a
+    nested, localised ``question`` text), ``answer``, ``submission`` and
+    ``person``.
+    """
+
+    question_text: str
+    answer_text: str
+    answer_file: str | None
+    submission_id: str | None
+    speaker_id: str | None
+
+    @root_validator(pre=True)
+    def extract(cls, values):
+        """Map the raw Pretalx keys onto this model's field names.
+
+        Runs before field validation. Raises ``KeyError`` when ``question``,
+        ``answer``, ``submission`` or ``person`` is missing from the payload.
+        """
+        # Only the English wording of the question is kept.
+        values["question_text"] = values["question"]["question"]["en"]
+        values["answer_text"] = values["answer"]
+        # "answer_file" already carries the field's name, so it needs no
+        # remapping; a missing key is left for pydantic's field validation to
+        # handle instead of raising KeyError here.
+        values["submission_id"] = values["submission"]
+        values["speaker_id"] = values["person"]
+        return values
+
+
+class PretalxSpeaker(BaseModel):
+    """A Pretalx speaker, with selected answers lifted into plain fields."""
+
+    code: str
+    name: str
+    biography: str | None
+    avatar: str | None
+    slug: str
+    answers: list[PretalxAnswer]
+    submissions: list[str]
+
+    # Populated by `extract` from the speaker's answers
+    affiliation: str | None = None
+    homepage: str | None = None
+    twitter: str | None = None
+    mastodon: str | None = None
+
+    @root_validator(pre=True)
+    def extract(cls, values):
+        """Derive the slug and copy the known answers into their fields.
+
+        Runs before field validation, on the raw speaker dictionary.
+        """
+        values["slug"] = slugify(values["name"])
+
+        # Question text -> name of the field that receives the answer.
+        #
+        # NOTE: in practice the format of the twitter data is different,
+        # depending on the speaker. We could fix this here by parsing the
+        # answer_text to some standardised format (either @handle or
+        # https://twitter.com/handle url, etc)
+        field_by_question = {
+            SpeakerQuestion.affiliation: "affiliation",
+            SpeakerQuestion.homepage: "homepage",
+            SpeakerQuestion.twitter: "twitter",
+            SpeakerQuestion.mastodon: "mastodon",
+        }
+
+        parsed_answers = [PretalxAnswer.parse_obj(raw) for raw in values["answers"]]
+
+        for parsed in parsed_answers:
+            target = field_by_question.get(parsed.question_text)
+            if target is not None:
+                values[target] = parsed.answer_text
+
+        # Drop every answer from the model itself: some of them might
+        # contain non-public information.
+        values["answers"] = []
+
+        return values
+
+
+class PretalxSubmission(BaseModel):
+    """A Pretalx submission, reduced to the fields this project publishes."""
+
+    code: str
+    title: str
+    speakers: list[str]  # We only want the code, not the full info
+    submission_type: str
+    slug: str
+    track: str | None
+    state: str
+    abstract: str
+    answers: list[PretalxAnswer]
+    tweet: str = ""
+    # NOTE(review): unlike `tweet`, `outline` has no default and is only set
+    # by `extract` when an "Outline" answer is present -- confirm that every
+    # submission is guaranteed to have one.
+    outline: str
+    duration: str
+
+    level: str = ""
+    delivery: str | None = ""
+
+    # This is embedding a slot inside a submission for easier lookup later
+    room: str | None = None
+    start: datetime | None = None
+    end: datetime | None = None
+
+    # TODO: once we have schedule data then we can prefill those in the code
+    # here
+    talks_in_parallel: list[str] | None = None
+    talks_after: list[str] | None = None
+    next_talk_code: str | None = None
+    prev_talk_code: str | None = None
+
+    website_url: str | None = None
+
+    @root_validator(pre=True)
+    def extract(cls, values):
+        """Normalise the raw submission dictionary before field validation.
+
+        Resolves localised names, reduces speakers to their codes, lifts the
+        known answers into fields, derives the slug and website URL, and
+        finally drops all answers.
+        """
+        # SubmissionType and Track have localised names. For this project we
+        # only care about their english versions, so we can extract them here.
+        # In 2024 some of those are localised, and some are not. Instead of
+        # figuring out why and fixing the data, there's this hack: only
+        # dictionaries are unpacked, None and plain strings are left alone.
+        for field in ("submission_type", "track"):
+            raw = values[field]
+            if isinstance(raw, dict):
+                values[field] = raw["en"]
+
+        # Keep one entry per speaker code, in the order the speakers were
+        # listed. A set would lose that order and could yield a different
+        # list on every run.
+        values["speakers"] = list(dict.fromkeys(s["code"] for s in values["speakers"]))
+
+        answers = [PretalxAnswer.parse_obj(ans) for ans in values["answers"]]
+
+        for answer in answers:
+            if answer.question_text == SubmissionQuestion.outline:
+                values["outline"] = answer.answer_text
+            if answer.question_text == SubmissionQuestion.tweet:
+                values["tweet"] = answer.answer_text
+
+            # TODO if we need any other questions
+
+        slug = slugify(values["title"])
+        values["slug"] = slug
+        values["website_url"] = f"https://ep2024.europython.eu/session/{slug}"
+
+        # Remove all the other answers
+        # This is important, because some answers might contain non-public
+        # information
+        values["answers"] = []
+
+        return values
+
+    @property
+    def is_accepted(self):
+        """True when the submission's state is "accepted"."""
+        return self.state == SubmissionState.accepted
+
+    @property
+    def is_confirmed(self):
+        """True when the submission's state is "confirmed"."""
+        return self.state == SubmissionState.confirmed
+
+    @property
+    def is_publishable(self):
+        """True when the submission is either accepted or confirmed."""
+        return self.is_accepted or self.is_confirmed
+
+
+def parse_submissions() -> list[PretalxSubmission]:
+    """
+    Load every submission from the raw Pretalx dump.
+
+    No filtering by state happens here; `publishable_submissions` returns the
+    accepted/confirmed subset. The path is relative to the current working
+    directory.
+    """
+    with open(
+        "../data/raw/europython-2024/submissions_latest.json", encoding="utf-8"
+    ) as fd:
+        raw_items = json.load(fd)
+
+    # The file is closed before the (potentially slow) model validation runs.
+    return [PretalxSubmission.parse_obj(item) for item in raw_items]
+
+
+def parse_speakers() -> list[PretalxSpeaker]:
+    """
+    Load every speaker from the raw Pretalx dump.
+
+    No filtering happens here; `publishable_speakers` returns only the
+    speakers attached to accepted proposals. The path is relative to the
+    current working directory.
+    """
+    with open(
+        "../data/raw/europython-2024/speakers_latest.json", encoding="utf-8"
+    ) as fd:
+        raw_items = json.load(fd)
+
+    # The file is closed before the (potentially slow) model validation runs.
+    return [PretalxSpeaker.parse_obj(item) for item in raw_items]
+
+
+def publishable_submissions() -> dict[str, PretalxSubmission]:
+    """Return the accepted or confirmed submissions, keyed by their code."""
+    by_code = {}
+    for submission in parse_submissions():
+        if not submission.is_publishable:
+            continue
+        by_code[submission.code] = submission
+    return by_code
+
+
+def publishable_speakers(accepted_proposals: set[str]) -> dict[str, PretalxSpeaker]:
+    """
+    Return the speakers that have at least one accepted proposal.
+
+    `accepted_proposals` holds the codes of the accepted submissions; any
+    iterable of codes (e.g. dictionary keys) is accepted. Each returned
+    speaker's `submissions` is narrowed down to the accepted codes, kept in
+    the order in which the speaker originally listed them. The result is
+    keyed by speaker code.
+    """
+    # Build the lookup set once so every membership test below is O(1).
+    allowed = set(accepted_proposals)
+
+    output = {}
+    for speaker in parse_speakers():
+        kept = [code for code in speaker.submissions if code in allowed]
+        if kept:
+            # Overwrite with only the accepted proposals
+            speaker.submissions = kept
+            output[speaker.code] = speaker
+
+    return output
+
+
+def main() -> None:
+    """Print summary counts and the publishable data, then check the slugs.
+
+    Raises ValueError when two publishable submissions share a slug.
+    """
+    # Only this entry point needs pretty-printing.
+    from pprint import pprint
+
+    print(len(parse_submissions()))
+    accepted = publishable_submissions()
+    print(len(accepted))
+
+    print(len(parse_speakers()))
+    # Computed once and reused for both the count and the dump below.
+    speakers = publishable_speakers(set(accepted))
+    print(len(speakers))
+
+    print(speakers)
+
+    pprint(accepted)
+
+    # Check if all the slugs are unique. This is an explicit check rather
+    # than an `assert`, which would be skipped when Python runs with -O.
+    seen = set()
+    duplicates = set()
+    for submission in accepted.values():
+        if submission.slug in seen:
+            duplicates.add(submission.slug)
+        seen.add(submission.slug)
+    if duplicates:
+        raise ValueError(f"Duplicate submission slugs: {sorted(duplicates)}")
+
+
+if __name__ == "__main__":
+    main()

Original file line number	Diff line number	Diff line change
`@@ -9,4 +9,4 @@ COPY src/ ./src/`
`9`	`9`	`COPY Makefile .`
`10`	`10`
`11`	`11`
`12`		`-CMD ["make", "update"]`
	`12`	`+CMD ["make", "all"]`