# Method of the Collection class (pipeline/src/collection.py).
def sort_nodes_for_upload(self):
    """
    Return a list of nodes, sorted so that they can be uploaded to a graph database safely,
    i.e., child nodes will be saved before their parents.

    The upload code is assumed to generate @ids and update the Python instances accordingly.

    The result is deterministic: among nodes that become ready in the same pass,
    the order of `self.nodes` is kept.

    Raises:
        ValueError: if some nodes can never be placed, because their links form
            a cycle (including a node linking to itself) or point to an id that
            is not a key of `self.nodes`. Without this check the sort would
            loop forever.
    """
    nodes = self.nodes
    # Ids of the children (downstream links) of every node, computed once.
    # NOTE(review): assumes every item in `.links` exposes an `id` attribute
    # matching the keys of `self.nodes` - confirm against the Node class.
    pending = {
        node_id: {child.id for child in node.links}
        for node_id, node in nodes.items()
    }
    ordered_ids = []
    placed = set()  # same content as `ordered_ids`, for O(1) membership tests
    while pending:
        made_progress = False
        # Iterate over a snapshot because entries are deleted during the pass.
        for node_id in list(pending):
            # A node is ready once all of its children have been placed;
            # nodes without links are ready immediately (empty set).
            if pending[node_id] <= placed:
                ordered_ids.append(node_id)
                placed.add(node_id)
                del pending[node_id]
                made_progress = True
        if not made_progress:
            # No remaining node has all its children placed: another pass
            # would not change anything, so stop instead of spinning.
            blocked = ", ".join(str(node_id) for node_id in pending)
            raise ValueError(
                "Cannot sort nodes for upload: cyclic links, or links to nodes "
                f"that are not in the collection, involving: {blocked}"
            )
    return [nodes[node_id] for node_id in ordered_ids]