diff --git a/content/WhatIsAnARG_module.py b/content/WhatIsAnARG_module.py
new file mode 100644
index 0000000..8831c95
--- /dev/null
+++ b/content/WhatIsAnARG_module.py
@@ -0,0 +1,179 @@
+import collections
+import itertools
+import sys
+
+import numpy as np
+from IPython.display import HTML
+
+def remove_edges(ts, edge_id_remove_list):
+ edges_to_remove_by_child = collections.defaultdict(list)
+ edge_id_remove_list = set(edge_id_remove_list)
+ for m in ts.mutations():
+ if m.edge in edge_id_remove_list:
+ # If we remove this edge, we will remove the associated mutation
+ # as the child node won't have ancestral material in this region.
+ # So we force the user to explicitly (re)move the mutations beforehand
+ raise ValueError("Cannot remove edges that have associated mutations")
+ for remove_edge in edge_id_remove_list:
+ e = ts.edge(remove_edge)
+ edges_to_remove_by_child[e.child].append(e)
+
+ # sort left-to-right for each child
+ for k, v in edges_to_remove_by_child.items():
+ edges_to_remove_by_child[k] = sorted(v, key=lambda e: e.left)
+ # check no overlaps
+ for e1, e2 in zip(edges_to_remove_by_child[k], edges_to_remove_by_child[k][1:]):
+ assert e1.right <= e2.left
+
+ # Sanity check: this means the topmost node will deal with modified edges left at the end
+ assert ts.edge(-1).parent not in edges_to_remove_by_child
+
+ new_edges = collections.defaultdict(list)
+ tables = ts.dump_tables()
+ tables.edges.clear()
+ samples = set(ts.samples())
+ # Edges are sorted by parent time, youngest first, so we can iterate over
+ # nodes-as-parents visiting children before parents by using itertools.groupby
+ for parent_id, ts_edges in itertools.groupby(ts.edges(), lambda e: e.parent):
+ # Iterate through the ts edges *plus* the polytomy edges we created in previous steps.
+ # This allows us to re-edit polytomy edges when the edges_to_remove are stacked
+ edges = list(ts_edges)
+ if parent_id in new_edges:
+ edges += new_edges.pop(parent_id)
+ if parent_id in edges_to_remove_by_child:
+ for e in edges:
+ assert parent_id == e.parent
+ l = -1
+ if e.id in edge_id_remove_list:
+ continue
+ # NB: we go left to right along the target edges, reducing edge e as required
+ for target_edge in edges_to_remove_by_child[parent_id]:
+ # As we go along the target_edges, gradually split e into chunks.
+ # If edge e is in the target_edge region, change the edge parent
+ assert target_edge.left > l
+ l = target_edge.left
+ if e.left >= target_edge.right:
+ # This target edge is entirely to the LHS of edge e, with no overlap
+ continue
+ elif e.right <= target_edge.left:
+ # This target edge is entirely to the RHS of edge e with no overlap.
+ # Since target edges are sorted by left coord, all other target edges
+ # are to RHS too, and we are finished dealing with edge e
+ tables.edges.append(e)
+ e = None
+ break
+ else:
+ # Edge e must overlap with current target edge somehow
+ if e.left < target_edge.left:
+ # Edge had region to LHS of target
+ # Add the left hand section (change the edge right coord)
+ tables.edges.add_row(left=e.left, right=target_edge.left, parent=e.parent, child=e.child)
+ e = e.replace(left=target_edge.left)
+ if e.right > target_edge.right:
+ # Edge continues after RHS of target
+ assert e.left < target_edge.right
+ new_edges[target_edge.parent].append(
+ e.replace(right=target_edge.right, parent=target_edge.parent)
+ )
+ e = e.replace(left=target_edge.right)
+ else:
+ # No more of edge e to RHS
+ assert e.left < e.right
+ new_edges[target_edge.parent].append(e.replace(parent=target_edge.parent))
+ e = None
+ break
+ if e is not None:
+ # Need to add any remaining regions of edge back in
+ tables.edges.append(e)
+ else:
+ # NB: sanity check at top means that the oldest node will have no edges above,
+ # so the last iteration should hit this branch
+ for e in edges:
+ if e.id not in edge_id_remove_list:
+ tables.edges.append(e)
+ assert len(new_edges) == 0
+ tables.sort()
+ return tables.tree_sequence()
+
+def unsupported_edges(ts, per_interval=False):
+ """
+ Return the internal edges that are unsupported by a mutation.
+ If ``per_interval`` is True, each interval needs to be supported,
+ otherwise, a mutation on an edge (even if there are multiple intervals
+ per edge) will result in all intervals on that edge being treated
+ as supported.
+ """
+ edges_to_remove = np.ones(ts.num_edges, dtype="bool")
+ edges_to_remove[[m.edge for m in ts.mutations()]] = False
+ # We don't remove edges above samples
+ edges_to_remove[np.isin(ts.edges_child, ts.samples())] = False
+
+ if per_interval:
+ return np.where(edges_to_remove)[0]
+ else:
+ keep = (edges_to_remove == False)
+ for p, c in zip(ts.edges_parent[keep], ts.edges_child[keep]):
+ edges_to_remove[np.logical_and(ts.edges_parent == p, ts.edges_child == c)] = False
+ return np.where(edges_to_remove)[0]
+
+
+class Workbook:
+ @staticmethod
+ def setup():
+ display(HTML(
+ "" +
+ "
✅ Your notebook is ready to go!
" +
+ ("This notebook is not running in JupyterLite: you may need to install tskit, tszip, etc."
+ if sys.platform != 'emscripten' else '''
+To clear the notebook and reset,
+select "Clear Browser Data" from the JupyterLite help menu.
+''')
+ ))
+
+ def node_coalescence_status(arg):
+ """
+ Uses the num_children_array attribute to find nodes that represent local coalescence.
+ See https://tskit.dev/tskit/docs/latest/python-api.html#tskit.Tree.num_children_array
+ Returns an array of length num_nodes containing 0 if a node never has any coalescent
+ segments, 1 if some segments of the node are coalescent and some unary, and 2 if
+ all node segments represent a local coalescence point.
+ """
+ has_unary = np.zeros(arg.num_nodes, dtype=int)
+ has_coal = np.zeros(arg.num_nodes, dtype=int)
+ tree = arg.first()
+ nca = tree.num_children_array
+ for edge_diffs in arg.edge_diffs():
+ for e in edge_diffs.edges_out:
+ if nca[e.parent] == 0:
+ has_unary[e.parent] = 1
+ elif nca[e.parent] > 1:
+ has_coal[e.parent] = 1
+ for e in edge_diffs.edges_in:
+ if nca[e.parent] == 0:
+ has_unary[e.parent] = 1
+ elif nca[e.parent] > 1:
+ has_coal[e.parent] = 1
+ tree.next()
+ return np.where(has_coal, np.where(has_unary, 1, 2), 0)
+
+ def remove_unsupported_edges(ts, per_interval=True):
+ """
+ Remove edges from the tree sequence that are unsupported by mutations.
+ If ``per_interval`` is True, each interval needs to be supported,
+ otherwise, a mutation on an edge (even if there are multiple intervals
+ per edge) will result in all intervals on that edge being treated
+ as supported.
+ """
+ edges_to_remove = unsupported_edges(ts, per_interval=per_interval)
+ tables = remove_edges(ts, edges_to_remove).dump_tables()
+ tables.edges.squash()
+ return tables.tree_sequence()
+
+class Workbook1(Workbook):
+ url = "json/WhatIsAnARG/Workbook1/" # could also put a full URL here, but a local one will work offline too
+
+class Workbook2(Workbook):
+ url = "json/WhatIsAnARG/Workbook2/" # could also put a full URL here, but a local one will work offline too
diff --git a/content/WhatIsAnARG_workbook1.ipynb b/content/WhatIsAnARG_workbook1.ipynb
new file mode 100644
index 0000000..b34cc59
--- /dev/null
+++ b/content/WhatIsAnARG_workbook1.ipynb
@@ -0,0 +1,1041 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d839c4bf-ceff-4e60-b93b-c87ad7667ef3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "\n",
+ "if sys.platform != 'emscripten':\n",
+ " # allow the notebook to be downloaded and run locally too\n",
+ " print(\"This notebook is not running in JupyterLite: you may need to install tskit, tszip, etc.\")\n",
+ "else:\n",
+ " import micropip\n",
+ " await micropip.install('jupyterquiz')\n",
+ " await micropip.install('tskit_arg_visualizer')\n",
+ "\n",
+ "from jupyterquiz import display_quiz\n",
+ "from WhatIsAnARG_module import Workbook1\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "404f5dec-3289-4ef5-b532-ae0b2eff15f3",
+ "metadata": {},
+ "source": [
+ "# ARGs and _tskit_: an introduction\n",
+ "\n",
+ "### Background reading and skills required\n",
+ "\n",
+ "ARGs provide a principled way of thinking about genetic inheritance with recombination: i.e. recombinant phylogenies. Some recent reviews are [Lewanski at al. (2024)](https://doi.org/10.1371/journal.pgen.1011110) and [Nielsen et al. (2024)](https://doi.org/10.1038/s41576-024-00772-4). To complete this workbook you should be comfortable with basic programming in [Python](https://www.python.org) (including e.g. [list comprehensions](https://docs.python.org/3/tutorial/datastructures.html#list-comprehensions) and [dictionaries / dict comprehensions](https://docs.python.org/3/tutorial/datastructures.html#dictionaries)). The workbook also makes substantial use of the \"numerical Python\" library, [`numpy`](https://numpy.org): there is a quickstart tutorial [here](https://numpy.org/devdocs/user/quickstart.html). It may also help if you have some familiarity with the basics of the [Tree Sequence Toolkit (_tskit_)](https://tskit.dev/genetics-research/), and its Python interface, by reading https://tskit.dev/tutorials/what_is.html, https://tskit.dev/tutorials/args.html and https://tskit.dev/tutorials/no_mutations.html.\n",
+ "\n",
+ "### Overview\n",
+ "This is the first of 2 workbooks: each is intended to take about 1½-2 hours to complete.\n",
+ "\n",
+ "\n",
+ "- Workbook 1
\n",
+ " - Graphs and local trees, mutations, genetic variation and statistical calculations: exercises A-J & questions 1-11
\n",
+ "- Workbook 2
\n",
+ " - Types of ARG, simplification, and ARG simulation: exercises K-U & questions 12-30
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "521cd35b-a4b5-40c8-981c-652aa621832d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "Workbook1.setup()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fa72ab24-0871-4111-9d3e-6a8c156f1b41",
+ "metadata": {},
+ "source": [
+ "# Workbook 1: ARGs, local trees, and genetic variation\n",
+ "\n",
+ "## _Tskit_ basics\n",
+ "\n",
+ "The _tskit_ library allows you to store and analyse ARGs in the *tree sequence* format. This is the library that underlies simulators such as _msprime_ and _SLiM_, and inference tools such as _tsinfer_, _tsdate_, and _SINGER_. The library is extensive and well-documented at https://tskit.dev/tskit/docs/stable/. We will use the [Python interface](https://tskit.dev/tskit/docs/stable/python-api.html), but there are also C and Rust inferfaces, and it is possible to use _tskit_ from [within R]() as well.\n",
+ "\n",
+ "First, we load a small simulated ARG in tree sequence format. This is a \"full ARG\" which describes the ancestry of a set of 10 sampled genomes and includes nodes that explicitly record recombination events (so-called \"recombination nodes\").\n",
+ "\n",
+ "It is conventional to store such genealogies in a variable called `arg` or `ts`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fdb6e2f4-c0f7-4bf1-a129-9ea9fa4eb57b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import tskit\n",
+ "arg = tskit.load(\"data/example_ARG.trees\") # it is conventional to use `arg` or `ts` as the standard variable\n",
+ "print(\"A tskit ARG has been loaded into the variable 'arg'\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "86833468-8d09-42e4-9797-7132f0f7af18",
+ "metadata": {},
+ "source": [
+ "It is important to distinguish the *sample genomes* in this ARG. Their numerical IDs can be obtained using the `.samples()` method. Often these will be sequential, starting from zero. However, this need not always be the case so it's always best to check:\n",
+ "Note: For those coming from R, tskit uses the Python convention of indexing from 0 rather than from 1. So the first node has ID 0, not ID 1
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "78dc246a-73a7-4571-b846-247488810954",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(\"The sampled genomes have IDs:\", arg.samples())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dfdf625f-7ad6-485a-aa77-1b865eb7e30d",
+ "metadata": {},
+ "source": [
+ "## ARGs and local trees\n",
+ "\n",
+ "The [_tskit_arg_visualizer_](https://github.com/kitchensjn/tskit_arg_visualizer) software uses the [D3js library](https://d3js.org) to visualise ARGs and other tree sequences interactively, in a browser or Jupyter notebook. As is conventional, the oldest nodes are drawn at the top, with the youngest, usually at time 0, at the bottom.\n",
+ "\n",
+ "It works by creating a new [`D3ARG`](https://github.com/kitchensjn/tskit_arg_visualizer/blob/main/docs/tutorial.md#what-is-a-d3arg) object from the _tskit_ ARG. This `D3ARG` object can then be plotted using `.draw()`.\n",
+ "\n",
+ "Note: You'll see that some nodes in this plot have two IDs. Don't worry about this: as we'll see later it's because the simulator has represented recombination using 2 nodes, which have been overlaid in the visualizer
\n",
+ "\n",
+ "- Exercise A
\n",
+ " - Try tidying-up the plot below by dragging nodes horizontally. You can save the result if you want via the buttons above the graph.
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7b7b2e45-9518-4bbd-a426-7b4eb7bc2557",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import tskit_arg_visualizer as argviz\n",
+ "\n",
+ "d3arg = argviz.D3ARG.from_ts(arg) # by convention, the argviz object has \"d3\" prepended to the original name\n",
+ "d3arg.set_node_styles({u: {\"symbol\": \"d3.symbolSquare\"} for u in arg.samples()}) # Set some viz styles\n",
+ "d3arg.draw(edge_type=\"ortho\", width=800); # draw the D3ARG in the notebook: the semicolon hides the return value of the draw() method"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f6473209-6dce-45f6-91b6-a0e2fc96802c",
+ "metadata": {},
+ "source": [
+ "## The importance of sample nodes\n",
+ "\n",
+ "Each node in the graph above represents a genome. The **square** nodes representing *sampled genomes* (or \"samples\") and the round \"nonsample\" nodes represent *ancestral genomes*. Importantly, ARGs only represent the ancestry of the sample nodes. For example, ancestor 35 would have passed on a whole genome's worth of DNA to node 34, but if you hover over the edge that joins them, you well see that only part of the genome eventually made it into the samples.\n",
+ "\n",
+ "This means that even if we have the whole genomes of all the sample nodes, the known regions of ancestral genome may not cover the whole sequence length. To put it another way, ancestral genomes may only be partially knowable.\n",
+ "\n",
+ "- Exercise B
\n",
+ " - Hover over a few of the lines in the graph above to see which inherited regions are inherited along different edges. Then try adding
variable_edge_width=True to the .draw() call, to display the widths of edges as proportional to their span. Does this help show the routes through which the majority of the genome has travelled? \n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "134503fa-ba13-44c3-8ac7-957741e1d7ee",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook1.url + \"Q1.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "39bc5010-1fc4-4830-ad01-6e67f88bb7fe",
+ "metadata": {},
+ "source": [
+ "## SPR operations\n",
+ "\n",
+ "Each left/right transition from one tree to another represents a *recombination breakpoint* where forward in time, two lineages (one from the maternal and one from the paternal genome) were combined via meiosis. Looking backwards in time, this results in one genome (node) having two parents.\n",
+ "\n",
+ "If we know the recombination nodes, it is easy to see how one tree changes into another along the genome. A good approximation is that the tree on one side of a breakpoint can be transformed into the tree on the other side of the breakpoint via a single tree-editing operation, or SPR. \n",
+ "\n",
+ "- Exercise C
\n",
+ " - Move your pointer over the genome bar underneath the graph, and try to get a feel for how one local tree (which will be highlighted in green, embedded in the larger ARG) is transformed into the next one.
\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "57a422ed-ca10-4d8b-a8cb-22b30dbe7ac1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook1.url + \"Q2.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d1f37537-2dd1-4a2b-93c9-d9ddd651746b",
+ "metadata": {},
+ "source": [
+ "## Left-right ARG traversal: the `.trees()` iterator\n",
+ "\n",
+ "A fundamental ARG operation, which forms the basis of operations such as calculating genetic statistics, is to move along the genome, obtaining the local trees at each point. In _tskit_ this is done using the [`.trees()`](https://tskit.dev/tskit/docs/stable/python-api.html#tskit.TreeSequence.trees) method. Each new tree, and the data associated with it, is formed by making a slight change to the previous tree, making this an efficient operation, even for huge trees of millions of nodes."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1b8d0caa-f1e6-4f3b-87dd-1a1bc30e446c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for tree in arg.trees():\n",
+ " display(tree)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "628dcc03-d985-4e24-a817-5f8e7ac4cca1",
+ "metadata": {},
+ "source": [
+ "### Visualising local trees\n",
+ "\n",
+ "By default, `tskit` displays each local tree as a summary table, as above. To draw the tree out, you can use the [`.draw_svg()`](https://tskit.dev/tutorials/viz.html#svg-format) method, suitable for small trees of tens or hundreds of nodes each."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "974cd8e4-8db4-41f0-8341-3766a9bb5341",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "arg.draw_svg(\n",
+ " size=(1200, 400), # (width, height) in pixels\n",
+ " omit_sites=True, # Later in the workbook we'll remove this, to reveal the variable sites \n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7c614741-f2c2-48d3-ad40-de9fed6bb222",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook1.url + \"Q3.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fd840a92-7216-4315-82b3-be46949572b2",
+ "metadata": {},
+ "source": [
+ "### Branch length changes\n",
+ "Perhaps surprisingly, many recombination events in an ARG do not change the topology of the local trees, but merely change branch lengths (i.e. change the time and identity of the most recent common ancestor, or MRCA, of certain sets of samples). This can be seen most clearly at the top of the graph-based ARG visualization. At 30 000 generations ago there are only two lineages left. Whichever route we take above that through the ARG, will end up joining node 26 and node 23, just at a potentially different time in the past."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "dc015258-3a45-4604-a7e4-52bb0c0cee59",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook1.url + \"Q4.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4b8016f3-49ca-4750-a92c-f55562c5c20d",
+ "metadata": {},
+ "source": [
+ "### \"Identical tree\" changes\n",
+ "\n",
+ "Some recombination events change neither the topology nor the branch lengths. This is seen when going from the third to the fourth tree. That change only involves node ID 24 switching to ID 25, which represents the recombination event at the bottom of the diamond. A peculiarity of ARGs created using the _msprime_ simulator is that for [technical reasons](https://tskit.dev/msprime/docs/stable/ancestry.html#recombination-events), recombination events are recorded as *two* simultanous recombination nodes (e.g. nodes 24/25), representing the paternal and maternal genomes prior to meiosis. These are visually merged into a single node in the ARG visualizer. Assuming a single crossover event, one node is to the left and one to the right of the breakpoint."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8b3e5ac3-3595-42a9-98d0-16996f395c98",
+ "metadata": {},
+ "source": [
+ "- Exercise D
\n",
+ " - Copy the
draw_svg() code above into the box below, but plot only the trees between genome position 200 to 800 by using x_lim=(200, 800), and add a Y axis, using y_axis=True. To make the Y axis ticks look nicer, you could also specify y_ticks={0: \"0\", 20_000: \"20k\", 40_000: \"40k\"}. \n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "75f20ce5-96ba-4959-ad45-c849969e95ff",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Complete Exercise D here\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "36f07cea-6e9e-4915-9214-903a8db46e43",
+ "metadata": {},
+ "source": [
+ "## Coalescent and non-coalescent regions\n",
+ "\n",
+ "Looking at the tree-by-tree plot, it should be clear that some of the nodes in a local tree have one child in some trees, and two children in others. There are even some nodes that have only one child in every tree in which they appear (e.g. node 26). We can classify nodes into\n",
+ "\n",
+ "0. **non-coalescent**, sometimes called _always unary_ (i.e. one child in all local trees, e.g. node 26)\n",
+ "1. **part-coalescent**, sometimes called _locally unary_ (i.e. one child in some local trees, coalescent in others, e.g. node 18)\n",
+ "2. **all-coalescent**, or _never unary_ (always 2 or more children in any local tree in which they are found, e.g. node 19).\n",
+ "\n",
+ "Note: This classification will turn out to be important in part B, when we learn about simplifying ARGs by collapsing some nodes (particularly \"non-coalescent\" nodes) depending on whether they are detectable.
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c9603eaa-9a3c-44d9-bdba-ad922ef73bd1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook1.url + \"Q5.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "470751f7-82b7-4b4c-a487-8128adb6c36b",
+ "metadata": {},
+ "source": [
+ "The `Workbook.node_coalescence_status()` function, written for this workbook, uses the `.trees()` iterator to construct an array denoting the classification of each node. `returned_array[i]` is **0** if node `i` is non-coalescent, **1** if node `i` is part-coalescent, and **2** if node `i` is all-coalescent. You can check below that this agrees with your understanding:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "dd53f945-120c-474b-8d62-43144aa144a5",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "description = {0: \"Non-coalescent\", 1: \"Part-coalescent\", 2: \"All-coalescent\"}\n",
+ "samples = arg.samples()\n",
+ "for node_id, status in enumerate(Workbook1.node_coalescence_status(arg)):\n",
+ " extra = \", sample\" if node_id in samples else \"\"\n",
+ " print(f\"Node {node_id} coalescence status: {description[status]} ({status}{extra})\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cd136763-1591-4e94-a861-414746c675ed",
+ "metadata": {},
+ "source": [
+ "## Nodes and individuals\n",
+ "\n",
+ "You can think of each ARG node as representing a (haploid) genome. Humans (and most other eukaryotes) are *diploid*, meaning they contain 2 genomes: a maternal and a paternal one. \n",
+ "\n",
+ "In _tskit_, this information is stored by allowing a node to be asociated with an individual ID, linked to a separate table of individuals. Usually only sample nodes have positive individual ID. Other nodes have their `individual` ID set to `-1` (also known as `tskit.NULL`); this includes ancestral nodes, as we usually don't know how ancestral genomes were grouped into diploid individuals."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "34ceda57-b54a-410c-bf54-fcb05b30ff84",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "colours = {\n",
+ " tskit.NULL: \"LightGrey\", # nodes that are not associated with an individual are coloured in grey\n",
+ " 0: \"LightGreen\",\n",
+ " 1: \"DodgerBlue\", \n",
+ " 2: \"MediumVioletRed\",\n",
+ " 3: \"Coral\",\n",
+ " 4: \"DarkGoldenRod\"\n",
+ "}\n",
+ "for node in arg.nodes():\n",
+ " individual_id = node.individual\n",
+ " print(\n",
+ " f\"Node {node.id} is associated with individual {individual_id}\",\n",
+ " f\"which we will colour {colours[individual_id]}\" if individual_id != tskit.NULL else \"\"\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1f9ee5a7-b0ca-4b9f-a5e0-9193e797c563",
+ "metadata": {},
+ "source": [
+ "When visualising the ARG (either as a graph or as a series of trees), we don't usually want to group the two nodes of each individual together, as the maternal and paternal genomes within an individual can have quite different histories. You can try to rearrange the sample nodes by colour, to convince yourself that the two samples from a single individual do not trivially group together.\n",
+ "\n",
+ "Note that the following code does not use the \"orthogonal\" plot style, but reverts to the default `edge_type` of `\"line\"`, which simply joins nodes by straight lines. As well as colouring the nodes by individual, it colours recombination nodes as black. It also shows not just the *topology* of the ARG, but also the length (in time) of each edge, by plotting the Y axis on a linear timescale."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "34dd2283-5c5a-4f4a-8723-6e1370275a1f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import msprime\n",
+ "\n",
+ "d3arg.set_node_styles({\n",
+ " node.id: {\"fill\": colours[node.individual]}\n",
+ " for node in arg.nodes()\n",
+ "})\n",
+ "d3arg.draw(title=\"Nodes coloured by individual\", y_axis_scale=\"time\", width=800);"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f0876be3-5bfb-42ee-884d-c844b4c19743",
+ "metadata": {},
+ "source": [
+ "The ARG was actually simulated using a model of human evolution that reflects the Out of Africa event. As well as having a value denoting the individual, each node also has a value indicating a population it belongs to.\n",
+ "\n",
+ "- Exercise E
\n",
+ " - Change the code above to colour by
node.population ID rather than node.individual ID. You could also stop colouring the recombination nodes as black if you like. \n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "21f0b573-2b5c-4d02-ba7a-58026b4ee64b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook1.url + \"Q6.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "61739bba-c121-461d-928e-033bb5a66420",
+ "metadata": {},
+ "source": [
+ "### Metadata (a side note)\n",
+ "\n",
+ "Tskit uses numerical IDs to refer to nodes, individuals, populations, etc, but it can be tricky to keep track of the mapping between e.g. population name and its corresponding ID. Moreover, when we start modifying tree sequence objects, the IDs can change. For instance, in the ARG we have been using, the node IDs have been ordered so that they align nicely from 0..9 when drawing the ARG. To help keep track of objects, even when their IDs change, tskit allows them to be associated with *metadata*. There's a tutorial on metadata if you need to know more.\n",
+ "\n",
+ "In our example ARG, the metadata for each population contains a name and a fuller description. You can see the this when accessing a population by its ID:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3bce6869-3119-43ea-889d-a699f395911a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pop_id = 0\n",
+ "pop = arg.population(pop_id)\n",
+ "print(pop) # or print(pop.metadata) to see just the metadata"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "49ae6107-df87-4ba0-81ba-0737111da346",
+ "metadata": {},
+ "source": [
+ "Or you can see the metadata for all populations by looping over them:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "133abfb7-4d07-4d7c-a26d-be55183930ba",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for pop in arg.populations():\n",
+ " print(pop)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b504e594-b8d6-4819-8e84-41cd5decc0ed",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sometimes it is helpful to use a mapping of a metadata value to the object ID\n",
+ "pop_name_to_id_map = {pop.metadata['name']: pop.id for pop in arg.populations()}\n",
+ "\n",
+ "# Now use the map to get only those samples from the `AFR` population\n",
+ "print(\n",
+ " \"The sample genomes for the AFR population have IDs:\",\n",
+ " arg.samples(population=pop_name_to_id_map[\"AFR\"])\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "45a7aa32-6ee9-4056-a170-7e9c235996be",
+ "metadata": {},
+ "source": [
+ "- Exercise F
\n",
+ " - Using a
for loop, iterate over arg.individuals() and print our each individual. \n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d6b7b70c-7a7b-4bf9-a6e7-4b132b906456",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Complete Exercise F here\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "27c9c0bf-eb16-4e81-ac3c-7e14d06f07a5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook1.url + \"Q7.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9e09238d-b45a-426a-ad26-ed353cb740f4",
+ "metadata": {},
+ "source": [
+ "## Mutations\n",
+ "\n",
+ "Genetic variation is create by *mutations* on the ARG. So far we have not shown mutations, but we can reveal them by setting `show_mutations` to `True` in the ARG visualizer. If you hover over the orange mutation symbols in the visualization below, their position on the lower genome bar will be revealed."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1312cf4d-34b1-48cc-b379-ab7b62bc255e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "d3arg.draw(show_mutations=True, y_axis_scale=\"time\", width=800, title=f\"ARG with {arg.num_mutations} mutations\");"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f94dcef1-920f-4930-a283-63608499d5ad",
+ "metadata": {},
+ "source": [
+ "### Encoding sites and mutations in _tskit_\n",
+ "\n",
+ "In _tskit_, mutations occur at specific *sites*. A site is defined by a *site id*, a *position* and an *ancestral state*. Mutations are defined as an *mutation id*, a *derived state*, an associated *node id* an (optional) *time*, and an associated *site id*. If two mutations have the same site id, those mutations occur at the same position along the genome."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "253e914d-db52-4122-b678-99a55f223e9e",
+ "metadata": {},
+ "source": [
+ "- Exercise G
\n",
+ " - The
include_mutation_labels and condense_mutations parameters provide different ways to display mutations on branches. Add label_mutations=True to the .draw() method above, to see its effect. \n",
+ "
\n",
+ "\n",
+ "- Exercise H
\n",
+ " - In the cell below, use
arg.draw_svg() to plot the mutated arg as a series of trees without using the omit_mutations parameter. You will see that mutations are drawn as red crosses on the trees, and also depicted as red arrows on the x axis, at the appropriate site position. Multiple mutations at the same site will have those arrows stacked. \n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b8839bd4-66a7-462d-b928-9e798dce6271",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# Complete exercise H here\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "227ff283-f93d-46bc-aa16-d631d56a23c5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook1.url + \"Q8.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "87345d4f-65be-49d4-887d-44eba3d48bf3",
+ "metadata": {},
+ "source": [
+ "## Genetic variation & the `.variants()` iterator\n",
+ "\n",
+ "The inheritance of mutations through the ARG defines the genotypes at each variable site for each sample. The _tskit_ [`.variants()`](https://tskit.dev/tskit/docs/stable/python-api.html#tskit.TreeSequence.variants) method goes through the local trees, decoding the genetic variation at each site by looking at the inheritance of mutations. For efficiency, the genotypes for all the samples at a site are returned as a numerical vector. Each value in the vector denotes the index into a corresponding list of alleles:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a29715a1-4f22-4590-9507-87b2a612f270",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for variant in arg.variants():\n",
+ " print(\n",
+ " f\"Genotypes at position {variant.site.position}\",\n",
+ " f\"for the {arg.num_samples} samples \",\n",
+ " f\"are {variant.genotypes},\",\n",
+ " f\"denoting the alleles {variant.alleles}\",\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9ddf955b-0b7c-4b4d-980d-e506f8091c77",
+ "metadata": {},
+ "source": [
+ "- Exercise I
\n",
+ " - Change the code above to display the actual allelic states (rather that a numerical value), by using the
variant.states() method in place of accessing variant.genotypes directly. This can be useful for displaying or checking smaller examples, as opposed to numerical analysis of larger datasets. \n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "57dbec68-f09c-476e-a235-9837890fd46e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook1.url + \"Q9.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e5d18a64-829f-4e6a-bd05-be683e995826",
+ "metadata": {},
+ "source": [
+ "## Mutations allow ARG inference\n",
+ "\n",
+ "Mutations on an ARG generate genetic variation. Conversely, genetic variation among a set of samples can be used to *infer* an ARG that could plausibly have generated the samples. To do this, it is often useful to make the \"infinite sites\" approximation, that every mutation occurs at a different position (as we saw above, this is not always true).\n",
+ "\n",
+ "When we use mutations do infer genealogy, it can be helpful to *polarise* the alleles at a genetic site (figure out which allele is ancestral and which are derived). Then we can reasonably assume that if a set of genomes share the derived allele, they all inherited it from a common ancestor. This allows us to deduce the presence of a common ancestor node in the ARG.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "17a0ee9f-125b-4fa9-a8af-93cec1f5f8ae",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook1.url + \"Q10.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dd637a11-3bf9-4724-99c0-95b62dc7fdba",
+ "metadata": {},
+ "source": [
+ "## Provenance\n",
+ "\n",
+ "Helpfully, _tskit_ tries to store information on how an ARG was generated. Software like _msprime_, whcih was used to generate this ARG, place details of their commands in the [provenance](https://tskit.dev/tskit/docs/stable/provenance.html) table. We'll use this to check on the mutation rate that was used in the simulation, so that we can use it later:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0ca88b44-16b0-46d5-9e83-21e3cb6ffd6a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json # Provenance data is stored in JSON format\n",
+ "last_provenance = json.loads(arg.provenance(-1).record)\n",
+ "assert last_provenance['parameters']['command'] == \"sim_mutations\" # Check the last thing that was done was adding mutations\n",
+ "mutation_rate = last_provenance['parameters']['rate']\n",
+ "print(\"The ARG was generated using a mutation rate of\", mutation_rate, \"mutations per base pair per generation\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "aa360968-fb3e-4abc-a94b-28b3056cc91f",
+ "metadata": {},
+ "source": [
+ "## Population genetic statistics\n",
+ "\n",
+ "An ARG with mutations completely summarises the genetic variation present in the samples. Moreover, by only looking at differences between local trees, genome-wide statistical calculations can often be performed much more efficiently than naive approaches. Details of the statistics available in _tskit_ (including genetic divergence, Patterson's f statistics, PCA and genetic relatedness, etc) are at [https://tskit.dev/tskit/docs/stable/stats.html](https://tskit.dev/tskit/docs/stable/stats.html). Many of these statistics are summaries of the _allele frequency spectrum_, which itself can be returned by a single call to the [`.allele_frequency_spectrum()`](https://tskit.dev/tskit/docs/stable/stats.html#sec-stats-notes-afs) method:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5d049466-552a-4308-a0cf-bfb5755d006c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from matplotlib import pyplot as plt\n",
+ "\n",
+ "\n",
+ "# Whole genome AFS\n",
+ "afs = arg.allele_frequency_spectrum(polarised=True)\n",
+ "plt.bar(range(arg.num_samples + 1), afs)\n",
+ "plt.xlabel(\"Number of samples inheriting a mutation\")\n",
+ "plt.ylabel(\"Density\")\n",
+ "plt.title(\"Polarised allele frequency spectrum\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e857d4bd-37de-4a44-b36a-8ef25589e23f",
+ "metadata": {},
+ "source": [
+ "- Exercise J
\n",
+ " - Add
span_normalise=False to the .allele_frequency_spectrum() call above, and re-run, so that the Y axis plots the actual number of mutations instead dividing that by the sequence length. Does the number of mutations in all the bars add up to the total number in the ARG? \n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7c950279-0632-42c7-896f-bda9544b3116",
+ "metadata": {},
+ "source": [
+ "### Windowed statistics\n",
+ "\n",
+ "Statistics can also be windowed along the genome and also by time. Here we demonstrate a simplest measure, the genetic diversity (sometimes known as $\\pi$), and a more complex one, Wright's Fst."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f8047854-99c8-4548-9bf6-259f76f4b648",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(f\"Average genetic diversity (π) = {arg.diversity()}\")\n",
+ "AFR_samples = arg.samples(population=0)\n",
+ "EUR_samples = arg.samples(population=1)\n",
+ "\n",
+ "print(f\"Average Fst = {arg.Fst([AFR_samples, EUR_samples])}\")\n",
+ "\n",
+ "# Now show this in windows\n",
+ "genome_windows = [0, 250, 500, 750, 1000]\n",
+ "windowed_diversity = arg.diversity(windows=genome_windows)\n",
+ "windowed_Fst = arg.Fst([AFR_samples, EUR_samples], windows=genome_windows)\n",
+ "\n",
+ "fig, (ax_top, ax_bottom) = plt.subplots(2, 1, sharex=True)\n",
+ "ax_top.stairs(windowed_diversity, genome_windows, baseline=None)\n",
+ "ax_top.set_ylabel(\"Genetic diversity\")\n",
+ "ax_top.set_title(f\"Genetic statistics in {len(genome_windows) - 1} windows along the genome\")\n",
+ "ax_bottom.stairs(windowed_Fst, genome_windows, baseline=None)\n",
+ "ax_bottom.set_ylabel(\"AFR/EUR Fst\")\n",
+ "ax_bottom.set_xlabel(\"Genome position\")\n",
+ "\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "53770b27-71d1-4f7f-8f36-9b06711c0634",
+ "metadata": {},
+ "source": [
+ "### Do you need genetic variation?\n",
+ "\n",
+ "It is worth noting that if you have the genealogy, then an alternative set of statistics exist which can be less noisy than those based on the raw genetic variation data. See https://tskit.dev/tutorials/no_mutations.html for more details.\n",
+ "\n",
+ "These \"branch\" mode statistics do not require mutations, and so do not contain noise associated with the mutational process. For instance, as our ARG was created under a simple neutral model, we would expect the genetic statistics along the genome to be constant. And indeed, using `mode=\"branch\"` results in a comparable windowed genome plot, in which the genetic statistics are less noisy than those based on the genetic variation created by mutations."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d00b5c27-ecb2-4c87-ad07-2f8a607dca96",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "branch_Fst = arg.Fst([AFR_samples, EUR_samples], windows=genome_windows, mode=\"branch\")\n",
+ "branch_diversity = arg.diversity(windows=genome_windows, mode=\"branch\")\n",
+ "scaled_branch_diversity = branch_diversity * mutation_rate\n",
+ "\n",
+ "\n",
+ "fig, (ax_top, ax_bottom) = plt.subplots(2, 1, sharex=True)\n",
+ "ax_top.stairs(windowed_diversity, genome_windows, baseline=None, ls=\":\", label=\"site (mutation)-based\")\n",
+ "ax_top.stairs(scaled_branch_diversity, genome_windows, baseline=None, label=\"branch-based\")\n",
+ "ax_top.set_ylabel(\"Genetic diversity\")\n",
+ "ax_top.set_title(f\"Genetic statistics in {len(genome_windows) - 1} windows along the genome\")\n",
+ "ax_bottom.stairs(windowed_Fst, genome_windows, baseline=None, ls=\":\", label=\"site (mutation)-based\")\n",
+ "ax_bottom.stairs(branch_Fst, genome_windows, baseline=None, label=\"branch-based\")\n",
+ "ax_bottom.text(0, 0.13, f'Average branch Fst: {arg.Fst([AFR_samples, EUR_samples], mode=\"branch\"):.4f}')\n",
+ "ax_bottom.set_ylabel(\"AFR/EUR Fst\")\n",
+ "\n",
+ "ax_bottom.set_xlabel(\"Genome position\")\n",
+ "ax_top.legend()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "55e337a5-2a8a-42fc-ab2c-4185ffcb6ddd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook1.url + \"Q11.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "23c37c45-ecbe-4db3-8a93-7d7d86d58c32",
+ "metadata": {},
+ "source": [
+ "We will encounter other summary statistics in future workshops"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7fd19d18-0f30-4acb-b225-9834e1e7a09e",
+ "metadata": {},
+ "source": [
+ "## Extra *tskit* details\n",
+ "\n",
+ "This section aims to give you a deeper understanding of how ARGs are stored within _tskit_, and how to access the raw data underlying a _tskit_ ARG. It is deliberately optional in the sense that there are no exercises or questions, but it should help you in your future use of the _tskit_ library.\n",
+ "\n",
+ "### Tables\n",
+ "\n",
+ "Nodes, edges, individuals, populations, sites, and mutations are all stored as *rows* in a set of separate *tskit* **tables**. Together these constitute a [TableCollection](https://tskit.dev//tables_and_editing.html). You can see a summary of the tables by simply displaying a tree sequence in a notebook: this describes the number of rows in each table, and the size of each table. Below you will see that our `arg` has 42 edges, 37 nodes, and 13 mutations at 12 different sites along the genome:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "247cf3af-26d7-4e7b-bea5-6da4495b5a6d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display(arg)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "71194ca9-1a67-48e3-bed9-e061a6321c3c",
+ "metadata": {},
+ "source": [
+ "Each of the tables can also be displayed separately, using `arg.tables.nodes`, `arg.tables.edges`, etc. Printing a table or the whole table collection creates a plain-text representation (helpful e.g. if you are not using a notebook):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e3453036-fcad-49b3-ba6f-6cec24f08f9e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(arg.tables.edges) # or display(arg.tables.edges) for a prettier HTML version"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "81f33a8f-e58a-444b-ae12-37c92dcd9793",
+ "metadata": {},
+ "source": [
+ "It should be reasonably obvious how this works. E.g. edge 0 connects parent node 10 to child node 6 in the part of the genome that spans 0 to 930 bp. For further information see [https://tskit.dev/tskit/docs/stable/data-model.html](https://tskit.dev/tskit/docs/stable/data-model.html), and for a tutorial approach, see [https://tskit.dev/tutorials/tables_and_editing.html](https://tskit.dev/tutorials/tables_and_editing.html).\n",
+ "\n",
+ "As a brief introduction, you can access particular edges, nodes, sites, etc. as Python objects using `arg.edge(i)`, `arg.node(i)`, `arg.site(i)`, and so on."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7d0364b9-4d0c-4be7-886f-19f90d71156a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(arg.edge(0))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c5aa40b3-9385-415e-8b53-1dbe9d2cf8bd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(arg.node(10))\n",
+ "print(arg.node(6))\n",
+ "individual_id_for_node_6 = arg.node(6).individual"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "79110ae0-3777-480f-a10a-33bf7687f613",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(arg.individual(individual_id_for_node_6))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "63667e7f-4439-42eb-ac01-04b7dd2418f7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(arg.mutation(0))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ef3ca4b7-c86d-4c75-9313-2ee93a4f9f32",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(arg.site(0))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bca42060-a2fb-45b4-9175-ea63560300c8",
+ "metadata": {},
+ "source": [
+ "#### High performance data access\n",
+ "\n",
+ "However, the most performant way to access the underlying data is to use the [efficient column accessors](https://tskit.dev/tskit/docs/stable/python-api.html#efficient-table-column-access), which provide _numpy_ arrays that are a direct view into memory. For example, to find all the site positions along the genome, you can use `arg.tables.sites.position` (or the shortcut `arg.sites_position`). This is particularly relevant when dealing with ARGs containing large tables (e.g. millions of rows)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "35f24619-da2f-407f-b724-584c9439a1c5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "arg.tables.sites.position"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ec64bfaa-0af3-411b-a2c9-42173a9386b9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "arg.tables.edges.parent # Also available via the shorthand arg.edges_parent"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6dfdf05e-bc7b-4316-9ed4-60404cb5f8bf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "arg.tables.edges.child # Also available via the shorthand arg.edges_child"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7ba5e13f-3a1d-454a-997a-2fc871191fda",
+ "metadata": {},
+ "source": [
+ "and so on. These can be used with _numpy_, the numerical Python library:\n",
+ "\n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "caf9f270-9eee-47b6-841e-98a3b7606981",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "# find the the mean of the base 10 log of the times of all mutation in the arg\n",
+ "np.mean(np.log10(arg.tables.mutations.time))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4f166586-db39-4281-bd12-8fed99dfe711",
+ "metadata": {},
+ "source": [
+ "#### High performance trees\n",
+ "\n",
+ "There are also [fast array access methods](https://tskit.dev/tskit/docs/stable/python-api.html#array-access) for local trees in a tree sequence. \n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1b41204d-9b9f-4bc6-b80a-fe13a44375fb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tree = arg.first()\n",
+ "\n",
+ "# Simple access to the parent of node 0 in the tree\n",
+ "print(\"Parent of node 0 in the first tree is\", tree.parent(0))\n",
+ "\n",
+ "# High performance tree array access\n",
+ "parents = tree.parent_array # The parent ID of each node in this tree, or tskit.NULL (-1) if the node is not in the tree, or has no parent\n",
+ "print(\"\\nParents of all the nodes:\\n\", parents)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c15973a2-754a-4404-8515-acfb5a5ffdad",
+ "metadata": {},
+ "source": [
+ "The `parent_array` is a direct view into memory, which only needs slightly adjusting to encode the parents in the next tree. For large ARGs, this is much more efficient than making a new array for each local tree:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a4155fb3-2278-4b9d-9857-3f76aa4c8017",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tree.next()\n",
+ "print(\"Parents of all the nodes in the next tree:\\n\", parents) # will be slightly different from the previous time we viewed this array"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/content/WhatIsAnARG_workbook2.ipynb b/content/WhatIsAnARG_workbook2.ipynb
new file mode 100644
index 0000000..c2665cb
--- /dev/null
+++ b/content/WhatIsAnARG_workbook2.ipynb
@@ -0,0 +1,1483 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9baa2d4a-8100-462a-942f-e1f09a0ed419",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "try:\n",
+ " import micropip\n",
+ " await micropip.install('tszip')\n",
+ " await micropip.install('pyslim==1.0.4') # Need an older version for SLiM < 5\n",
+ " await micropip.install('stdpopsim')\n",
+ " await micropip.install('demesdraw')\n",
+ " await micropip.install('jupyterquiz')\n",
+ " await micropip.install('tskit_arg_visualizer')\n",
+ "except(ModuleNotFoundError):\n",
+ " pass # allow to be run outside of JupyterLite too\n",
+ "\n",
+ "from jupyterquiz import display_quiz\n",
+ "from matplotlib import pyplot as plt\n",
+ "from WhatIsAnARG_module import Workbook2\n",
+ "\n",
+ "Workbook2.setup()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "16853d10-0fc6-45e5-9ca5-0744a091c38a",
+ "metadata": {},
+ "source": [
+ "### Overview\n",
+ "This is the first of 2 workbooks: each is intended to take about 1½-2 hours to complete.\n",
+ "\n",
+ "\n",
+ "- Workbook 1
\n",
+ " - Graphs and local trees, mutations, genetic variation and statistical calculations: exercises A-J & questions 1-11
\n",
+ "- Workbook 2
\n",
+ " - Types of ARG, simplification, and ARG simulation: exercises K-U & questions 12-30
\n",
+ "
\n",
+ "\n",
+ "# Workbook 2: ARG types and simulation\n",
+ "\n",
+ "### Recap\n",
+ "\n",
+ "ARGs can be saved in tree sequence format, as a collection of **nodes** and **edges**. This format can also contain **individuals** (simply used to group nodes together), and if genetic variation is to be encoded, **mutations** associated with **sites**.\n",
+ "\n",
+ "Such ARGs can be analysed using the _tskit_ library, and drawn tree-by-tree using the built-in `.draw_svg()` method, or as a graph by using the [tskit_arg_visualizer](https://github.com/kitchensjn/tskit_arg_visualizer/blob/main/docs/tutorial.md) package, based on the D3js interactive visualization library. We have been using a so-called *full ARG* stored in the `example_ARG.trees` file.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "58e554da-ab32-4d3e-b7a8-93da8d83bd31",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import tskit\n",
+ "import tskit_arg_visualizer as argviz\n",
+ "from msprime import NODE_IS_RE_EVENT\n",
+ "from tskit import NODE_IS_SAMPLE\n",
+ "\n",
+ "arg = tskit.load(\"data/example_ARG.trees\")\n",
+ "d3arg = argviz.D3ARG.from_ts(arg)\n",
+ "d3arg.set_node_styles({u: {\"symbol\": \"d3.symbolSquare\"} for u in arg.samples()})\n",
+ "d3arg.set_node_styles({\n",
+ " u: {\"fill\": \"red\" if flags & NODE_IS_RE_EVENT else \"grey\" if flags & NODE_IS_SAMPLE else \"blue\"}\n",
+ " for u, flags in enumerate(arg.nodes_flags)\n",
+ "})\n",
+ "d3arg.draw()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c72a8430-954f-408b-874f-d3763eb0da1b",
+ "metadata": {},
+ "source": [
+ "## Full ARGs: perfect historical knowledge\n",
+ "\n",
+ "The full ARG above shows \"perfect knowledge\". Specifically, it records the exact times of two sorts of historical event:\n",
+ "* Common ancester (CA) events, caused by two genomes duplicating during a round of mitosis. These are represented by a node with two children.\n",
+ "* Recombination (RE) events, caused by two genomes combining during a round of meiosis. These are represented by a node with two parents (as we saw, the _msprime_ simulator actually encodes these as a pair of nodes, but we visualise this pair as a single node).\n",
+ "\n",
+ "However, requiring perfect knowledge can be too restrictive, especially when representing ARGs which have been inferred, imperfectly, from real data. In the next section we explore ARG representations which omit unknowable or undetectable nodes and edges. These *simplified ARGs* retain essentially the same genetic ancestry as the equivalent full ARGs, but can be substantially smaller. This means they are easier to visualise, analyse, infer, and simulate.\n",
+ "\n",
+ "### Unknowable nodes\n",
+ "\n",
+ "The easiest example of an unknowable structure in a full ARG is a diamond. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4bf18db7-30f3-4f83-adfd-a21770e909c9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook2.url + \"Q12.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fc7ff115-2c52-46a1-bfb5-710b850c8025",
+ "metadata": {},
+ "source": [
+ "We previously saw that a recombination event within a diamond does not cause a change in topology or branch lengths of the local trees. In fact, nowhere on the ARG can we place a mutation such that this diamond is detectable, because none of the nodes in a diamond help to group samples together. More specifically, none of the diamond nodes are *coalescent* in any of the local trees.\n",
+ "\n",
+ "As a rule, we can treat the times of all non-coalescent nodes as unknowable. This includes all RE nodes, but also some CA nodes such at the top of diamonds. To find these in our ARG, we can use the `Workbook.node_coalescence_status()` function from the first workbook. We will wrap this in a function called `create_styled_D3ARG()` that creates and colours the ARG as appropriate (you don't need to understand the workings of this function, however).\n",
+ "\n",
+ "Note: Later on we shall start removing nodes from the ARG, which could change the node IDs. For this reason we will switch to labelling the nodes using metadata. Rather unimaginatively, in their metadata, the sample nodes have assigned the names A-J, which will be maintained even if their IDs change. In real data, metadata labels should be more meaningful, e.g. {'name':'HG00157', 'sex':'Male'} for an individual human from the 1000 Genomes Project.\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1c2bf5fb-86e0-4ff8-8ae9-7d4c56448c91",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "import os\n",
+ "from pathlib import Path\n",
+ "\n",
+ "import numpy as np\n",
+ "\n",
+ "def metadata_based_labels(arg, use_ids=True):\n",
+ " if use_ids:\n",
+ " return {nd.id: (nd.metadata or {}).get(\"label\", nd.id) for nd in arg.nodes()}\n",
+ " else:\n",
+ " # only make keys in the dictionary if there is a \"label\"\n",
+ " return {nd.id: nd.metadata[\"label\"] for nd in arg.nodes() if nd.metadata and \"label\" in nd.metadata}\n",
+ " \n",
+ "\n",
+ "def create_styled_D3ARG(arg, x_pos_file=None, node_map=None):\n",
+ " \"\"\"\n",
+ " You don't need to understand this function!\n",
+ "\n",
+ " Take a tskit ARG and return a D3ARG, with coloured nodes and with sample nodes\n",
+ " labelled with node.metadata[\"label\"]. If an x_pos_file is given, load X positions\n",
+ " from this file, translating the node IDs via the node_map if provided\n",
+ " \"\"\"\n",
+ " d3arg = argviz.D3ARG.from_ts(arg)\n",
+ " status = np.array(['cyan', 'blue', 'green'])[Workbook2.node_coalescence_status(arg)]\n",
+ " status[arg.samples()] = 'gray'\n",
+ " status[(arg.nodes_flags & NODE_IS_RE_EVENT) != 0] = 'red'\n",
+ " d3arg.set_node_styles({u: {\"fill\": str(colour)} for u, colour in enumerate(status)})\n",
+ " d3arg.set_node_styles({u: {\"symbol\": \"d3.symbolSquare\"} for u in arg.samples()})\n",
+ " d3arg.set_node_labels(metadata_based_labels(arg, use_ids=False))\n",
+ "\n",
+ " if x_pos_file is not None:\n",
+ " x_pos = argviz.extract_x_positions_from_json(json.loads(Path(x_pos_file).read_text()))\n",
+ " if node_map is None:\n",
+ " d3arg.set_node_x_positions(x_pos)\n",
+ " else:\n",
+ " d3arg.set_node_x_positions({node_map[u]: x for u, x in x_pos.items() if node_map[u] != tskit.NULL})\n",
+ "\n",
+ " return d3arg\n",
+ "\n",
+ "d3arg = create_styled_D3ARG(arg, x_pos_file=\"data/Xpos.json\")\n",
+ "d3arg.draw(height=500)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a8e30454-bf7c-496d-a25b-b5a8a01f1ff3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook2.url + \"Q13.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c36dce03-ee1d-4d31-92a4-2f8973f9bf57",
+ "metadata": {},
+ "source": [
+ "## Simplified ARGs: inheritance between genomes\n",
+ "\n",
+ "It is possible to construct ARGs without unknowable nodes. Instead of being associated with explicit events, nodes in such an ARG are better imagined as *genomes*, between which genetic information is inherited. These *simplified* ARGs can be produced directly from simulation or inferred from real data. Here, however, we will explore how they relate to full ARGs, via the process of *simplification*.\n",
+ "\n",
+ "### Simplification\n",
+ "\n",
+ "Simplification is the act of removing redundant information from an ARG. The _tskit_ library provides a flexible [`.simplify()`](https://tskit.dev/tutorials/simplification.html) method that performs this task. The core idea is to retain only the ancestry of a specified set of \"focal nodes\". Nodes that are not needed to represent this ancestry are removed.\n",
+ "\n",
+ "\n",
+ "#### Partial simplification\n",
+ "\n",
+ "The code below carries out a \"partial simplification\" of the ARG, detaching all unknowable (non-coalescent) nodes from the ancestry (although we use `filter_nodes=False` to keep them in the stored object).\n",
+ "\n",
+ "Hover over the genome bar to check that a complete ancestry of the samples is still present, even though various intermediate nodes have been removed:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "748f8f4e-385b-4122-b6f1-95de6978aed7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def simplify_non_coalescent_nodes(arg, **kwargs):\n",
+ " # NB: this function can probably be removed when https://github.com/tskit-dev/tskit/issues/2127 is fixed\n",
+ " partial_or_always_coalescent_nodes = np.where(Workbook2.node_coalescence_status(arg) > 0)[0]\n",
+ " focal_nodes = np.concatenate((arg.samples(), partial_or_always_coalescent_nodes))\n",
+ " return arg.simplify(\n",
+ " focal_nodes, # The set of nodes whose ancestry we want to keep\n",
+ " update_sample_flags=False, # Usually all focal nodes are turned into samples, but we don't want that here\n",
+ " **kwargs, # pass on any additional parameters (e.g. \"filter_nodes\") to `simplify`\n",
+ " )\n",
+ "\n",
+ "part_simp = simplify_non_coalescent_nodes(arg, filter_nodes=False)\n",
+ "\n",
+ "create_styled_D3ARG(\n",
+ " part_simp,\n",
+ " x_pos_file=\"data/Xpos.json\"\n",
+ ").draw(height=400)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7a18cc2a-7d19-4aef-bfe5-8fa1b894e97c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook2.url + \"Q14.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3a116dc8-4ba6-45f3-832a-dfa30cac3420",
+ "metadata": {},
+ "source": [
+ "By specifying `filter_nodes=True` (which is the default) we can actually delete the unused nodes completely. However, this will change the node IDs. So that we can plot the nodes in the right place, we'll also return the `node_map` of old to new IDs when simplifying."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6b0b800a-9d3d-4417-ba51-268880023af6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "part_simplified_arg, node_map = simplify_non_coalescent_nodes(arg, filter_nodes=True, map_nodes=True)\n",
+ "create_styled_D3ARG(\n",
+ " part_simplified_arg,\n",
+ " x_pos_file=\"data/Xpos.json\",\n",
+ " node_map=node_map\n",
+ ").draw(height=400)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "84fd3cdf-50c1-471a-81bd-031d18b0560a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook2.url + \"Q15.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "873482c4-8e5b-4c00-a120-30c3ceb239c5",
+ "metadata": {},
+ "source": [
+ "Here's what the partially simplified trees look like. As you can see, they still look slightly unusual compared to a standard \"gene tree\", as there are a few locally unary nodes left. These nodes are coalescent in some of the local trees, but not in others. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c25cff93-197d-4cf3-ad33-1c304ff88432",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook2.url + \"Q16.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "eee18b2e-b0e5-468c-b721-89a45d915682",
+ "metadata": {},
+ "source": [
+ "- Exercise K
\n",
+ " - Use
part_simplified_arg.draw_svg() to plot the local trees.\n",
+ " If you change the height of the plot using size=(1000, 400) the tree will be clearer.\n",
+ " You might also want to specify node_labels=metadata_based_labels(part_simplified_arg) to get nicer labels.\n",
+ " \n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ad832b00-daa8-4161-be65-91170c74eb51",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Complete Exercise K here\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fc4f4115-f505-4cfe-8232-fa2a15e678af",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook2.url + \"Q17.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "43cdacbf-499f-4e80-b478-9090854b1e38",
+ "metadata": {},
+ "source": [
+ "## Full simplification\n",
+ "\n",
+ "We can go one stage further by fully simplifying so that ancestral nodes are all-coalescent. This modifies edges so that, for example, the edge above node 18 in the first tree goes straight to node 20, rather than going via node 19. This will not change the general topology or branch lengths of the trees, and it also makes the trees simpler. However, it can create more edges (making ARG analysis less efficient), and slightly paradoxically, makes the graph visualization *less* simple:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "75144e43-5c45-4dbb-bd69-b1c7050faf02",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fully_simplified_arg, node_map = arg.simplify(map_nodes=True)\n",
+ "\n",
+ "display(fully_simplified_arg.draw_svg(\n",
+ " size=(1000, 400),\n",
+ " node_labels=metadata_based_labels(fully_simplified_arg),\n",
+ " title=\"Fully simplified local trees\",\n",
+ "))\n",
+ "\n",
+ "create_styled_D3ARG(\n",
+ " fully_simplified_arg,\n",
+ " \"data/Xpos.json\",\n",
+ " node_map).draw(height=500, title=\"Fully simplified ARG\")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "27e9c261-00c8-433a-95d2-fca29418b9a4",
+ "metadata": {},
+ "source": [
+ "As you can see above, the local trees look simpler, but the graph looks more tangled (although every node is now all-coalescent). The fully-simplified ARG encodes exactly the same genetic information on the samples, and the same branch lengths in the local trees, but is less genealogically-informative about how adjacent trees relate to each other."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "209271ab-29fd-40ec-9665-d49957663edf",
+ "metadata": {},
+ "source": [
+ "- Exercise L
\n",
+ " - use the
.num_edges, .num_nodes, .num_trees, and .num_mutations attributes to print out the number of edges, nodes, trees, and mutations in the part_simplified_arg ARG and the fully_simplified_arg ARG (or you could simply display the summary table of each ARG). "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7697e5e7-e844-4081-ac32-eb46f23fe535",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Complete Exercise L here\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cc3f545b-0403-4e13-a76e-13b4ea9467c9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook2.url + \"Q18.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "81d75407-379b-49e3-9946-4dd7edede763",
+ "metadata": {},
+ "source": [
+ "Because the number of sites and mutations has remained the same, and there is still a complete ancestry for all the samples, the encoded genetic variation and even branch-length calculations are the same between the full ARG and any of the simplified versions. This is easy to demonstrate:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c58479ed-1257-41ed-8567-f47a785d1414",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(f\" ====== Original (branch π: {arg.diversity(mode='branch'):.2f}) ======== \",\n",
+ " f\" === Fully simplified (branch π: {fully_simplified_arg.diversity(mode='branch'):.2f}) ===\")\n",
+ "for v1, v2 in zip(arg.variants(), fully_simplified_arg.variants()):\n",
+ " print(\n",
+ " f\"pos {int(v1.site.position):>3}\", v1.states(), \" \",\n",
+ " f\"pos {int(v2.site.position):>3}\", v2.states()\n",
+ " )\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6fa60081-b71d-44ed-b256-6691b4ac20b5",
+ "metadata": {},
+ "source": [
+ "For historical reasons, the fully simplified format is the default output of the _msprime_ and _SLiM_ simulators, even though in some cases it can be slightly less efficient."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c9e6b0f9-9314-4c64-8983-4e1397ef4076",
+ "metadata": {},
+ "source": [
+ "### Nodes represent genomes\n",
+ "\n",
+ "Philosophically, the nodes in a simplified ARG no longer represent historical events, but the genomes that are produced as an *outcome* of those (unknown) events. For example, sample node 6 now has 2 parents, even though it does not itself represent a recombination event. We know a recombination event occured some time between the time of node 6 and the time of its youngest parent, node 15, but we can't say exactly when.\n",
+ "\n",
+ "We can also use the `simplify()` method to identify the genome sequence at internal nodes:"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dc6144fb-55ef-4505-8acb-fe18716f4d16",
+ "metadata": {},
+ "source": [
+ "### Simplifying with different focal nodes\n",
+ "\n",
+ "One of the main uses of `simplify()` is to change which nodes are treated as samples. By default the existing sample nodes are taken, but we can This allows us to look at the genome of internal nodes in the ancestry, and emphasises that each node is a (possibly partially known) genome. For instance "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a059faf0-d633-4b07-83f4-07a017200fa0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sample_ids_plus_node_21 = np.append(part_simplified_arg.samples(), 21) # make a list of sample nodes and add node 21\n",
+ "new_arg = part_simplified_arg.simplify(sample_ids_plus_node_21) # pretend that we have \"sampled\" node 21\n",
+ "print(\"Known variation for ancestral node 21\")\n",
+ "for v in new_arg.variants():\n",
+ " print(f\"pos {int(v.site.position):>3}: \", v.states(missing_data_string=\"-\")[new_arg.num_samples - 1])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8fac20aa-c9d7-42b2-8459-3c3b6cabe2a1",
+ "metadata": {},
+ "source": [
+ "You can see that the start and end of the genome of node 21 is unknown (its haplotype is truncated)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "13ec892f-5fae-4ef4-aedc-01d55c554dff",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook2.url + \"Q19.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7d5a5593-340a-4822-960a-ecff62e3ce08",
+ "metadata": {},
+ "source": [
+ "#### Simplifying to reduce the sample size\n",
+ "\n",
+ "If we choose a subset of the existing samples, we can reduce the ARG to reflect the ancestry of only those samples. Repeated simplification to the current-day samples is a key component of forward simulation."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7a7038c8-2cc4-4a3e-9c98-b20b97aa1b70",
+ "metadata": {},
+ "source": [
+ "- Exercise M
\n",
+ " - Simplify the original ARG so that it only shows the ARG relating sample 0 (A) to sample 6 (F). By default
simplify() removes any sites that no longer have any mutations (are \"monomorphic\"). Set filter_sites=False to leave those sites in the resulting tree sequence. Sites with no mutations will have a tickmark on the x-axis without an associated red v-shaped mutation mark.
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aae265ad-0688-4252-aaf1-74b9adb6ed8e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Carry out exercise M here\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b1155316-65f8-44b9-9e9c-0aa1c1d80c0a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook2.url + \"Q20.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ef7106e2-4eee-4056-b2a7-225fc7674fd7",
+ "metadata": {},
+ "source": [
+ "### The simplest possible ARG\n",
+ "\n",
+ "We can even go to the extreme, and simplify an ARG to only 2 genomes (say 0 and 9)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7b8e41a5-8014-4457-a324-866facb83fb2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "arg.simplify([0,9], filter_sites=False).draw_svg()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "19ce95d4-9b66-4833-9d76-576b1d8d8710",
+ "metadata": {},
+ "source": [
+ "Although there are only 2 samples (and 3 trees) in this ARG, and the topology of the local trees is always a \"cherry\", there is still some ARG structure, because the time to the MRCA differs as we go along the genome. The ability to infer a simple ARG of this sort can still be extremely powerful: this is the basis behind techniques based on the Pairwise Sequentially Markovian Coalescent (PSMC) (see e.g. [this 2019 review](https://doi.org/10.1002/ece3.5888) or more recent papers by [Schweiger (2023)](https://doi.org/10.1101/gr.277665.123) and [Terhorst (2025)](https://doi.org/10.1038/s41588-025-02323-x))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ab50db90-3023-4d76-8258-eeaab1566bcc",
+ "metadata": {},
+ "source": [
+ "## A note on undetectable nodes\n",
+ "\n",
+ "Some ARG nodes may be theoretically knowable, but mutations do not provide enough information to detect them. If we are to infer ARGs from data, we can treat unknowable events like this in two ways\n",
+ "\n",
+ "1. Infer a large set of \"representative\" ARGs, by placing unknowable events randomly according to some model (e.g. SINGER) or calculating summaries that integrate across them (e.g. PSMC)\n",
+ "2. Infer a single ARG that simply omits the unknowable events (e.g. tsinfer).\n",
+ "\n",
+ "We will largely use the second approach, as we can then construct a single representative ARG (rather than a e.g. a set of ARGs in which coalescent orders have been randomly resolved). The idea is a bit like collapsing coalescences, as we did above when simplifying the immediate descendants of the root. However, in this case it creates *polytomies* (nodes in a local tree with more than 2 children)\n",
+ "\n",
+ "Importantly, this will change the branch lengths in the local trees, and hence affect branch-length measures of genetic variation. More generally, it can cause problems when measuring features of ARGs, or performing statistical tests, which is why approach (1) may work better in many cases."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "89bdaed2-69bf-4a0d-9d5b-5ae5a9835026",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "Workbook2.remove_unsupported_edges(arg).draw_svg(\n",
+ " title=\"ARG, collapsing edges unsupported by mutations\",\n",
+ " size=(1000, 400)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5133fe7a-2bc3-4e48-9c4c-f4b5e34f4a98",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook2.url + \"Q21.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fb1c109d-34eb-4c46-978c-d8101b1e7d67",
+ "metadata": {},
+ "source": [
+ "## Simulating ARGs\n",
+ "\n",
+ "There are two basic approaches for simulating ancestry: forward and backward in time. Backward-time simulators such as _msprime_ are usually much faster but less flexible (for instance, it is hard to simulate anything but the simplest form of natural selection). Forward-time simulators such as _SLiM_ are more general, but must track the entire population during simulation, rather than simply following the ancestry of a small set of sample genomes.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6780edff-07bf-454d-b84d-583a81956d67",
+ "metadata": {},
+ "source": [
+ "- Exercise N
\n",
+ " - Tree sequence files often have \"provenance\" data,\n",
+ " which decribes how the file was made. You can see this when you display a\n",
+ "tree sequence to the screen in a notebook: the list of provenances appears the bottom of the output.\n",
+ "Use
display(arg) (or simply arg) to show the ARG summary in the cell below. \n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "698273b7-dcd5-49b3-85cb-7267127d3540",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Carry out exercise N, displaying a summary of the ARG, here\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "90cba45d-cd10-4d8a-924b-d4ea93596ecb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook2.url + \"Q22.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "de9b5ff2-b945-4224-99a8-86fc3ef8968e",
+ "metadata": {},
+ "source": [
+ "### _Msprime_: a backward-time simulator\n",
+ "\n",
+ "_Msprime_ is a fast and flexible backward-time simulator whose core functions are `sim_ancestry` and `sim_mutations`. The `sim_ancestry` command is run first and this creates the ARG structure.\n",
+ "\n",
+ "Running an _msprime_ simulation in Python is extremely easy. You simply need to call `sim_ancestry` with a number of (assumed diploid) samples. You'll usually want to provide a `sequence_length`, a `population_size` and a `recombination_rate` too:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f8508618-ca14-4f4b-a1ca-deeb07fad2ad",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Name the ARG `ts` (for \"tree sequence\")\n",
+ "import msprime\n",
+ "\n",
+ "ts = msprime.sim_ancestry(10, population_size=1e4, sequence_length=10_000, recombination_rate=2e-8)\n",
+ "print(f\"Simulated a simplified ARG of {ts.num_samples} haploid genomes\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a01af82a-62b6-40c6-87ce-19c3ccd80311",
+ "metadata": {},
+ "source": [
+ "The command above generates a \"fully simplifed\" ARG of 10 diploid samples (20 haploids) over a 10kB genome, assuming a constant effective population size ($N_e$) of 10,000, and with a recombination rate of $2\\times10^{-8}$ crossover mutations per base pair per generation. As the resulting ARG is stored in tree sequence format, we often name the resulting simulation `ts`. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "174c034b-1350-4d6f-8fc3-e2f7c7582732",
+ "metadata": {},
+ "source": [
+ "#### Demographic models\n",
+ "\n",
+ "Of course, most populations have not stayed at a constant size over time, and are often modelled as multiple subpopulations. Instead of providing a `population_size` parameter, you can provide _msprime_ with a **demographic model**. This is how we simulated the original full ARG (for the curious, the details are in the `sim_ancestry` provenance entry, under \"parameters\" -> \"demography\"). The following code reads the demographic model from the full ARG's provenance, converts it to the portable [demes](https://popsim-consortium.github.io/demes-spec-docs/main/introduction.html) format, and plots it using the elegant [_demesdraw_](https://grahamgower.github.io/demesdraw/latest/) software:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "107b6596-5083-4d52-a301-0198c833bbb4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import demesdraw\n",
+ "\n",
+ "try:\n",
+ " # This requires msprime >= 1.3.5, not yet on pyodide\n",
+ " cmd, parameters = msprime.provenance.parse_provenance(arg.provenance(0), arg)\n",
+ " assert cmd == \"sim_ancestry\" # just check we have the right (zeroth) provenance entry\n",
+ "except ValueError:\n",
+ " # Hack the workbook for older msprime versions\n",
+ " import pickle\n",
+ " with open(\"data/msprime_param_hack.pkl\", 'rb') as picklefile:\n",
+ " parameters = pickle.load(picklefile)\n",
+ "\n",
+ "msprime_demography_object = parameters[\"demography\"]\n",
+ "demesdraw.tubes(msprime_demography_object.to_demes(), log_time=False);"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "56a4d653-3382-45bd-b8a2-d5a56f5d6ff3",
+ "metadata": {},
+ "source": [
+ "- Exercise O
\n",
+ " - The widths on the X axis, indicating the two population sizes, is dominated by a rapid exponential increase in the past few hundred generations. Change
log_time from True to False above, to zoom in on more recent times, and re-plot. \n",
+ "
\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "55daca94-bafd-4e36-a90d-c06647a256ad",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook2.url + \"Q23.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "163dda08-6ac4-4187-877f-bcd2ad4dc9ac",
+ "metadata": {},
+ "source": [
+ "### Running a larger simulation\n",
+ "\n",
+ "We will rerun under the same demography, but simulate twenty times more of the genome (20kb). As we did in the original simulation, we will used the `record_full_arg` option to simulate a full ARG, rather than the simplified ARG that _msprime_ produces by default. Because there are two populations, we will 5 samples from each (for a total of 10 diploid samples), and use the original simulation recombination rate of $1.15\\times10^{-8}$. We'll use a fixed `random_seed` to ensure that everyone gets the same results:\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "93abb800-3335-4692-94e5-bad0bc8cd6ef",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from datetime import datetime\n",
+ "\n",
+ "n_diploids=10\n",
+ "genome_length=20_000 # bp\n",
+ "\n",
+ "start_time = datetime.now()\n",
+ "larger_arg = msprime.sim_ancestry(\n",
+ " samples={\"AFR\": n_diploids/2, \"EUR\": n_diploids/2},\n",
+ " recombination_rate=1.15e-08,\n",
+ " sequence_length=genome_length,\n",
+ " demography=msprime_demography_object,\n",
+ " record_full_arg=True,\n",
+ " random_seed=321\n",
+ ")\n",
+ "\n",
+ "print(f\"Simulated a full ARG of {larger_arg.num_samples} haploid samples over \"\n",
+ " f\"{larger_arg.sequence_length/1e3}kb in {datetime.now()-start_time} seconds\")\n",
+ "print(\n",
+ " f\"ARG takes up {larger_arg.nbytes * 1e-6:.2f} MB, with {larger_arg.num_nodes} nodes \"\n",
+ " f\"and {larger_arg.num_edges} edges, encoding {larger_arg.num_trees} trees\\n\")\n",
+ "\n",
+ "# Calculate the number of \"unknowable\" nodes\n",
+ "non_coal = Workbook2.node_coalescence_status(larger_arg) == 0\n",
+ "p = (sum(non_coal)-larger_arg.num_samples)/(len(non_coal) - larger_arg.num_samples)\n",
+ "print(f'{p * 100:.2f}% of nodes in this full ARG are non-coalescent (\"unknowable\")')\n",
+ "print(f'{np.sum(larger_arg.nodes_flags & msprime.NODE_IS_RE_EVENT > 0)/ larger_arg.num_nodes * 100:.2f}% of nodes are RE nodes')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "45781815-cf97-4347-b89d-6ab593977d21",
+ "metadata": {},
+ "source": [
+ "You can see that this slightly larger ARG now has quite a high proportion of unknowable nodes.\n",
+ "Plotting this out as a complete ARG is possible, but maybe not very helpful (feel free to change the `edge_type` below to `\"ortho\"` if you think it will help):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "27958fcc-40f6-4f89-8116-b9ce72454009",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if larger_arg.num_edges > 10000:\n",
+ " raise RuntimeError(\"ARG too big to plot!!!\")\n",
+ "d3larger_arg = create_styled_D3ARG(larger_arg)\n",
+ "d3larger_arg.draw(\n",
+ " height=800, width=800,\n",
+ " title=f\"ARG of {larger_arg.num_trees} trees\",\n",
+ " y_axis_labels={0:0, 0.5e4:0.5e4, 1e4:1e4, 1.5e4:1.5e4, 2e4:2e4, 2.5e4:2.5e4, 3e4:3e4})\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d2ae0fb6-c2d2-447a-bb0d-e76d2ad11a94",
+ "metadata": {},
+ "source": [
+ "As you can see, this ARG is rather too large to visualise, although it can be shrunk considerably by removing the ~70% of non-coalescent nodes (most of which are recombination nodes)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e69a99ec-20ca-4fea-9176-dc6f9f994ead",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "partial_or_always_coalescent_nodes = np.where(Workbook2.node_coalescence_status(larger_arg) > 0)[0]\n",
+ "focal_nodes = np.concatenate((larger_arg.samples(), partial_or_always_coalescent_nodes))\n",
+ "part_simp_arg = larger_arg.simplify(\n",
+ " focal_nodes, # The set of nodes whose ancestry we want to keep\n",
+ " filter_nodes=False, # Omit nodes from the genealogy, but do not completely remove them in the returned object \n",
+ " update_sample_flags=False, # Usually all focal nodes are turned into samples, but we don't want that here\n",
+ ")\n",
+ "\n",
+ "d3part_simp_arg = create_styled_D3ARG(part_simp_arg)\n",
+ "d3part_simp_arg.draw(height=1000, width=800, title=\"Simplified larger ARG\")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2fbc1668-f568-42e0-8243-3162c9afa842",
+ "metadata": {},
+ "source": [
+ "- Exercise P
\n",
+ " - Now repeat the msprime
larger_arg simulation above, including all the subsequent print statements in that cell, but edit the n_diploids value to simulate 500 diploids (1000 sample genomes) and the genome_length line to simulate 10 Mb of genome instead. Run the simulation (which could take a minute or two). BE CAREFUL NOT TO PLOT THE RESULTING ARG (that will probably make your computer run out of memory and/or crash the browser).
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a932d608-192a-49a9-acad-ca694cf024e0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Use this cell for Exercise P, simulating 500 diploids over a 10Mb genome\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2609ee4a-7e3b-4137-a7e5-65edaac1b91f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook2.url + \"Q24.json\", shuffle_answers=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a2f8fe98-56cd-4e59-be44-a93ce03a77c2",
+ "metadata": {},
+ "source": [
+ "Rather than go through the hassle a simulating a full ARG, we can simulate an already-simplified ARG. This is much more efficient:"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a7f169ff-fa07-4809-8d10-ee6099b719a3",
+ "metadata": {},
+ "source": [
+ "- Exercise Q
\n",
+ " - Go back to the simulation code you just ran, change the
record_full_arg parameter to False (or remove it entirely), and re-run the 1000 sample / 10Mb simulation. Rather than simulating the full ARG, this simulates the all-coalescent (\"fully simplified\") ARG, and so should be substantially faster.
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f560a698-a97d-44bf-b722-30beb2866526",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# Use this cell for Exercise Q, simulating a 10Mb genome for 500 diploids but with record_full_arg=False\n",
+ "from datetime import datetime\n",
+ "\n",
+ "n_diploids=500\n",
+ "genome_length=10_000_000 # bp\n",
+ "\n",
+ "start_time = datetime.now()\n",
+ "larger_arg = msprime.sim_ancestry(\n",
+ " samples={\"AFR\": n_diploids/2, \"EUR\": n_diploids/2},\n",
+ " recombination_rate=1.15e-08,\n",
+ " sequence_length=genome_length,\n",
+ " demography=msprime_demography_object,\n",
+ " record_full_arg=True,\n",
+ " random_seed=321\n",
+ ")\n",
+ "\n",
+ "print(f\"Simulated a full ARG of {larger_arg.num_samples} haploid samples over \"\n",
+ " f\"{larger_arg.sequence_length/1e3}kb in {datetime.now()-start_time} seconds\")\n",
+ "print(\n",
+ " f\"ARG takes up {larger_arg.nbytes * 1e-6:.2f} MB, with {larger_arg.num_nodes} nodes \"\n",
+ " f\"and {larger_arg.num_edges} edges, encoding {larger_arg.num_trees} trees\\n\")\n",
+ "\n",
+ "# Calculate the number of \"unknowable\" nodes\n",
+ "non_coal = Workbook2.node_coalescence_status(larger_arg) == 0\n",
+ "p = (sum(non_coal)-larger_arg.num_samples)/(len(non_coal) - larger_arg.num_samples)\n",
+ "print(f'{p * 100:.2f}% of nodes in this full ARG are non-coalescent (\"unknowable\")')\n",
+ "print(f'{np.sum(larger_arg.nodes_flags & msprime.NODE_IS_RE_EVENT > 0)/ larger_arg.num_nodes * 100:.2f}% of nodes are RE nodes')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9839df54-2e63-4fa4-8252-43ebf352a87d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "part_simp_arg"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d25d03e2-ac74-44a1-acde-2968c34a241a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ss = part_simp_arg.simplify()\n",
+ "ss.extend_haplotypes()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "22796e91-de6a-478a-ad34-94bd906954f3",
+ "metadata": {
+ "jupyter": {
+ "source_hidden": true
+ },
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook2.url + \"Q25.json\", shuffle_answers=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a117d284-c193-4ef4-8c50-1776f3c6387e",
+ "metadata": {},
+ "source": [
+ "The extend_haplotypes() method can infer missing non-coalescent segments of nodes, and place them back on the ARG. However, this can take some time for larger ARGs."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "47d9a12f-a0db-4d10-a959-185bc9bad4ae",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "start_time = datetime.now() # Could take a few minutes\n",
+ "part_coalescent_arg = larger_arg.extend_haplotypes()\n",
+ "print(\n",
+ " \"Added suggested non-coalescent regions in\",\n",
+ " (datetime.now()-start_time).seconds,\n",
+ " \"seconds\",\n",
+ ")\n",
+ "display(part_coalescent_arg)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "930d959e-4d74-49a8-9067-6804ff45cf26",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook2.url + \"Q26.json\", shuffle_answers=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6b701a2c-7320-4fac-b712-1f96bf48e848",
+ "metadata": {},
+ "source": [
+ "Although it can take some time, it may be worth creating this `part_coalescent_arg`, as a smaller ARG can make many genetic calculations more efficient, and span of ancestral haplotypes, relevant to e.g. the span of segments considered identical-by-descent, is likely to be more accurate.\n",
+ "\n",
+ "Alternatively, you can retain the exact non-coalescing spans of ancestral haplotypes during an _msprime_ simulation by using `coalescing_segments_only=False`. This is equivalent to partially simplifying the full ARG. However, this is not guaranteed to produce a smaller encoding (it retains extra unknowable information such as the genetic location of masked recombination events)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2bb113d6-dd5a-4e10-a820-ae2a09100bcd",
+ "metadata": {},
+ "source": [
+ "## Mutations\n",
+ "\n",
+ "We can add mutations *after* simulating ancestry using the [`msprime.sim_mutations()`](https://tskit.dev/msprime/docs/stable/mutations.html) function.\n",
+ "\n",
+ "It is valid to add mutations *after* simulating the ARG as long as they are neutral. One of the main reasons that ARGs are an efficient way to simulate genomes is because most mutations are neutral, and therefore do not need to be tracked during ancestry simulation.\n",
+ "\n",
+ "The default _msprime_ mutation model only adds single nucleotide changes, with equal probability of mutating between the 4 bases, but other sorts of mutation models [are available](https://tskit.dev/msprime/docs/stable/mutations.html#models)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a534ffb4-a119-4217-aaa6-32ae9d69c94e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if larger_arg.sequence_length != 10_000_000 or larger_arg.num_samples != 1000:\n",
+ " raise RuntimeError(\"You have not simulated the right length ARG: Make sure you run exercise Q correctly\")\n",
+ "mutated_large_arg = msprime.sim_mutations(larger_arg, rate=1e-8, random_seed=42)\n",
+ "print(f\"Simulated {mutated_large_arg.num_mutations} mutations at {mutated_large_arg.num_sites} sites\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "66741561-840a-466e-bb66-e09698c3ad47",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook2.url + \"Q27.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "10d0f7da-db2a-4f0f-bbc8-f7bea2540a8a",
+ "metadata": {},
+ "source": [
+ "- Exercise R
\n",
+ " - \n",
+ " Get the times of all these mutations as a big array using
mutated_large_arg.mutations_time, take the log (base 10) of this using np.log10, then plot these logged times out usingplt.hist. You might want to also set nice tickmarks, e.g. using plt.xticks(np.arange(5), 10**np.arange(5)), and specify a label e.g. via plt.xlabel(f\"Mutation time ({mutated_large_arg.time_units})\")\n",
+ " \n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5dcc08c8-ceb3-4897-a38a-801ccf0d96dc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Plot mutation times using this cell\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "27adbb00-fd9d-4f71-85bd-bc9327e853f9",
+ "metadata": {},
+ "source": [
+ "## Stdpopsim: easily run verified simulations\n",
+ "\n",
+ "Even though the demographic model above is relatively simple, it still contains many parameters, and specifying it, or something similar, can be tricky and prone to error.\n",
+ "\n",
+ "
\n",
+ "For this reason, rather than using msprime directly, it is often much easier to use the Standard Library for Population Genetic Simulation Models (stdpopsim). This is a set of tried and tested genomic and demographic models for various species. We will demonstrate using the Python API (as documented in the tutorial documentation.\n",
+ "\n",
+ "For instance, here is an example of a demographic model of populations in the genus Pan (common chimpanzees and bonobos)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "87b5cd4a-c132-4c73-bfc8-260774ed34e0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import stdpopsim\n",
+ "import demesdraw\n",
+ "species = stdpopsim.get_species(\"PanTro\") # /Pan troglodytes/\n",
+ "model = species.get_demographic_model(\"BonoboGhost_4K19\")\n",
+ "\n",
+ "print(\n",
+ " f'Chosen the demographic model \"{model.id}\" for `{species.name}` ({species.common_name}) '\n",
+ " f\"out of {len(species.demographic_models)} model(s) available:\")\n",
+ "for d in species.demographic_models:\n",
+ " print(f\"* {d.id}: {d.long_description}\")\n",
+ "\n",
+ "colours = {'central': 'tab:blue', 'bonobo': 'tab:orange', 'western': 'tab:green'}\n",
+ "demesdraw.tubes(model.model.to_demes(), colours=colours); # You can ignore the warnings here, see https://github.com/popsim-consortium/stdpopsim/issues/1734"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a51987fa-39ac-4657-8fd8-a33018ba308a",
+ "metadata": {},
+ "source": [
+ "And here's how to tell _stdpopsim_ to run an _msprime_ simulation of the model, e.g. the first 20 MB of [chromosome 2A](https://en.wikipedia.org/wiki/Chimpanzee_genome_project#Genes_of_the_chromosome_2_fusion_site), from 20 bonobos, 12 central chimpanzees, and 8 western chimpanzees (remember that these are diploid genomes, so that makes 20 haploid sample nodes in total). Mutations will be automatically laid onto the ARG using `msprime.sim_mutations()` (you'll notice that we provide a `mutation_rate` below)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "48eb3053-6160-4091-9da7-8f501b2a2901",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "contig = species.get_contig(\"chr2A\", mutation_rate=model.mutation_rate, right=20e6) # only simulate 20 Mb\n",
+ "samples = {\"bonobo\": 20, \"central\": 12, \"western\": 8}\n",
+ "\n",
+ "engine = stdpopsim.get_engine(\"msprime\") # Use msprime as the underlying simulator\n",
+ "chimp_arg = engine.simulate(\n",
+ " model,\n",
+ " contig,\n",
+ " samples,\n",
+ " msprime_model=\"dtwf\",\n",
+ " msprime_change_model=[(20, \"hudson\")],\n",
+ " coalescing_segments_only=False,\n",
+ " random_seed=42,\n",
+ ")\n",
+ "print(f\"Simulated a chimp ARG of {chimp_arg.num_samples} samples with {chimp_arg.num_trees} local trees\")\n",
+ "chimp_arg = chimp_arg.trim()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "94e474cd-7eb5-4521-a60e-374d92026c15",
+ "metadata": {},
+ "source": [
+ "The `model`, `contig` and `samples` parameters should be obvious. Here we have also used best practice and used the \"Discrete-Time Wright Fisher\" ([`dtwf`](https://tskit.dev/msprime/docs/stable/ancestry.html#sec-ancestry-models-dtwf)) coalescent model for the most recent 20 generations, and the (default) [`hudson`](https://tskit.dev/msprime/docs/stable/ancestry.html#sec-ancestry-models-hudson) model further back in time (this is mainly important for whole genome simulations or when sample sizes appraoch the population size, see [this paper](https://doi.org/10.1371/journal.pgen.1008619)).\n",
+ "\n",
+ "Finally, any extra options are passed directly to the underlying simulation engine, in this case _msprime_. In particular, we have seen that the `coalescing_segments_only` argument will generate a partially- rather than fully-simplified ARG."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f9559653-c777-4ddd-a3a8-2ddc6c52e76f",
+ "metadata": {},
+ "source": [
+ "- Exercise S
\n",
+ " - \n",
+ " The cell below creates a list of positions along the simulated genome that can be used as windows, and a mapping of population name to population ID. In the cell below that, get the samples that correspond to the central population by using
chimp_arg.samples(pop_ids['central']), and calculate the diversity in windows along the genome using win_div = chimp_arg.diversity(central_samples, windows=windows)
You should then be able to plot the diversity using plt.stairs(win_div, windows, baseline=None, label='central', color=colours['central'])
In the same cell, repeat for the bonobo, and western populations, and add plt.legend() to generate a legend.\n",
+ " \n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "734ead7e-c5ed-41c4-aa59-2776711f3229",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This code creates the genome windows and a map from population name to population ID.\n",
+ "windows = np.linspace(0, chimp_arg.sequence_length, 11)\n",
+ "print(f\"Created {len(windows)} windows of length {(windows[1]-windows[0]) / 1e6} megabases each\")\n",
+ "pop_ids = {pop.metadata[\"name\"]: pop.id for pop in chimp_arg.populations()}\n",
+ "print(\"Population IDs are:\", pop_ids)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1f38f579-51a1-4fba-ad2c-ba559cc4f086",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Plot diversity in windows along the genome for the 'central' population, then add the two other populations\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "95f18899-dc53-4ea5-949d-2f2695994f0b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook2.url + \"Q28.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1303d529-7e35-48f3-8638-6807dd3783bb",
+ "metadata": {},
+ "source": [
+ "### Coalescence rates\n",
+ "\n",
+ "A relatively new and useful _tskit_ feature is the ability to estimate coalescence rates (including cross-coalescence rates) through time. The key method here is pair_coalescence_rates() which estimates rates in time windows and also allows different rates to be estimated along the genome. In this case, as we have a neutral simulation, there's not much point using genome windows. If e.g. we expect selection, or want to inspect hybrids, we could easily plot out a heatmap instead."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8c1df814-ecf1-462b-a327-d0a977f18cef",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "num_log_timebins = 20\n",
+ "time_breaks = np.logspace(\n",
+ " 3, # start at 10^3 generations (not much info before that)\n",
+ " np.log10(chimp_arg.max_time),\n",
+ " num_log_timebins\n",
+ ")\n",
+ "plot_breaks = time_breaks.copy()\n",
+ "time_breaks[0] = 0 # we require the first bin to start at the sampling time\n",
+ "time_breaks[-1] = np.inf # we require the last bin to go to infinity\n",
+ "rates = chimp_arg.pair_coalescence_rates(time_windows=time_breaks)\n",
+ "plt.stairs(1/rates, plot_breaks)\n",
+ "plt.xscale(\"log\")\n",
+ "plt.xlabel(f\"Time ({chimp_arg.time_units})\")\n",
+ "plt.ylabel(f\"IICR (inverse instantaneous coal. rate)\");"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fb8505a4-fcb9-4b2d-acbf-458d86be1dc6",
+ "metadata": {},
+ "source": [
+ "The \"huge\" IICR (sometimes interpreted as $N_e$) between 1000 and 10,000 generations probably just reflects the fact that these are separate species, and therefore not coalesing much at all. In this case the IICR"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "480bed8e-314e-4d73-90ec-6b873c9119c4",
+ "metadata": {},
+ "source": [
+ "#### Cross coalescence\n",
+ "If we want to look at migration *between* populations, we can look at the **cross coalescence rate**. For this we define some sets of samples (the samples in each population: central, bonobo, and western) and only record a pairwise coalescence if one of the pair is from population A and the other is from population B. Since we have 3 populations, there are only 3 possible pairs to look at:\n",
+ "\n",
+ "* central - bonobo\n",
+ "* central - western\n",
+ "* bonobo - western\n",
+ "\n",
+ "We specify which comparisons we want to look at by using indexes into the provided list of sample sets, i.e. in this case `(0, 1)`, `(0, 2)`, and `(1, 2)`. Note that below we plot the coalescence rate (not its inverse)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0f8389b5-a13b-4fba-a2f9-d19ea7fcbace",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "central_samples = chimp_arg.samples(pop_ids['central'])\n",
+ "bonobo_samples = chimp_arg.samples(pop_ids['bonobo'])\n",
+ "western_samples = chimp_arg.samples(pop_ids['western'])\n",
+ "rates = chimp_arg.pair_coalescence_rates(\n",
+ " time_windows=time_breaks,\n",
+ " sample_sets=[central_samples, bonobo_samples, western_samples],\n",
+ " indexes=[(0, 1), (0, 2), (1, 2)]\n",
+ ")\n",
+ "plt.stairs(rates[0, :], plot_breaks, label=\"central/bonobo\")\n",
+ "plt.stairs(rates[1, :], plot_breaks, label=\"central/western\")\n",
+ "plt.stairs(rates[2, :], plot_breaks, label=\"bonobo/western\")\n",
+ "plt.xscale(\"log\")\n",
+ "plt.xlabel(f\"Time ({chimp_arg.time_units})\")\n",
+ "plt.ylabel(f\"Estimated instantaneous coalecence rate\")\n",
+ "plt.legend();"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "90f0d4aa-283f-4bf0-90a2-eef717ea1b58",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook2.url + \"Q29.json\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "54a65659-c5f3-4a61-981f-3a1fa3fed266",
+ "metadata": {},
+ "source": [
+ "### PCAs\n",
+ "\n",
+ "Another _tskit_ feature is the ability to perform efficient matrix-vector multiplications on the genetic relatedness matrix. This enables a very efficient implementation of genetic principle components analysis, even for huge datasets. Here's an example (requires tskit 1.0 or greater)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "94863202-171e-4bd6-bdef-cb239dd1d233",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "principal_components = chimp_arg.pca(2).factors\n",
+ "for pop in chimp_arg.populations():\n",
+ " samples = chimp_arg.samples(population=pop.id)\n",
+ " if len(samples) > 0:\n",
+ " plt.scatter(\n",
+ " principal_components[samples, 0],\n",
+ " principal_components[samples, 1],\n",
+ " c=colours[pop.metadata[\"name\"]],\n",
+ " label=pop.metadata[\"name\"],\n",
+ " alpha=0.5,\n",
+ " )\n",
+ " plt.xlabel(\"PCA 1\")\n",
+ " plt.ylabel(\"PCA 2\")\n",
+ " plt.legend()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ee747f06-3f52-4bf1-b1f5-e13426316307",
+ "metadata": {},
+ "source": [
+ "## SLiM simulations\n",
+ "\n",
+ "If you want to add complex selection or demography to a simulation (which can be done in _stdpopsim_), you will need to use a forward simulator like the SLiM simulation engine.\n",
+ "\n",
+ "SLiM is a separate program that won't work in the browser, so we can't perform forward-time simulations easily in this workbook. Instead, we'll load up the results of a previous SLiM simulation. Although _tskit_ `.trees` files are quite small, there is also an additional compression program, _tszip_, which specializes in making them even smaller. It's common to use this for saving and loading large ARGs in `.tsz` format, but _tszip_ will also quite happily load normal `.trees` files too, so it's not a bad idea to get use to using it to load any tree sequence:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b2651eff-5f88-45f3-a5dd-844e6a76652c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import tszip\n",
+ "# Running SLiM is an advanced topic beyond the scope of this workbook\n",
+ "slim_arg = tszip.load(\"data/SLiM_sim.tsz\")\n",
+ "\n",
+ "print(\n",
+ " f\"Loaded a SLiM simulated ARG with {slim_arg.num_samples} sampled genomes, \" +\n",
+ " f\"{slim_arg.num_trees} trees, \" +\n",
+ " f\"and {slim_arg.num_mutations} mutations.\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4f4274bd-b2a9-427f-9374-f5449fb3a403",
+ "metadata": {},
+ "source": [
+ "- Exercise T
\n",
+ " - Get the first tree in the ARG using
slim_arg.first(), and then plot it .draw_svg(). As there are a lot of samples, you might also want to set size=(1000, 400) and node_labels={}.\n",
+ " \n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1c9ba680-137f-4219-adbb-b7cc03d27110",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Complete Exercise T here (plot the first tree in the slim_arg)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3a693f6d-789b-433f-9481-c38a33c52728",
+ "metadata": {},
+ "source": [
+ "## Recapitation\n",
+ "\n",
+ "You can see that the simulation needed more burn-in, as the lineages have not all coalesced (in _tskit_ parlance, the trees have **multiple roots**). We can add the correct amount of (neutral) burn-in using _msprime_, which is usually much faster than SLiM. This is called [recapitation](https://tskit.dev/pyslim/docs/latest/tutorial.html). Here's a quick introduction.\n",
+ "\n",
+ "You can carry out recapitation using `msprime.sim_ancestry()`, but there is a convenient wrapper provided by the _pyslim_ Python library:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3d64f9ed-0fbc-464f-aa38-75852ea92cb4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pyslim\n",
+ "\n",
+ "# ignore the warning the command below generates: see the docs\n",
+ "coalesced_arg = pyslim.recapitate(slim_arg, recombination_rate=1e-8, ancestral_Ne=200, random_seed=5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "59a4e996-a83c-4993-a486-4ad8c438a560",
+ "metadata": {},
+ "source": [
+ "You can now check that the simulation has indeed fully coalesced:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e0478332-2970-4c9f-8c95-714358bae5af",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Check the first tree visually\n",
+ "coalesced_arg.first().draw_svg(size=(1000, 400), node_labels={})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ca6cc43c-3223-4b4d-bb4f-9e37bea37980",
+ "metadata": {},
+ "source": [
+ "- Exercise U
\n",
+ " - It's all very well checking by eye, but you probably want to make sure that all the trees (not just the first) have coalesced. It's very useful to get into the habit of putting lots of
assert statements into your code, to sanity check your logic. An assert statement should always be True: your code will stop otherwise: it's is a great way to avoid mistakes.\n",
+ " Check that all the trees in the coalesced_arg have a single root by iterating through the trees and calling assert tree.num_roots == 1 for each tree.\n",
+ " \n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f712d52b-a743-4282-aa2f-caa7b8e16d6a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Complete Exercise U here (put in an assert to check that recapitation has worked)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "82995d95-6d1d-4df0-8ef7-eacea2cde979",
+ "metadata": {},
+ "source": [
+ "Now that we have a fully coalescent ARG, we can e.g. add neutral mutations, to create realistic genetic variation:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c1a3cf07-b220-4eb9-ae71-50ccf0277c72",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "mu = 1e-8\n",
+ "mutated_slim_sim = msprime.sim_mutations(coalesced_arg, rate=mu, random_seed=123)\n",
+ "print(f\"Added {mutated_slim_sim.num_mutations} mutations to the ARG\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "24c3f5cc-9fcd-4824-942e-9fe1fd9013b8",
+ "metadata": {},
+ "source": [
+ "Now that we have a fully coalesced ARG, we can calculate stats such as windowed diversity measures (both branch- and site-based):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6d9308f2-4034-4078-b36e-0ba0ca61293a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "genome_windows = np.linspace(0, mutated_slim_sim.sequence_length, 1001)\n",
+ "step = genome_windows[1] - genome_windows[0]\n",
+ "site_diversity = mutated_slim_sim.diversity(windows=genome_windows)\n",
+ "branch_diversity = mutated_slim_sim.diversity(windows=genome_windows, mode=\"branch\")\n",
+ "scaled_branch_diversity = branch_diversity * mu\n",
+ "plt.title(f\"Genetic diversity in {step / 1000:.0f}kb genome windows\")\n",
+ "plt.stairs(site_diversity, genome_windows, baseline=None, ls=\":\", label=\"site-based\")\n",
+ "plt.stairs(scaled_branch_diversity, genome_windows, baseline=None, label=\"branch-based\")\n",
+ "plt.legend();"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fe3b94eb-5fa1-4a7f-9901-eabe5bda4fec",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_quiz(Workbook2.url + \"Q30.json\", shuffle_answers=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5904c3f5-2315-49bb-a7d4-bb7ceb0844f6",
+ "metadata": {},
+ "source": [
+ "Recapitation can be a game-changer when simulating realistic populations, saving many days of simulation time. \n",
+ "\n",
+ "# More information\n",
+ "\n",
+ "That's the end of the workbook. Theere is a lot more official documentation at https://tskit.dev/tskit/docs/, and an ever-growing set of tutorials at https://tskit.dev/tutorials. More help to write tutorials is also most welcome! Good luck on your ARG journey."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/content/data/SLiM_sim.tsz b/content/data/SLiM_sim.tsz
new file mode 100644
index 0000000..a462ae2
Binary files /dev/null and b/content/data/SLiM_sim.tsz differ
diff --git a/content/data/Xpos.json b/content/data/Xpos.json
new file mode 100644
index 0000000..0778d99
--- /dev/null
+++ b/content/data/Xpos.json
@@ -0,0 +1 @@
+{"data": {"nodes": [{"size":150,"symbol":"d3.symbolSquare","fill":"gray","stroke":"#053e4e","stroke_width":4,"id":0,"ts_flags":1,"time":0,"child_of":[12],"parent_of":[],"x_pos_reference":-1,"label":"A","x_pos_01":0,"fx":100,"fy":450,"y":450,"index":0,"x":100,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolSquare","fill":"gray","stroke":"#053e4e","stroke_width":4,"id":1,"ts_flags":1,"time":0,"child_of":[12],"parent_of":[],"x_pos_reference":-1,"label":"B","x_pos_01":0.11,"fx":144,"fy":450,"y":450,"index":1,"x":144,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolSquare","fill":"gray","stroke":"#053e4e","stroke_width":4,"id":2,"ts_flags":1,"time":0,"child_of":[13],"parent_of":[],"x_pos_reference":-1,"label":"C","x_pos_01":0.2225,"fx":189,"fy":450,"y":450,"index":2,"x":189,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolSquare","fill":"gray","stroke":"#053e4e","stroke_width":4,"id":3,"ts_flags":1,"time":0,"child_of":[13],"parent_of":[],"x_pos_reference":-1,"label":"D","x_pos_01":0.3325,"fx":233,"fy":450,"y":450,"index":3,"x":233,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolSquare","fill":"gray","stroke":"#053e4e","stroke_width":4,"id":4,"ts_flags":1,"time":0,"child_of":[17],"parent_of":[],"x_pos_reference":-1,"label":"E","x_pos_01":0.445,"fx":278,"fy":450,"y":450,"index":4,"x":278,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolSquare","fill":"gray","stroke":"#053e4e","stroke_width":4,"id":5,"ts_flags":1,"time":0,"child_of":[15],"parent_of":[],"x_pos_reference":-1,"label":"F","x_pos_01":0.555,"fx":322,"fy":450,"y":450,"index":5,"x":322,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolSquare","fill":"gray","stroke":"#053e4e","stroke_width":4,"id":6,"ts_flags":1,"time":0,"child_of":[10],"parent_of":[],"x_pos_reference":-1,"label":"G","x_pos_01":0.6675,"fx":367,"fy":450,"y":450,"index":6,"x":367,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolSquare","fill":"gray","stroke":"#053e4e","stroke_width":4,"id":7,"ts_flags":1,"time":0,"child_of":[16],"parent_of":[],"x_pos_reference":-1,"label":"H","x_pos_01":0.7775,"fx":411,"fy":450,"y":450,"index":7,"x":411,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolSquare","fill":"gray","stroke":"#053e4e","stroke_width":4,"id":8,"ts_flags":1,"time":0,"child_of":[14],"parent_of":[],"x_pos_reference":-1,"label":"I","x_pos_01":0.89,"fx":456,"fy":450,"y":450,"index":8,"x":456,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolSquare","fill":"gray","stroke":"#053e4e","stroke_width":4,"id":9,"ts_flags":1,"time":0,"child_of":[14],"parent_of":[],"x_pos_reference":-1,"label":"J","x_pos_01":1,"fx":500,"fy":450,"y":450,"index":9,"x":500,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolCircle","fill":"red","stroke":"#053e4e","stroke_width":4,"id":10,"ts_flags":131072,"time":260.96659439832086,"child_of":[15,18],"parent_of":[6],"x_pos_reference":6,"label":"10/11","x_pos_01":0.6622832555898676,"fx":362.913302235947,"fy":430.9523809523809,"y":430.9523809523809,"index":10,"x":362.913302235947,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolCircle","fill":"green","stroke":"#053e4e","stroke_width":4,"id":12,"ts_flags":0,"time":285.00440802731634,"child_of":[24],"parent_of":[0,1],"x_pos_reference":-1,"label":"12","x_pos_01":0.0625,"fx":123,"fy":411.9047619047619,"y":411.9047619047619,"index":11,"x":123,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolCircle","fill":"green","stroke":"#053e4e","stroke_width":4,"id":13,"ts_flags":0,"time":701.5060898687332,"child_of":[22],"parent_of":[2,3],"x_pos_reference":-1,"label":"13","x_pos_01":0.33246119682279024,"fx":232.9844787291161,"fy":392.8571428571429,"y":392.8571428571429,"index":12,"x":232.9844787291161,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolCircle","fill":"green","stroke":"#053e4e","stroke_width":4,"id":14,"ts_flags":0,"time":1317.349207643829,"child_of":[16],"parent_of":[8,9],"x_pos_reference":-1,"label":"14","x_pos_01":0.9225,"fx":469,"fy":373.8095238095238,"y":373.8095238095238,"index":13,"x":469,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolCircle","fill":"blue","stroke":"#053e4e","stroke_width":4,"id":15,"ts_flags":0,"time":2305.1656170029473,"child_of":[17],"parent_of":[5,10],"x_pos_reference":-1,"label":"15","x_pos_01":0.5754008461000589,"fx":330.16033844002357,"fy":354.76190476190476,"y":354.76190476190476,"index":14,"x":330.16033844002357,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolCircle","fill":"green","stroke":"#053e4e","stroke_width":4,"id":16,"ts_flags":0,"time":3328.6926345982015,"child_of":[19],"parent_of":[7,14],"x_pos_reference":-1,"label":"16","x_pos_01":0.8487065181366357,"fx":439.4826072546543,"fy":335.7142857142857,"y":335.7142857142857,"index":15,"x":439.4826072546543,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolCircle","fill":"green","stroke":"#053e4e","stroke_width":4,"id":17,"ts_flags":0,"time":5571.924375786866,"child_of":[18],"parent_of":[4,15],"x_pos_reference":-1,"label":"17","x_pos_01":0.540679575944617,"fx":316.2718303778468,"fy":316.6666666666667,"y":316.6666666666667,"index":16,"x":316.2718303778468,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolCircle","fill":"blue","stroke":"#053e4e","stroke_width":4,"id":18,"ts_flags":0,"time":8071.453141015846,"child_of":[19],"parent_of":[10,17],"x_pos_reference":-1,"label":"18","x_pos_01":0.6877340952481648,"fx":364.0936380992659,"fy":297.6190476190476,"y":297.6190476190476,"index":17,"x":364.0936380992659,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolCircle","fill":"green","stroke":"#053e4e","stroke_width":4,"id":19,"ts_flags":0,"time":9353.57671961292,"child_of":[20],"parent_of":[16,18],"x_pos_reference":-1,"label":"19","x_pos_01":0.6998522306081094,"fx":379.94089224324375,"fy":278.57142857142856,"y":278.57142857142856,"index":18,"x":379.94089224324375,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolCircle","fill":"red","stroke":"#053e4e","stroke_width":4,"id":20,"ts_flags":131072,"time":12746.991597314627,"child_of":[22,23],"parent_of":[19],"x_pos_reference":19,"label":"20/21","x_pos_01":0.6832992736849367,"fx":358.3197094739747,"fy":259.5238095238095,"y":259.5238095238095,"index":19,"x":358.3197094739747,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolCircle","fill":"blue","stroke":"#053e4e","stroke_width":4,"id":22,"ts_flags":0,"time":14855.606219936064,"child_of":[23],"parent_of":[13,20],"x_pos_reference":-1,"label":"22","x_pos_01":0.5250299801848809,"fx":287.01199207395234,"fy":240.4761904761905,"y":240.4761904761905,"index":20,"x":287.01199207395234,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolCircle","fill":"blue","stroke":"#053e4e","stroke_width":4,"id":23,"ts_flags":0,"time":17943.121444938482,"child_of":[27],"parent_of":[20,22],"x_pos_reference":-1,"label":"23","x_pos_01":0.7319396507413578,"fx":352.7758602965431,"fy":221.42857142857144,"y":221.42857142857144,"index":21,"x":352.7758602965431,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolCircle","fill":"red","stroke":"#053e4e","stroke_width":4,"id":24,"ts_flags":131072,"time":26708.660591874304,"child_of":[26],"parent_of":[12],"x_pos_reference":12,"label":"24/25","x_pos_01":0.06,"fx":121,"fy":202.38095238095238,"y":202.38095238095238,"index":22,"x":121,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolCircle","fill":"cyan","stroke":"#053e4e","stroke_width":4,"id":26,"ts_flags":262144,"time":29380.27264413545,"child_of":[31],"parent_of":[24],"x_pos_reference":24,"label":"26","x_pos_01":0.065,"fx":120,"fy":183.33333333333334,"y":183.33333333333334,"index":23,"x":120,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolCircle","fill":"red","stroke":"#053e4e","stroke_width":4,"id":27,"ts_flags":131072,"time":31775.63631117215,"child_of":[29,35],"parent_of":[23],"x_pos_reference":23,"label":"27/28","x_pos_01":0.48308464065253004,"fx":319.233856261012,"fy":164.28571428571433,"y":164.28571428571433,"index":24,"x":319.233856261012,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolCircle","fill":"red","stroke":"#053e4e","stroke_width":4,"id":29,"ts_flags":131072,"time":36736.15530948005,"child_of":[31,34],"parent_of":[27],"x_pos_reference":-1,"label":"29/30","x_pos_01":0.31041285989931866,"fx":292.16514395972746,"fy":145.23809523809524,"y":145.23809523809524,"index":25,"x":292.16514395972746,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolCircle","fill":"blue","stroke":"#053e4e","stroke_width":4,"id":31,"ts_flags":0,"time":38472.416155934014,"child_of":[32],"parent_of":[26,29],"x_pos_reference":-1,"label":"31","x_pos_01":0.0725,"fx":100,"fy":126.19047619047619,"y":126.19047619047619,"index":26,"x":100,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolCircle","fill":"red","stroke":"#053e4e","stroke_width":4,"id":32,"ts_flags":131072,"time":45017.20211161007,"child_of":[34,36],"parent_of":[31],"x_pos_reference":31,"label":"32/33","x_pos_01":0.1190921642475135,"fx":150.6368656990054,"fy":107.14285714285717,"y":107.14285714285717,"index":27,"x":150.6368656990054,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolCircle","fill":"blue","stroke":"#053e4e","stroke_width":4,"id":34,"ts_flags":0,"time":45633.2810080566,"child_of":[35],"parent_of":[29,32],"x_pos_reference":-1,"label":"34","x_pos_01":0.2524612411663699,"fx":227.98449646654797,"fy":88.09523809523813,"y":88.09523809523813,"index":28,"x":227.98449646654797,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolCircle","fill":"cyan","stroke":"#053e4e","stroke_width":4,"id":35,"ts_flags":262144,"time":49893.08581725717,"child_of":[36],"parent_of":[27,34],"x_pos_reference":-1,"label":"35","x_pos_01":0.49857760871801815,"fx":252.43104348720726,"fy":69.04761904761907,"y":69.04761904761907,"index":29,"x":252.43104348720726,"vy":0,"vx":0},{"size":150,"symbol":"d3.symbolCircle","fill":"green","stroke":"#053e4e","stroke_width":4,"id":36,"ts_flags":0,"time":53362.27669424039,"child_of":[],"parent_of":[32,35],"x_pos_reference":-1,"label":"36","x_pos_01":0.13839774126760815,"fx":254.35909650704326,"fy":50,"y":50,"index":30,"x":254.35909650704326,"vy":0,"vx":0}], "links": [{"id": 0, "source": 10, "source_time": 260.96659439832086, "target": 6, "target_time": 0.0, "bounds": "0.0-930.0 930.0-1000.0", "alt_parent": -1, "alt_child": -1, "region_fraction": 1.0, "stroke": "#053e4e"}, {"id": 1, "source": 12, "source_time": 285.00440802731634, "target": 0, "target_time": 0.0, "bounds": "0.0-1000.0", "alt_parent": -1, "alt_child": 1, "region_fraction": 1.0, "stroke": "#053e4e"}, {"id": 2, "source": 12, "source_time": 285.00440802731634, "target": 1, "target_time": 0.0, "bounds": "0.0-1000.0", "alt_parent": -1, "alt_child": 0, "region_fraction": 1.0, "stroke": "#053e4e"}, {"id": 3, "source": 13, "source_time": 701.5060898687332, "target": 2, "target_time": 0.0, "bounds": "0.0-1000.0", "alt_parent": -1, "alt_child": 3, "region_fraction": 1.0, "stroke": "#053e4e"}, {"id": 4, "source": 13, "source_time": 701.5060898687332, "target": 3, "target_time": 0.0, "bounds": "0.0-1000.0", "alt_parent": -1, "alt_child": 2, "region_fraction": 1.0, "stroke": "#053e4e"}, {"id": 5, "source": 14, "source_time": 1317.349207643829, "target": 8, "target_time": 0.0, "bounds": "0.0-1000.0", "alt_parent": -1, "alt_child": 9, "region_fraction": 1.0, "stroke": "#053e4e"}, {"id": 6, "source": 14, "source_time": 1317.349207643829, "target": 9, "target_time": 0.0, "bounds": "0.0-1000.0", "alt_parent": -1, "alt_child": 8, "region_fraction": 1.0, "stroke": "#053e4e"}, {"id": 7, "source": 15, "source_time": 2305.1656170029473, "target": 5, "target_time": 0.0, "bounds": "0.0-1000.0", "alt_parent": -1, "alt_child": 10, "region_fraction": 1.0, "stroke": "#053e4e"}, {"id": 8, "source": 15, "source_time": 2305.1656170029473, "target": 10, "target_time": 260.96659439832086, "bounds": "0.0-930.0", "alt_parent": 18, "alt_child": 5, "region_fraction": 0.93, "stroke": "#053e4e"}, {"id": 9, "source": 16, "source_time": 3328.6926345982015, "target": 7, "target_time": 0.0, "bounds": "0.0-1000.0", "alt_parent": -1, "alt_child": 14, "region_fraction": 1.0, "stroke": "#053e4e"}, {"id": 10, "source": 16, "source_time": 3328.6926345982015, "target": 14, "target_time": 1317.349207643829, "bounds": "0.0-1000.0", "alt_parent": -1, "alt_child": 7, "region_fraction": 1.0, "stroke": "#053e4e"}, {"id": 11, "source": 17, "source_time": 5571.924375786866, "target": 4, "target_time": 0.0, "bounds": "0.0-1000.0", "alt_parent": -1, "alt_child": 15, "region_fraction": 1.0, "stroke": "#053e4e"}, {"id": 12, "source": 17, "source_time": 5571.924375786866, "target": 15, "target_time": 2305.1656170029473, "bounds": "0.0-1000.0", "alt_parent": -1, "alt_child": 4, "region_fraction": 1.0, "stroke": "#053e4e"}, {"id": 13, "source": 18, "source_time": 8071.453141015846, "target": 10, "target_time": 260.96659439832086, "bounds": "930.0-1000.0", "alt_parent": 15, "alt_child": 17, "region_fraction": 0.07, "stroke": "#053e4e"}, {"id": 14, "source": 18, "source_time": 8071.453141015846, "target": 17, "target_time": 5571.924375786866, "bounds": "0.0-1000.0", "alt_parent": -1, "alt_child": 10, "region_fraction": 1.0, "stroke": "#053e4e"}, {"id": 15, "source": 19, "source_time": 9353.57671961292, "target": 16, "target_time": 3328.6926345982015, "bounds": "0.0-1000.0", "alt_parent": -1, "alt_child": 18, "region_fraction": 1.0, "stroke": "#053e4e"}, {"id": 16, "source": 19, "source_time": 9353.57671961292, "target": 18, "target_time": 8071.453141015846, "bounds": "0.0-1000.0", "alt_parent": -1, "alt_child": 16, "region_fraction": 1.0, "stroke": "#053e4e"}, {"id": 17, "source": 20, "source_time": 12746.991597314627, "target": 19, "target_time": 9353.57671961292, "bounds": "0.0-758.0 758.0-1000.0", "alt_parent": -1, "alt_child": -1, "region_fraction": 1.0, "stroke": "#053e4e"}, {"id": 18, "source": 22, "source_time": 14855.606219936064, "target": 13, "target_time": 701.5060898687332, "bounds": "0.0-1000.0", "alt_parent": -1, "alt_child": 20, "region_fraction": 1.0, "stroke": "#053e4e"}, {"id": 19, "source": 22, "source_time": 14855.606219936064, "target": 20, "target_time": 12746.991597314627, "bounds": "0.0-758.0", "alt_parent": 23, "alt_child": 13, "region_fraction": 0.758, "stroke": "#053e4e"}, {"id": 20, "source": 23, "source_time": 17943.121444938482, "target": 20, "target_time": 12746.991597314627, "bounds": "758.0-1000.0", "alt_parent": 22, "alt_child": 22, "region_fraction": 0.242, "stroke": "#053e4e"}, {"id": 21, "source": 23, "source_time": 17943.121444938482, "target": 22, "target_time": 14855.606219936064, "bounds": "0.0-1000.0", "alt_parent": -1, "alt_child": 20, "region_fraction": 1.0, "stroke": "#053e4e"}, {"id": 22, "source": 24, "source_time": 26708.660591874304, "target": 12, "target_time": 285.00440802731634, "bounds": "0.0-602.0 602.0-1000.0", "alt_parent": -1, "alt_child": -1, "region_fraction": 1.0, "stroke": "#053e4e"}, {"id": 23, "source": 26, "source_time": 29380.27264413545, "target": 24, "target_time": 26708.660591874304, "bounds": "0.0-602.0", "alt_parent": 26, "alt_child": 24, "region_fraction": 0.602, "stroke": "#053e4e"}, {"id": 24, "source": 26, "source_time": 29380.27264413545, "target": 24, "target_time": 26708.660591874304, "bounds": "602.0-1000.0", "alt_parent": 26, "alt_child": 24, "region_fraction": 0.398, "stroke": "#053e4e"}, {"id": 25, "source": 27, "source_time": 31775.63631117215, "target": 23, "target_time": 17943.121444938482, "bounds": "0.0-865.0 865.0-1000.0", "alt_parent": -1, "alt_child": -1, "region_fraction": 1.0, "stroke": "#053e4e"}, {"id": 26, "source": 29, "source_time": 36736.15530948005, "target": 27, "target_time": 31775.63631117215, "bounds": "0.0-70.0 70.0-865.0", "alt_parent": 35, "alt_child": -1, "region_fraction": 0.865, "stroke": "#053e4e"}, {"id": 27, "source": 31, "source_time": 38472.416155934014, "target": 26, "target_time": 29380.27264413545, "bounds": "0.0-1000.0", "alt_parent": -1, "alt_child": 29, "region_fraction": 1.0, "stroke": "#053e4e"}, {"id": 28, "source": 31, "source_time": 38472.416155934014, "target": 29, "target_time": 36736.15530948005, "bounds": "0.0-70.0", "alt_parent": 34, "alt_child": 26, "region_fraction": 0.07, "stroke": "#053e4e"}, {"id": 29, "source": 32, "source_time": 45017.20211161007, "target": 31, "target_time": 38472.416155934014, "bounds": "70.0-288.0 288.0-1000.0", "alt_parent": -1, "alt_child": -1, "region_fraction": 0.93, "stroke": "#053e4e"}, {"id": 30, "source": 34, "source_time": 45633.2810080566, "target": 29, "target_time": 36736.15530948005, "bounds": "70.0-865.0", "alt_parent": 31, "alt_child": 32, "region_fraction": 0.795, "stroke": "#053e4e"}, {"id": 31, "source": 34, "source_time": 45633.2810080566, "target": 32, "target_time": 45017.20211161007, "bounds": "70.0-288.0", "alt_parent": 36, "alt_child": 29, "region_fraction": 0.218, "stroke": "#053e4e"}, {"id": 32, "source": 35, "source_time": 49893.08581725717, "target": 27, "target_time": 31775.63631117215, "bounds": "865.0-1000.0", "alt_parent": 29, "alt_child": 34, "region_fraction": 0.135, "stroke": "#053e4e"}, {"id": 33, "source": 35, "source_time": 49893.08581725717, "target": 34, "target_time": 45633.2810080566, "bounds": "288.0-865.0", "alt_parent": -1, "alt_child": 27, "region_fraction": 0.577, "stroke": "#053e4e"}, {"id": 34, "source": 36, "source_time": 53362.27669424039, "target": 32, "target_time": 45017.20211161007, "bounds": "288.0-1000.0", "alt_parent": 34, "alt_child": 35, "region_fraction": 0.712, "stroke": "#053e4e"}, {"id": 35, "source": 36, "source_time": 53362.27669424039, "target": 35, "target_time": 49893.08581725717, "bounds": "288.0-1000.0", "alt_parent": -1, "alt_child": 32, "region_fraction": 0.712, "stroke": "#053e4e"}], "mutations": [], "breakpoints": [{"id": 0, "start": 0.0, "stop": 70.0, "x_pos_01": 0.0, "width_01": 0.07, "fill": "#053e4e", "x_pos": 50.0, "width": 35.0, "included": "true"}, {"id": 1, "start": 70.0, "stop": 288.0, "x_pos_01": 0.07, "width_01": 0.218, "fill": "#053e4e", "x_pos": 85.0, "width": 109.0, "included": "true"}, {"id": 2, "start": 288.0, "stop": 602.0, "x_pos_01": 0.288, "width_01": 0.314, "fill": "#053e4e", "x_pos": 194.0, "width": 157.0, "included": "true"}, {"id": 3, "start": 602.0, "stop": 758.0, "x_pos_01": 0.602, "width_01": 0.156, "fill": "#053e4e", "x_pos": 351.0, "width": 78.0, "included": "true"}, {"id": 4, "start": 758.0, "stop": 865.0, "x_pos_01": 0.758, "width_01": 0.107, "fill": "#053e4e", "x_pos": 429.0, "width": 53.5, "included": "true"}, {"id": 5, "start": 865.0, "stop": 930.0, "x_pos_01": 0.865, "width_01": 0.065, "fill": "#053e4e", "x_pos": 482.5, "width": 32.5, "included": "true"}, {"id": 6, "start": 930.0, "stop": 1000.0, "x_pos_01": 0.93, "width_01": 0.07, "fill": "#053e4e", "x_pos": 515.0, "width": 35.0, "included": "true"}], "evenly_distributed_positions": [100.0, 144.0, 189.0, 233.0, 278.0, 322.0, 367.0, 411.0, 456.0, 500.0]}, "width": 550, "height": 575, "y_axis": {"include_labels": "true", "ticks": [450.0, 430.9523809523809, 411.9047619047619, 392.8571428571429, 373.8095238095238, 354.76190476190476, 335.7142857142857, 316.6666666666667, 297.6190476190476, 278.57142857142856, 259.5238095238095, 240.4761904761905, 221.42857142857144, 202.38095238095238, 183.33333333333334, 164.28571428571433, 145.23809523809524, 126.19047619047619, 107.14285714285717, 88.09523809523813, 69.04761904761907, 50.0], "text": [0, 261, 285, 702, 1317, 2305, 3329, 5572, 8071, 9354, 12747, 14856, 17943, 26709, 29380, 31776, 36736, 38472, 45017, 45633, 49893, 53362], "max_min": [450.0, 50.0], "scale": "rank"}, "edges": {"type": "line", "variable_width": "false", "include_underlink": "true"}, "condense_mutations": "false", "label_mutations": "false", "tree_highlighting": "true", "title": "None", "rotate_tip_labels": "false", "plot_type": "full", "default_node_style": {"size": 150, "symbol": "d3.symbolCircle", "fill": "#1eebb1", "stroke": "#053e4e", "stroke_width": 4}}
\ No newline at end of file
diff --git a/content/data/example_ARG.trees b/content/data/example_ARG.trees
new file mode 100644
index 0000000..5ea2a8e
Binary files /dev/null and b/content/data/example_ARG.trees differ
diff --git a/content/json/WhatIsAnARG/Workbook1/Q1.json b/content/json/WhatIsAnARG/Workbook1/Q1.json
new file mode 100644
index 0000000..f116add
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook1/Q1.json
@@ -0,0 +1,8 @@
+[{
+ "question": "What genomic span is passed on to sample node 6 through the route via node 15? The span is the total length of the genome region passed on (i.e. the right minus the left position)",
+ "type": "numeric",
+ "answers": [
+ {"type": "value", "value": 930, "correct": true, "feedback": "Yes, it covers positions 0 to 930."},
+ {"type": "default", "feedback": "Try hovering over the edges below node 15."}
+ ]
+}]
diff --git a/content/json/WhatIsAnARG/Workbook1/Q10.json b/content/json/WhatIsAnARG/Workbook1/Q10.json
new file mode 100644
index 0000000..221f948
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook1/Q10.json
@@ -0,0 +1,28 @@
+[{
+ "question": "In the ARG above, what mutation groups together samples 7, 8, and 9?",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "C mutates to A at position 215", "correct": true},
+ {"answer": "A mutates to C at position 215", "correct": false, "feedback": "You have the position right, but the derived state is A."},
+ {"answer": "G mutates to T at position 92", "correct": false},
+ {"answer": "C mutates to G at position 560", "correct": false}
+ ]}, {
+ "question": "What ARG node does that mutation allow us to infer?",
+ "type": "numeric",
+ "answers": [
+ {"type": "value", "value": 16, "correct": true, "feedback": "Yes, it's the node below the mutation."},
+ {"type": "value", "value": 19, "correct": false, "feedback": "No, that node groups together more than 7, 8, and 9."}
+ ]},{
+ "question": "Do any mutations allow us to resolve the relationship between samples 4, 5, and 6?",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "No", "correct": true, "feedback": "In fact, we can't even resolve the relative closeness of 4 versus 5 versus 6 to the (7,8,9) group"},
+ {"answer": "Yes", "correct": false}
+ ]}, {
+ "question": "How many mutations are uninformative about the structure (topology) of the ARG?",
+ "type": "numeric",
+ "answers": [
+ {"type": "value", "value": 2, "correct": true, "feedback": "Yes, the two so-called singletons, immediately above node 3 and node 4."},
+ {"type": "default", "feedback": "Incorrect (hint: mutations above a single node do not group samples together)."}
+ ]
+}]
\ No newline at end of file
diff --git a/content/json/WhatIsAnARG/Workbook1/Q11.json b/content/json/WhatIsAnARG/Workbook1/Q11.json
new file mode 100644
index 0000000..5d16cdd
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook1/Q11.json
@@ -0,0 +1,11 @@
+[{
+ "question": "What is the interpretation of the genetic diversity, π in terms of branch lengths?",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "The average time of all internal nodes", "correct": false},
+ {"answer": "Twice the average TMRCA between all pairs of samples", "correct": true},
+ {"answer": "The average branch lengths between all pairs of samples", "correct": true},
+ {"answer": "The span-weighted average branch length of the local trees", "correct": false,
+ "feedback": "This is the equivalent of the number of segregating sites (under the infinite-sites model)"}
+ ]
+}]
\ No newline at end of file
diff --git a/content/json/WhatIsAnARG/Workbook1/Q2.json b/content/json/WhatIsAnARG/Workbook1/Q2.json
new file mode 100644
index 0000000..de71b2f
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook1/Q2.json
@@ -0,0 +1,25 @@
+[{
+ "question": "What does SPR stand for?",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "Subtree Prune and Regraft", "correct": true},
+ {"answer": "Strand Pairing Resolution", "correct": false},
+ {"answer": "Single-Point Reversion", "correct": false},
+ {"answer": "Slippery Puzzle Reorganization", "correct": false, "feedback": "???."}
+ ]}, {
+ "question": "What is the TMRCA, or time to the most recent common ancestor (in generations) between nodes 0 and 9 at the left hand end of this ARG?",
+ "type": "numeric",
+ "answers": [
+ {"type": "value", "value": 38472, "correct": true},
+ {"type": "value", "value": 31, "correct": false, "feedback": "That's the node ID. What time is associated with that node."},
+ {"type": "default", "feedback": "Try hovering over the last part of the genome, and looking at the axis labels."}
+ ]},{
+ "question": "What is the name of the structure in which two lineages split but immediately re-join, as seen just below node 26",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "Diamond", "correct": true},
+ {"answer": "Loop", "correct": false},
+ {"answer": "Bubble", "correct": false, "feedback": "This term is sometimes used, but is not standard"},
+ {"answer": "Cycle", "correct": false}
+ ]
+}]
\ No newline at end of file
diff --git a/content/json/WhatIsAnARG/Workbook1/Q3.json b/content/json/WhatIsAnARG/Workbook1/Q3.json
new file mode 100644
index 0000000..b10f0e7
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook1/Q3.json
@@ -0,0 +1,8 @@
+[{
+ "question": "(Hard!) Only one recombination event results in a tree that shows a new topological relationship between the samples. By looking at the graph and the local trees, can you identify which one it is? Enter its breakpoint position below:",
+ "type": "numeric",
+ "answers": [
+ {"type": "value", "value": 930, "correct": true, "feedback": "Yes (coincidentally this is the last breakpoint)"},
+ {"type": "default", "feedback": "Hint: look at the breakpoints on the X axis of the tree-by-tree plot: which marks the transition between two differently shaped trees?"}
+ ]
+}]
\ No newline at end of file
diff --git a/content/json/WhatIsAnARG/Workbook1/Q4.json b/content/json/WhatIsAnARG/Workbook1/Q4.json
new file mode 100644
index 0000000..bca785a
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook1/Q4.json
@@ -0,0 +1,8 @@
+[{
+ "question": "The first two breakpoints along the genome happen to correspond to recombination at the top of the ARG. How many different root heights does this cause?",
+ "type": "numeric",
+ "answers": [
+ {"type": "value", "value": 3, "correct": true},
+ {"type": "default", "feedback": "Hint: look at the local trees"}
+ ]
+}]
\ No newline at end of file
diff --git a/content/json/WhatIsAnARG/Workbook1/Q5.json b/content/json/WhatIsAnARG/Workbook1/Q5.json
new file mode 100644
index 0000000..4cc7aae
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook1/Q5.json
@@ -0,0 +1,30 @@
+[{
+ "question": "How would you classify the recombination node(s) 20/21",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "Non-coalescent / always unary", "correct": true, "feedback": "By construction, in a full ARG recombination nodes are never coalescent"},
+ {"answer": "Partially-coalescent / locally unary", "correct": false},
+ {"answer": "All-coalescent / never unary", "correct": false}
+ ]},{
+ "question": "How would you classify node 17",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "Non-coalescent / always unary", "correct": false},
+ {"answer": "Part-coalescent / locally unary", "correct": false},
+ {"answer": "All-coalescent / never unary", "correct": true}
+ ]},{
+ "question": "How would you classify node 26",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "Non-coalescent / always unary", "correct": true, "feedback": "Some 'common ancestor' nodes with 2 children in the full ARG never represent local coalescence"},
+ {"answer": "Part-coalescent / locally unary", "correct": false},
+ {"answer": "All-coalescent / never unary", "correct": false}
+ ]},{
+ "question": "How would you classify node 15",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "Non-coalescent / always unary", "correct": false},
+ {"answer": "Part-coalescent / locally unary", "correct": true, "feedback": "It is only coalescent on the left side of the genome: it is unary to the right of position 930"},
+ {"answer": "All-coalescent / never unary", "correct": false}
+ ]
+}]
\ No newline at end of file
diff --git a/content/json/WhatIsAnARG/Workbook1/Q6.json b/content/json/WhatIsAnARG/Workbook1/Q6.json
new file mode 100644
index 0000000..7ba220a
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook1/Q6.json
@@ -0,0 +1,8 @@
+[{
+ "question": "In the population-coloured ARG plot, what colour do you think represents nodes from the African population",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "green", "correct": true, "feedback": "Yes, the deepest divergences are African: in fact, even between the European genomes, some coalescences trace back into Africa"},
+ {"answer": "blue", "correct": false},
+ ]
+}]
\ No newline at end of file
diff --git a/content/json/WhatIsAnARG/Workbook1/Q7.json b/content/json/WhatIsAnARG/Workbook1/Q7.json
new file mode 100644
index 0000000..e542509
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook1/Q7.json
@@ -0,0 +1,8 @@
+[{
+ "question": "What is the name of the first individual (ID=0), as stored in the individual's metadata?",
+ "type": "string",
+ "answers": [
+ {"answer": "Bob", "correct": true, "match_case": false},
+ {"answer": "Eve", "correct": false, "match_case": false, "feedback": "Incorrect: remember, tskit uses a zero-based numbering system, so the first individual has ID=0, not ID=1"}
+ ]
+}]
\ No newline at end of file
diff --git a/content/json/WhatIsAnARG/Workbook1/Q8.json b/content/json/WhatIsAnARG/Workbook1/Q8.json
new file mode 100644
index 0000000..2a101b9
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook1/Q8.json
@@ -0,0 +1,8 @@
+[{
+ "question": "One of the sites has two mutations. Can you identify the position of that site by looking at both the tree-by-tree and the interactive ARG plot?",
+ "type": "numeric",
+ "answers": [
+ {"type": "value", "value": 5, "correct": true, "feedback": "Yes (coincidentally this is the first position)"},
+ {"type": "default", "feedback": "Hint: duplicate mutations will simultaneously be highlighted when hovering over them in the visualizer"}
+ ]
+}]
\ No newline at end of file
diff --git a/content/json/WhatIsAnARG/Workbook1/Q9.json b/content/json/WhatIsAnARG/Workbook1/Q9.json
new file mode 100644
index 0000000..bea51e6
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook1/Q9.json
@@ -0,0 +1,27 @@
+[{
+ "question": "What is the allelic state of the sample with node ID 2 at position 5?",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "A", "correct": false},
+ {"answer": "C", "correct": false},
+ {"answer": "G", "correct": false},
+ {"answer": "T", "correct": true}
+ ]}, {
+ "question": "What is the ancestral state at position 5?",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "A", "correct": true},
+ {"answer": "C", "correct": false},
+ {"answer": "G", "correct": false},
+ {"answer": "T", "correct": false}
+ ]}, {
+ "question": "By looking at the ARG or tree visualizations, the mutation responsible for this state in sample 2 is above which node?",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "12", "correct": false},
+ {"answer": "13", "correct": true},
+ {"answer": "31", "correct": false},
+ {"answer": "This allelic state is the ancestral state, so does not correspond to a mutation in the ARG ", "correct": false,
+ "feedback": "It is true that ancestral states are not represented by a mutation in the ARG, but the state C here is a derived state, so it *is* associated with a mutation"}
+ ]
+}]
diff --git a/content/json/WhatIsAnARG/Workbook2/Q12.json b/content/json/WhatIsAnARG/Workbook2/Q12.json
new file mode 100644
index 0000000..e366457
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook2/Q12.json
@@ -0,0 +1,8 @@
+[{
+ "question": "In the diamond in our example ARG, what is the ID of the node representing its common ancestor (CA) event?",
+ "type": "numeric",
+ "answers": [
+ {"type": "value", "value": 26, "correct": true},
+ {"type": "default"}
+ ]
+}]
diff --git a/content/json/WhatIsAnARG/Workbook2/Q13.json b/content/json/WhatIsAnARG/Workbook2/Q13.json
new file mode 100644
index 0000000..60c08f4
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook2/Q13.json
@@ -0,0 +1,32 @@
+[{
+ "question": "What colour are all-coalescent nodes in the graph above?",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "Green", "correct": true},
+ {"answer": "Blue", "correct": false},
+ {"answer": "Red", "correct": false},
+ {"answer": "Cyan", "correct": false},
+ ]},{
+ "question": "What colour are partly-coalescent nodes in the graph above?",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "Green", "correct": false},
+ {"answer": "Blue", "correct": true},
+ {"answer": "Red", "correct": false},
+ {"answer": "Cyan", "correct": false},
+ ]},{
+ "question": "What colour are non-coalescent CA nodes in the graph above?",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "Green", "correct": false},
+ {"answer": "Blue", "correct": false},
+ {"answer": "Red", "correct": false, "feedback": "Not quite: red nodes are indeed non-coalescent, but they are RE nodes"},
+ {"answer": "Cyan", "correct": true, "feedback": "Correct (the other non-coalescent nodes are the red RE nodes)"},
+ ]},{
+ "question": "Are non-coalescent CA event nodes always above a diamond?",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "No", "correct": true, "feedback": "Node 35 is not above a diamond, and it is non-coalescent in every local tree in which it is seen."},
+ {"answer": "Yes", "correct": false},
+ ]
+}]
\ No newline at end of file
diff --git a/content/json/WhatIsAnARG/Workbook2/Q14.json b/content/json/WhatIsAnARG/Workbook2/Q14.json
new file mode 100644
index 0000000..00f44a9
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook2/Q14.json
@@ -0,0 +1,28 @@
+[{
+ "question": "What has happened to the diamond and the other non-coalescent CA node?",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "They have been bypassed entirely", "correct": true},
+ {"answer": "They have been kept", "correct": false},
+ {"answer": "They are now connected to other nodes", "correct": false}
+ ]}, {
+ "question": "Other than those cases, what has happened to the edges descending from the parents of the red recombination nodes?",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "They now bypass the recombination node and link to the next (blue or green) node below", "correct": true},
+ {"answer": "They have been deleted entirely, and not replaced", "correct": false},
+ {"answer": "They have been shortened", "correct": false}
+ ]}, {
+ "question": "What is the largest number of parents of any node in the new ARG?",
+ "type": "numeric",
+ "answers": [
+ {"type": "value", "value": 3, "correct": true, "feedback": "Correct: node 23 now represents where two recombination events have been collapsed into a single node below"},
+ {"type": "default"}
+ ]}, {
+ "question": "What is the largest number of children of any node in the new ARG?",
+ "type": "numeric",
+ "answers": [
+ {"type": "value", "value": 3, "correct": true, "feedback": "Correct: the root node 36 now represents where two coalescent events have been collapsed into a single node above"},
+ {"type": "default"}
+ ]
+}]
\ No newline at end of file
diff --git a/content/json/WhatIsAnARG/Workbook2/Q15.json b/content/json/WhatIsAnARG/Workbook2/Q15.json
new file mode 100644
index 0000000..3795e85
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook2/Q15.json
@@ -0,0 +1,16 @@
+[{
+ "question": "After completely deleting unknowable nodes, what is the ID of the node with 3 children in the ARG?",
+ "type": "numeric",
+ "answers": [
+ {"type": "value", "value": 22, "correct": true, "feedback": "Correct. However, note that it still only has 2 children in any one local tree"},
+ {"type": "value", "value": 36, "correct": false, "feedback": "That's the old node ID: you need to simplify without the filter_nodes=False option"},
+ {"type": "default"}
+ ]}, {
+ "question": "After completely deleting unknowable nodes, what is the ID of the node with 3 parents in the ARG?",
+ "type": "numeric",
+ "answers": [
+ {"type": "value", "value": 19, "correct": true, "feedback": "Correct. However, note that it's still the case that all nodes only have one parent in any local tree"},
+ {"type": "value", "value": 22, "correct": false, "feedback": "That's the old node ID: you need to simplify without the filter_nodes=False option"},
+ {"type": "default"}
+ ]
+}]
\ No newline at end of file
diff --git a/content/json/WhatIsAnARG/Workbook2/Q16.json b/content/json/WhatIsAnARG/Workbook2/Q16.json
new file mode 100644
index 0000000..33edb32
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook2/Q16.json
@@ -0,0 +1,13 @@
+[{
+ "question": "In the simplified ARG, do we know the order of the two recombination events immediately above node 19",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "No", "correct": true},
+ {"answer": "Yes", "correct": false},
+ ]}, {
+ "question": "In the simplified ARG, do we know the order of the two coalescent events immediately below the root node 22",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "No", "correct": true},
+ {"answer": "Yes", "correct": false},
+ ]}]
\ No newline at end of file
diff --git a/content/json/WhatIsAnARG/Workbook2/Q17.json b/content/json/WhatIsAnARG/Workbook2/Q17.json
new file mode 100644
index 0000000..6c937e7
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook2/Q17.json
@@ -0,0 +1,14 @@
+[{
+ "question": "Previously, there were 7 local trees corresponding to 6 recombination breakpoints. How many breakpoints are now represented?",
+ "type": "numeric",
+ "answers": [
+ {"type": "value", "value": 5, "correct": true, "feedback": "Correct: the recombination at position 602, associated with the diamond, has been removed."},
+ {"type": "default"}
+ ]}, {
+ "question": "Previously we had 13 mutations at twelve sites. How many mutations are there in the simplified ARG?",
+ "type": "numeric",
+ "answers": [
+ {"type": "value", "value": 13, "correct": true, "feedback": "Correct: simplification doesn't change the sites or the number / age of mutations"},
+ {"type": "default"}
+ ]
+}]
\ No newline at end of file
diff --git a/content/json/WhatIsAnARG/Workbook2/Q18.json b/content/json/WhatIsAnARG/Workbook2/Q18.json
new file mode 100644
index 0000000..aa3db76
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook2/Q18.json
@@ -0,0 +1,23 @@
+[{
+ "question": "Compared to the partially simplified ARG, has the number of EDGES in the fully simplified ARG increased, decreased, or stayed the same?",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "Increased", "correct": true, "feedback": "Correct: full simplification can actually increase the number of ARG edges"},
+ {"answer": "Decreased", "correct": false},
+ {"answer": "Stayed the same", "correct": false}
+ ]}, {
+ "question": "Compared to the partially simplified ARG, has the number of NODES in the fully simplified ARG increased, decreased, or stayed the same?",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "Increased", "correct": false},
+ {"answer": "Decreased", "correct": false},
+ {"answer": "Stayed the same", "correct": true, "feedback": "Correct: we have not deleted any nodes from the ARG, simply bypassed them if they appear unary in a local tree"}
+ ]}, {
+ "question": "Compared to the partially simplified ARG, has the number of MUTATIONS in the fully simplified ARG increased, decreased, or stayed the same?",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "Increased", "correct": false},
+ {"answer": "Decreased", "correct": false},
+ {"answer": "Stayed the same", "correct": true, "feedback": "Correct, simplification doesn't change the pattern of encoded variation"}
+ ]
+}]
\ No newline at end of file
diff --git a/content/json/WhatIsAnARG/Workbook2/Q19.json b/content/json/WhatIsAnARG/Workbook2/Q19.json
new file mode 100644
index 0000000..75c1100
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook2/Q19.json
@@ -0,0 +1,9 @@
+[{
+ "question": "In general, when you go backwards in time, does the span of an ancestral haplotype in an ARG increase, decrease, or stay about the same?",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "Span decreases (gets shorter)", "correct": true, "feedback": "Correct: older haplotypes will have gone through more recombination events, and so the genomic region passed to the current-day samples will have been whittled down in length"},
+ {"answer": "Span increases (gets longer)", "correct": false},
+ {"answer": "Span stays about the same", "correct": false}
+ ]
+}]
diff --git a/content/json/WhatIsAnARG/Workbook2/Q20.json b/content/json/WhatIsAnARG/Workbook2/Q20.json
new file mode 100644
index 0000000..070da33
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook2/Q20.json
@@ -0,0 +1,8 @@
+[{
+ "question": "How many sites now have no mutations associated with them?",
+ "type": "numeric",
+ "answers": [
+ {"type": "value", "value": 1, "correct": true, "feedback": "Correct: the site at position 215 has no mutations."},
+ {"type": "default"}
+ ]
+}]
diff --git a/content/json/WhatIsAnARG/Workbook2/Q21.json b/content/json/WhatIsAnARG/Workbook2/Q21.json
new file mode 100644
index 0000000..6f361c5
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook2/Q21.json
@@ -0,0 +1,9 @@
+[{
+ "question": "Does this ARG with polytomies encode the same genetic data?",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "Yes: there is no change to the genetic data encoded", "correct": true},
+ {"answer": "No", "correct": false},
+ {"answer": "Sometimes", "correct": false},
+ ]
+}]
diff --git a/content/json/WhatIsAnARG/Workbook2/Q22.json b/content/json/WhatIsAnARG/Workbook2/Q22.json
new file mode 100644
index 0000000..64c6076
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook2/Q22.json
@@ -0,0 +1,12 @@
+[{
+ "question": "By looking at the provenance commands, which simulation software was used to make this ARG?",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "msprime", "correct": true},
+ {"answer": "SLiM", "correct": false},
+ {"answer": "fwdpy11", "correct": false},
+ {"answer": "simuPOP", "correct": false},
+ {"answer": "scrm", "correct": false},
+ {"answer": "fastsimcoal2", "correct": false},
+ ]
+}]
\ No newline at end of file
diff --git a/content/json/WhatIsAnARG/Workbook2/Q23.json b/content/json/WhatIsAnARG/Workbook2/Q23.json
new file mode 100644
index 0000000..52d85df
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook2/Q23.json
@@ -0,0 +1,8 @@
+[{
+ "question": "From the log time plot, from when (in generations ago) does the exponential growth phase in both populations start model start, according to this model?",
+ "type": "numeric",
+ "answers": [
+ {"type": "value", "value": 200, "correct": true},
+ {"type": "default", "feedback": "Round to the nearest 100 generations"}
+ ]
+}]
\ No newline at end of file
diff --git a/content/json/WhatIsAnARG/Workbook2/Q24.json b/content/json/WhatIsAnARG/Workbook2/Q24.json
new file mode 100644
index 0000000..1220e85
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook2/Q24.json
@@ -0,0 +1,33 @@
+[{
+ "question": "What percentage of the nodes in the 1000 sample + 10 megabase full ARG are non-coalescent ('unknowable')?",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "0% - 5%", "correct": false},
+ {"answer": "5% - 10%", "correct": false},
+ {"answer": "10% - 25%", "correct": false},
+ {"answer": "25% - 50%", "correct": false},
+ {"answer": "50% - 70%", "correct": false},
+ {"answer": "75% - 90%", "correct": false},
+ {"answer": "90% - 95%", "correct": false},
+ {"answer": "95% - 100%", "correct": true}
+ ]}, {
+ "question": "Roughly what fraction of the nodes are recombination nodes?",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "1/10", "correct": false},
+ {"answer": "1/3", "correct": false},
+ {"answer": "1/2", "correct": false},
+ {"answer": "2/3", "correct": true},
+ {"answer": "9/10", "correct": false}
+ ]}, {
+ "question": "Roughly how big is this 1000 sample + 10 megabase full ARG when stored in tskit?",
+ "type": "numeric",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "~1 GB", "correct": false},
+ {"answer": "~250 MB", "correct": true},
+ {"answer": "~25 MB", "correct": false},
+ {"answer": "~10 MB", "correct": false},
+ {"answer": "~5 MB", "correct": false}
+ ]
+}]
\ No newline at end of file
diff --git a/content/json/WhatIsAnARG/Workbook2/Q25.json b/content/json/WhatIsAnARG/Workbook2/Q25.json
new file mode 100644
index 0000000..75e02b5
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook2/Q25.json
@@ -0,0 +1,27 @@
+[{
+ "question": "Are any non-coalescent ('unknowable') nodes present?",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "Yes", "correct": false},
+ {"answer": "No", "correct": true}
+ ]}, {
+ "question": "Roughly how big is this 1000 sample + 10 megabase simplified ARG when stored in tskit?",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "1 GB", "correct": false},
+ {"answer": "250 MB", "correct": false},
+ {"answer": "25 MB", "correct": false},
+ {"answer": "10 MB", "correct": true},
+ {"answer": "5 MB", "correct": false}
+ ]}, {
+ "question": "How much smaller is the all-coalescent ARG than the full ARG?",
+ "type": "numeric",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "~100 times smaller", "correct": false},
+ {"answer": "~25 times smaller", "correct": true, "feedback": "Correct. Notice that the full ARG has 25 times more nodes too."},
+ {"answer": "~10 times smaller", "correct": false},
+ {"answer": "~2 times smaller", "correct": false},
+ {"answer": "About the same size", "correct": false}
+ ]
+}]
\ No newline at end of file
diff --git a/content/json/WhatIsAnARG/Workbook2/Q26.json b/content/json/WhatIsAnARG/Workbook2/Q26.json
new file mode 100644
index 0000000..9823161
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook2/Q26.json
@@ -0,0 +1,11 @@
+[{
+ "question": "How does the size of the inferred part_coalescent_arg compare to the all-coalescent (fully simplified) ARG?",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "It much bigger", "correct": false},
+ {"answer": "It is about twice the size", "correct": false},
+ {"answer": "It is about the same size", "correct": false},
+ {"answer": "It is about half the size", "correct": true},
+ {"answer": "It is much smaller", "correct": false}
+ ]
+}]
\ No newline at end of file
diff --git a/content/json/WhatIsAnARG/Workbook2/Q27.json b/content/json/WhatIsAnARG/Workbook2/Q27.json
new file mode 100644
index 0000000..7174880
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook2/Q27.json
@@ -0,0 +1,11 @@
+[{
+ "question": "Which of these random placement methods is most appropriate for choosing a node above which to place a neutral mutation?",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "With probability proportional to the span of the edge above the node", "correct": false},
+ {"answer": "With equal probability above any node", "correct": false},
+ {"answer": "Simply above the node with the longest branch", "correct": false},
+ {"answer": "With probability proportional to the temporal length (evolutionary time) of the edge above the node", "correct": false},
+ {"answer": "With probability proportional to the area (span × temporal length) of the edge above the node", "correct": true}
+ ]
+}]
\ No newline at end of file
diff --git a/content/json/WhatIsAnARG/Workbook2/Q28.json b/content/json/WhatIsAnARG/Workbook2/Q28.json
new file mode 100644
index 0000000..5d9a073
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook2/Q28.json
@@ -0,0 +1,15 @@
+[{
+ "question": "Which population has the highest diversity levels along the genome?",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "central", "correct": true},
+ {"answer": "western", "correct": false},
+ {"answer": "bonobo", "correct": false},
+ ]}, {
+ "question": "Does the diversity vary across the genome in any of the populations?",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "No", "correct": true, "feedback": "Correct: this is a neutral simulation with constant mutation rate, so we would not expect systematic variation in diversity across the genome."},
+ {"answer": "Yes", "correct": false}
+ ]
+}]
\ No newline at end of file
diff --git a/content/json/WhatIsAnARG/Workbook2/Q29.json b/content/json/WhatIsAnARG/Workbook2/Q29.json
new file mode 100644
index 0000000..fba227d
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook2/Q29.json
@@ -0,0 +1,9 @@
+[{
+ "question": "What does the peak in central-western cross coalescence correspond to?",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "The founding of the wester population at ~30,000 generations ago", "correct": true},
+ {"answer": "A sudden increase in the western population size", "correct": false},
+ {"answer": "A sudden drop in the central population size", "correct": false},
+ ]
+}]
\ No newline at end of file
diff --git a/content/json/WhatIsAnARG/Workbook2/Q30.json b/content/json/WhatIsAnARG/Workbook2/Q30.json
new file mode 100644
index 0000000..d5c41ef
--- /dev/null
+++ b/content/json/WhatIsAnARG/Workbook2/Q30.json
@@ -0,0 +1,13 @@
+[{
+ "question": "What, roughly, is the average genetic diversity (π) in the mutated ARG?",
+ "type": "many_choice",
+ "answers": [
+ {"answer": "Between 0 and 0.00001", "correct": false},
+ {"answer": "Between 0.00001 and 0.00002", "correct": false},
+ {"answer": "Between 0.00002 and 0.00003", "correct": true},
+ {"answer": "Between 0.00003 and 0.00004", "correct": false},
+ {"answer": "Between 0.00004 and 0.00005", "correct": false},
+ {"answer": "Between 0.00005 and 0.0001", "correct": false},
+ {"answer": "Greater than 0.0001", "correct": false}
+ ]
+}]
\ No newline at end of file