From d929456f3bf18b45a0db32e8bea2284073543644 Mon Sep 17 00:00:00 2001 From: John Stachurski Date: Mon, 9 Mar 2026 13:25:40 +1100 Subject: [PATCH 1/8] Add risk-sensitive inventory management lecture and improve inventory_q Co-Authored-By: Claude Opus 4.6 --- lectures/inventory_q.md | 61 ++- lectures/rs_inventory_q.ipynb | 977 ++++++++++++++++++++++++++++++++++ lectures/rs_inventory_q.md | 761 ++++++++++++++++++++++++++ lectures/rs_inventory_q.py | 767 ++++++++++++++++++++++++++ 4 files changed, 2541 insertions(+), 25 deletions(-) create mode 100644 lectures/rs_inventory_q.ipynb create mode 100644 lectures/rs_inventory_q.md create mode 100644 lectures/rs_inventory_q.py diff --git a/lectures/inventory_q.md b/lectures/inventory_q.md index 5640929a6..39740eb6b 100644 --- a/lectures/inventory_q.md +++ b/lectures/inventory_q.md @@ -37,7 +37,7 @@ We approach the problem in two ways. First, we solve it exactly using dynamic programming, assuming full knowledge of the model — the demand distribution, cost parameters, and transition dynamics. -Second, we show how a manager can learn the optimal policy from experience alone, using *Q-learning*. +Second, we show how a manager can learn the optimal policy from experience alone, using *[Q-learning](https://en.wikipedia.org/wiki/Q-learning)*. The manager observes only the inventory level, the order placed, the resulting profit, and the next inventory level — without knowing any of the underlying @@ -50,6 +50,12 @@ transition function. We show that, given enough experience, the manager's learned policy converges to the optimal one. +The lecture proceeds as follows: + +1. We set up the inventory model and solve it exactly via value function iteration. +2. We introduce Q-factors and derive the Q-factor Bellman equation. +3. We implement Q-learning and show the learned policy converges to the optimal one. 
+ We will use the following imports: ```{code-cell} ipython3 @@ -85,12 +91,14 @@ $$ \qquad \text{where} \quad - h(x,a,d) := (x - d) \vee 0 + a. + h(x,a,d) := \max(x - d, 0) + a. $$ The term $A_t$ is units of stock ordered this period, which arrive at the start of period $t+1$, after demand $D_{t+1}$ is realized and served. +**Timeline for period $t$:** observe $X_t$ → choose $A_t$ → demand $D_{t+1}$ arrives → profit realized → $X_{t+1}$ determined. + (We use a $t$ subscript in $A_t$ to indicate the information set: it is chosen before $D_{t+1}$ is observed.) @@ -99,7 +107,7 @@ We assume that the firm can store at most $K$ items at one time. Profits are given by $$ - \pi(X_t, A_t, D_{t+1}) := X_t \wedge D_{t+1} - c A_t - \kappa 1\{A_t > 0\}. + \pi(X_t, A_t, D_{t+1}) := \min(X_t, D_{t+1}) - c A_t - \kappa 1\{A_t > 0\}. $$ Here @@ -158,7 +166,7 @@ $$ \sum_d \phi(d) \left[ \pi(x, a, d) + \beta \, v(h(x, a, d)) \right] $$ -When $r > 0$, the sequence $v_{k+1} = T v_k$ converges to the +When $r > 0$ (equivalently, $\beta < 1$), the sequence $v_{k+1} = T v_k$ converges to the unique fixed point $v^*$, which is the value function of the optimal policy (see, e.g., {cite}`Sargent_Stachurski_2025`). @@ -167,7 +175,7 @@ unique fixed point $v^*$, which is the value function of the optimal policy We store the model primitives in a `NamedTuple`. -Demand follows a geometric distribution with parameter $p$, so $\phi(d) = (1 - p)^d \, p$ for $d = 0, 1, 2, \ldots$. +Demand follows a [geometric distribution](https://en.wikipedia.org/wiki/Geometric_distribution) with parameter $p$, so $\phi(d) = (1 - p)^d \, p$ for $d = 0, 1, 2, \ldots$. ```{code-cell} ipython3 class Model(NamedTuple): @@ -427,7 +435,7 @@ transition function. ### The Q-learning update rule -Q-learning approximates the fixed point of the Q-factor Bellman equation using **stochastic approximation**. 
+Q-learning approximates the fixed point of the Q-factor Bellman equation using **[stochastic approximation](https://en.wikipedia.org/wiki/Stochastic_approximation)**. At each step, the agent is in state $x$, takes action $a$, observes reward $R_{t+1} = \pi(x, a, D_{t+1})$ and next state $X_{t+1} = h(x, a, D_{t+1})$, and @@ -443,8 +451,23 @@ where $\alpha_t$ is the learning rate. The update blends the current estimate $q_t(x, a)$ with a fresh sample of the Bellman target. +### What the manager needs to know + +Notice what is **not** required to implement the update. + +The manager does not need to know the demand distribution $\phi$, the unit cost $c$, the fixed cost $\kappa$, or the transition function $h$. + +All the manager needs to observe at each step is: + +1. the current inventory level $x$, +2. the order quantity $a$ they chose, +3. the resulting profit $R_{t+1}$ (which appears on the books), and +4. the next inventory level $X_{t+1}$ (which they can read off the warehouse). + +These are all directly observable quantities — no model knowledge is required. + -### The Q-table and the behavior policy +### The Q-table and the role of the max It is important to understand how the update rule relates to the manager's actions. @@ -489,7 +512,9 @@ By contrast, the original update with the $\max$ is a stochastic sample of the Bellman *optimality* operator, whose fixed point is $q^*$. The $\max$ in the update target is therefore what drives convergence to $q^*$. -**The behavior policy.** +In short, the $\max$ is doing the work of finding the optimum; without it, you only evaluate a fixed policy. + +### The behavior policy The rule governing how the manager chooses actions is called the **behavior policy**. 
Because the $\max$ in the update target always points toward $q^*$ @@ -511,26 +536,11 @@ In practice, we want the manager to mostly take good actions (to earn reasonable profits while learning), while still occasionally experimenting to discover better alternatives. -### What the manager needs to know - -Notice what is **not** required to implement the update. - -The manager does not need to know the demand distribution $\phi$, the unit cost $c$, the fixed cost $\kappa$, or the transition function $h$. - -All the manager needs to observe at each step is: - -1. the current inventory level $x$, -2. the order quantity $a$ they chose, -3. the resulting profit $R_{t+1}$ (which appears on the books), and -4. the next inventory level $X_{t+1}$ (which they can read off the warehouse). - -These are all directly observable quantities — no model knowledge is required. - ### Learning rate We use $\alpha_t = 1 / n_t(x, a)^{0.51}$, where $n_t(x, a)$ is the number of times the pair $(x, a)$ has been visited up to time $t$. -This decays slowly enough to allow learning from later (better-informed) updates, while still satisfying the Robbins-Monro conditions for convergence. +This decays slowly enough to allow learning from later (better-informed) updates, while still satisfying the [Robbins–Monro conditions](https://en.wikipedia.org/wiki/Stochastic_approximation#Robbins%E2%80%93Monro_algorithm) for convergence. ### Exploration: epsilon-greedy @@ -593,7 +603,7 @@ def q_learning_kernel(K, p, c, κ, β, n_steps, X_init, snapshots[snap_idx] = greedy_policy_from_q(q, K) snap_idx += 1 - # === Observe outcome === + # === Draw D_{t+1} and observe outcome === d = np.random.geometric(p) - 1 reward = min(x, d) - c * a - κ * (a > 0) x_next = max(x - d, 0) + a @@ -661,6 +671,7 @@ and compare them against $v^*$ and $\sigma^*$ from VFI. 
```{code-cell} ipython3 K = len(x_values) - 1 +# restrict to feasible actions a ∈ {0, ..., K-x} v_q = np.array([np.max(q[x, :K - x + 1]) for x in range(K + 1)]) σ_q = np.array([np.argmax(q[x, :K - x + 1]) for x in range(K + 1)]) ``` diff --git a/lectures/rs_inventory_q.ipynb b/lectures/rs_inventory_q.ipynb new file mode 100644 index 000000000..99dc4f676 --- /dev/null +++ b/lectures/rs_inventory_q.ipynb @@ -0,0 +1,977 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "17be60e4", + "metadata": {}, + "source": [ + "(rs_inventory_q)=\n", + "```{raw} jupyter\n", + "
\n", + " \n", + " \"QuantEcon\"\n", + " \n", + "
\n", + "```\n", + "\n", + "# Risk-Sensitive Inventory Management via Q-Learning\n", + "\n", + "```{contents} Contents\n", + ":depth: 2\n", + "```\n", + "\n", + "## Introduction\n", + "\n", + "In {ref}`inventory_q`, we looked at an inventory management\n", + "problem and solved it with both value function iteration and Q-learning.\n", + "\n", + "In this lecture, we consider a risk-sensitive variation.\n", + "\n", + "Injection of risk-sensitivity acknowledges the fact\n", + "that, in incomplete markets with financial and informational frictions, firms\n", + "typically take risk into account in their decision making.\n", + "\n", + "In other words, the actions of firms are not, in general, risk neutral.\n", + "\n", + "One natural way to handle this is to use a risk-sensitive version of the Bellman\n", + "equation.\n", + "\n", + "We show how the model can be solved using value function iteration.\n", + "\n", + "We then investigate how risk sensitivity affects the optimal policy." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95f2dc54", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import numba\n", + "import matplotlib.pyplot as plt\n", + "from typing import NamedTuple" + ] + }, + { + "cell_type": "markdown", + "id": "acc004dc", + "metadata": {}, + "source": [ + "## The Model\n", + "\n", + "The Bellman equation for the inventory management problem in {ref}`inventory_q` has the form\n", + "\n", + "\n", + "$$\n", + " v(x)\n", + " = \\max_{a \\in \\Gamma(x)} \\mathbb E\n", + " \\left[\n", + " \\pi(x, a, D)\n", + " + \\beta v(h(x, a, D))\n", + " \\right].\n", + "$$\n", + "\n", + "Here $D$ is a random variable with distribution $\\phi$.\n", + "\n", + "(Primitives and definitions are the same as in {ref}`inventory_q`.)\n", + "\n", + "The risk-sensitive version of this Bellman equation has the form\n", + "\n", + "$$\n", + " v(x)\n", + " = \\max_{a \\in \\Gamma(x)}\n", + " \\phi^{-1}\n", + " \\left\\{\n", + " \\mathbb E 
\\phi\n", + " \\left[\n", + " \\pi(x, a, D)\n", + " + \\beta v(h(x, a, D))\n", + " \\right]\n", + " \\right\\},\n", + "$$\n", + "\n", + "where $\\phi(t) = \\exp(-\\gamma t)$ for fixed $\\gamma > 0$.\n", + "\n", + "Since $\\phi^{-1}(y) = -\\frac{1}{\\gamma} \\ln(y)$, the Bellman equation becomes\n", + "\n", + "$$\n", + " v(x)\n", + " = \\max_{a \\in \\Gamma(x)}\n", + " \\left(\n", + " -\\frac{1}{\\gamma}\n", + " \\right)\n", + " \\ln\n", + " \\left\\{\n", + " \\sum_d \\phi(d) \\exp\n", + " \\left[\n", + " -\\gamma \\left( \\pi(x, a, d) + \\beta \\, v(h(x, a, d)) \\right)\n", + " \\right]\n", + " \\right\\}.\n", + "$$\n", + "\n", + "Here $\\phi(d)$ denotes the demand probability mass function, as in {ref}`inventory_q`.\n", + "\n", + "The parameter $\\gamma$ controls the degree of risk sensitivity.\n", + "\n", + "As $\\gamma \\to 0$, the certainty equivalent reduces to the ordinary expectation and we recover the risk-neutral case.\n", + "\n", + "Larger $\\gamma$ means more aversion to downside risk.\n", + "\n", + "The Bellman operator, greedy policy, and VFI algorithm all carry over from the\n", + "risk-neutral case, with the expectation replaced by the certainty equivalent.\n", + "\n", + "\n", + "\n", + "## Solving via Value Function Iteration\n", + "\n", + "### Model specification\n", + "\n", + "We reuse the same model primitives as in {ref}`inventory_q`, adding $\\gamma$ as a parameter." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7909494", + "metadata": {}, + "outputs": [], + "source": [ + "class RSModel(NamedTuple):\n", + " x_values: np.ndarray # Inventory values\n", + " d_values: np.ndarray # Demand values for summation\n", + " ϕ_values: np.ndarray # Demand probabilities\n", + " p: float # Demand parameter\n", + " c: float # Unit cost\n", + " κ: float # Fixed cost\n", + " β: float # Discount factor\n", + " γ: float # Risk-sensitivity parameter" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8cf486f", + "metadata": {}, + "outputs": [], + "source": [ + "def create_rs_inventory_model(\n", + " K: int = 20, # Max inventory\n", + " D_MAX: int = 21, # Demand upper bound for summation\n", + " p: float = 0.7,\n", + " c: float = 0.2,\n", + " κ: float = 0.8,\n", + " β: float = 0.98,\n", + " γ: float = 1.0\n", + " ) -> RSModel:\n", + "\n", + " def demand_pdf(p, d):\n", + " return (1 - p)**d * p\n", + "\n", + " d_values = np.arange(D_MAX)\n", + " ϕ_values = demand_pdf(p, d_values)\n", + " x_values = np.arange(K + 1)\n", + "\n", + " return RSModel(x_values, d_values, ϕ_values, p, c, κ, β, γ)" + ] + }, + { + "cell_type": "markdown", + "id": "060e0b9c", + "metadata": {}, + "source": [ + "### The Bellman operator\n", + "\n", + "The risk-sensitive Bellman operator replaces the expected value with the certainty equivalent.\n", + "\n", + "For numerical stability, we use the [log-sum-exp trick](https://en.wikipedia.org/wiki/LogSumExp): given values $z_i = \\pi(x, a, d_i) + \\beta \\, v(h(x, a, d_i))$, we compute\n", + "\n", + "$$\n", + " -\\frac{1}{\\gamma} \\ln \\sum_i \\phi(d_i) \\exp(-\\gamma z_i)\n", + " \\;=\\;\n", + " -\\frac{1}{\\gamma}\n", + " \\left(\n", + " m + \\ln \\sum_i \\phi(d_i) \\exp(-\\gamma z_i - m)\n", + " \\right),\n", + "$$\n", + "\n", + "where $m = \\max_i (-\\gamma z_i)$." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ecff396", + "metadata": {}, + "outputs": [], + "source": [ + "@numba.jit(nopython=True)\n", + "def T_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K):\n", + " new_v = np.empty(K + 1)\n", + " n_d = len(d_values)\n", + " for x in range(K + 1):\n", + " best = -np.inf\n", + " for a in range(K - x + 1):\n", + " # Compute -γ * z_i for each demand realization\n", + " exponents = np.empty(n_d)\n", + " for i in range(n_d):\n", + " d = d_values[i]\n", + " x_next = max(x - d, 0) + a\n", + " revenue = min(x, d)\n", + " cost = c * a + κ * (a > 0)\n", + " z_i = revenue - cost + β * v[x_next]\n", + " exponents[i] = -γ * z_i\n", + " # Log-sum-exp trick for numerical stability\n", + " m = np.max(exponents)\n", + " weighted_sum = 0.0\n", + " for i in range(n_d):\n", + " weighted_sum += ϕ_values[i] * np.exp(exponents[i] - m)\n", + " val = -(1.0 / γ) * (m + np.log(weighted_sum))\n", + " if val > best:\n", + " best = val\n", + " new_v[x] = best\n", + " return new_v" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7fdd1618", + "metadata": {}, + "outputs": [], + "source": [ + "def T_rs(v, model):\n", + " \"\"\"The risk-sensitive Bellman operator.\"\"\"\n", + " x_values, d_values, ϕ_values, p, c, κ, β, γ = model\n", + " K = len(x_values) - 1\n", + " return T_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K)" + ] + }, + { + "cell_type": "markdown", + "id": "03fbea31", + "metadata": {}, + "source": [ + "### Computing the greedy policy\n", + "\n", + "The greedy policy records the maximizing action instead of the maximized value." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7eeb7faf", + "metadata": {}, + "outputs": [], + "source": [ + "@numba.jit(nopython=True)\n", + "def get_greedy_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K):\n", + " σ = np.empty(K + 1, dtype=np.int32)\n", + " n_d = len(d_values)\n", + " for x in range(K + 1):\n", + " best = -np.inf\n", + " best_a = 0\n", + " for a in range(K - x + 1):\n", + " exponents = np.empty(n_d)\n", + " for i in range(n_d):\n", + " d = d_values[i]\n", + " x_next = max(x - d, 0) + a\n", + " revenue = min(x, d)\n", + " cost = c * a + κ * (a > 0)\n", + " z_i = revenue - cost + β * v[x_next]\n", + " exponents[i] = -γ * z_i\n", + " m = np.max(exponents)\n", + " weighted_sum = 0.0\n", + " for i in range(n_d):\n", + " weighted_sum += ϕ_values[i] * np.exp(exponents[i] - m)\n", + " val = -(1.0 / γ) * (m + np.log(weighted_sum))\n", + " if val > best:\n", + " best = val\n", + " best_a = a\n", + " σ[x] = best_a\n", + " return σ" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5473245", + "metadata": {}, + "outputs": [], + "source": [ + "def get_greedy_rs(v, model):\n", + " \"\"\"Get a v-greedy policy for the risk-sensitive model.\"\"\"\n", + " x_values, d_values, ϕ_values, p, c, κ, β, γ = model\n", + " K = len(x_values) - 1\n", + " return get_greedy_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K)" + ] + }, + { + "cell_type": "markdown", + "id": "a6a3b8be", + "metadata": {}, + "source": [ + "### Value function iteration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ea93ea8", + "metadata": {}, + "outputs": [], + "source": [ + "def solve_rs_inventory_model(v_init, model, max_iter=10_000, tol=1e-6):\n", + " v = v_init.copy()\n", + " i, error = 0, tol + 1\n", + "\n", + " while i < max_iter and error > tol:\n", + " new_v = T_rs(v, model)\n", + " error = np.max(np.abs(new_v - v))\n", + " i += 1\n", + " v = new_v\n", + "\n", + " print(f\"Converged in {i} iterations with error 
{error:.2e}\")\n", + "\n", + " σ = get_greedy_rs(v, model)\n", + " return v, σ" + ] + }, + { + "cell_type": "markdown", + "id": "962b1a57", + "metadata": {}, + "source": [ + "### Creating and solving an instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6fe6712", + "metadata": {}, + "outputs": [], + "source": [ + "model = create_rs_inventory_model()\n", + "x_values = model.x_values\n", + "n_x = len(x_values)\n", + "v_init = np.zeros(n_x)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3dc85cec", + "metadata": {}, + "outputs": [], + "source": [ + "v_star, σ_star = solve_rs_inventory_model(v_init, model)" + ] + }, + { + "cell_type": "markdown", + "id": "24295d26", + "metadata": {}, + "source": [ + "### Effect of risk sensitivity on the optimal policy\n", + "\n", + "We solve the model for several values of $\\gamma$ and compare the resulting policies.\n", + "\n", + "As we will see, a risk-sensitive firm orders less aggressively than a risk-neutral one." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c599f91f", + "metadata": {}, + "outputs": [], + "source": [ + "γ_values = [0.01, 1.0, 2.0]\n", + "results = {}\n", + "\n", + "for γ in γ_values:\n", + " mod = create_rs_inventory_model(γ=γ)\n", + " v, σ = solve_rs_inventory_model(np.zeros(n_x), mod)\n", + " results[γ] = (v, σ)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bfb69b17", + "metadata": {}, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(1, 2, figsize=(9.6, 3.2))\n", + "\n", + "for γ in γ_values:\n", + " v, σ = results[γ]\n", + " axes[0].plot(x_values, v, label=f\"$\\\\gamma = {γ}$\")\n", + " axes[1].plot(x_values, σ, label=f\"$\\\\gamma = {γ}$\")\n", + "\n", + "axes[0].set_xlabel(\"inventory\")\n", + "axes[0].set_ylabel(\"value\")\n", + "axes[0].legend()\n", + "axes[0].set_title(\"Value function\")\n", + "\n", + "axes[1].set_xlabel(\"inventory\")\n", + "axes[1].set_ylabel(\"order quantity\")\n", + "axes[1].legend()\n", + "axes[1].set_title(\"Policy\")\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "55ed320c", + "metadata": {}, + "source": [ + "### Simulating the optimal policy\n", + "\n", + "We simulate inventory dynamics under the optimal policy for the baseline $\\gamma$." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7bfdd38c", + "metadata": {}, + "outputs": [], + "source": [ + "@numba.jit(nopython=True)\n", + "def sim_inventories(ts_length, σ, p, X_init=0):\n", + " \"\"\"Simulate inventory dynamics under policy σ.\"\"\"\n", + " X = np.zeros(ts_length, dtype=np.int32)\n", + " X[0] = X_init\n", + " for t in range(ts_length - 1):\n", + " d = np.random.geometric(p) - 1\n", + " X[t+1] = max(X[t] - d, 0) + σ[X[t]]\n", + " return X" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "993bf7d8", + "metadata": {}, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(len(γ_values), 1,\n", + " figsize=(8, 2.0 * len(γ_values)),\n", + " sharex=True)\n", + "\n", + "ts_length = 200\n", + "sim_seed = 5678\n", + "K = len(x_values) - 1\n", + "\n", + "for i, γ in enumerate(γ_values):\n", + " v, σ = results[γ]\n", + " np.random.seed(sim_seed)\n", + " X = sim_inventories(ts_length, σ, model.p, X_init=K // 2)\n", + " axes[i].plot(X, alpha=0.7)\n", + " axes[i].set_ylabel(\"inventory\")\n", + " axes[i].set_title(f\"$\\\\gamma = {γ}$\")\n", + " axes[i].set_ylim(0, K + 2)\n", + "\n", + "axes[-1].set_xlabel(r\"$t$\")\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "f33d501a", + "metadata": {}, + "source": [ + "## Interpreting the Outcomes\n", + "\n", + "The plots above show that a more risk-sensitive firm (larger $\\gamma$) orders\n", + "less inventory and maintains lower stock levels.\n", + "\n", + "At first glance this may seem surprising: wouldn't holding more inventory\n", + "reduce variance by ensuring the firm can always meet demand?\n", + "\n", + "The key is to identify where the randomness in profits actually comes from.\n", + "\n", + "Recall that per-period profit is $\\pi(x, a, d) = \\min(x, d) - ca - \\kappa\n", + "\\mathbf{1}\\{a > 0\\}$.\n", + "\n", + "The ordering cost $ca + \\kappa \\mathbf{1}\\{a > 0\\}$ is **deterministic** — it\n", + "is chosen 
before the demand shock is realized.\n", + "\n", + "So higher ordering shifts the level of profits down but does not affect their\n", + "variance.\n", + "\n", + "The variance comes from **revenue**: $\\min(x, D)$.\n", + "\n", + "When inventory $x$ is high, $\\min(x, D) \\approx D$ for most demand\n", + "realizations — revenue inherits the full variance of demand.\n", + "\n", + "When inventory $x$ is low, $\\min(x, D) \\approx x$ for most realizations —\n", + "revenue is nearly deterministic, capped at the inventory level.\n", + "\n", + "A risk-sensitive agent therefore prefers lower inventory because it **caps the\n", + "randomness of revenue**.\n", + "\n", + "The agent accepts lower expected sales in exchange for more predictable profits.\n", + "\n", + "There is also a continuation value channel: next-period inventory $\\max(x - D,\n", + "0) + a$ varies with $D$, and higher $x$ means $x - D$ tracks $D$ more\n", + "closely, propagating that variance forward through $v$.\n", + "\n", + "\n", + "## Q-Learning\n", + "\n", + "We now ask whether the optimal policy can be learned without knowledge of the\n", + "model, as we did in the risk-neutral case in {ref}`inventory_q`.\n", + "\n", + "### The Q-factor\n", + "\n", + "The first step is to define the Q-factor in a way that is compatible with the\n", + "risk-sensitive Bellman equation.\n", + "\n", + "We define\n", + "\n", + "$$\n", + " q(x, a) := \\mathbb E\n", + " \\left[\n", + " \\exp\\!\\left(\n", + " -\\gamma \\left( \\pi(x, a, D) + \\beta \\, v^*(h(x, a, D)) \\right)\n", + " \\right)\n", + " \\right].\n", + "$$\n", + "\n", + "In words, $q(x, a)$ applies the risk-sensitivity transformation $\\phi(t) =\n", + "\\exp(-\\gamma t)$ inside the expectation, evaluated at the return from taking\n", + "action $a$ in state $x$ and following the optimal policy thereafter.\n", + "\n", + "### Deriving the Q-factor Bellman equation\n", + "\n", + "Our goal is to obtain a fixed point equation in $q$ alone, eliminating $v^*$.\n", + 
"\n", + "**Step 1.** Express $v^*$ in terms of $q$.\n", + "\n", + "The risk-sensitive Bellman equation says $v^*(x) = \\max_{a \\in \\Gamma(x)}\n", + "\\phi^{-1}(q(x, a))$.\n", + "\n", + "Since $\\phi^{-1}(y) = -\\frac{1}{\\gamma} \\ln(y)$ is a **decreasing** function,\n", + "the maximum over $a$ of $\\phi^{-1}(q(x, a))$ corresponds to the **minimum**\n", + "over $a$ of $q(x, a)$:\n", + "\n", + "$$\n", + " v^*(x)\n", + " = \\phi^{-1}\\!\\left(\\min_{a \\in \\Gamma(x)} q(x, a)\\right)\n", + " = -\\frac{1}{\\gamma} \\ln\\!\\left(\\min_{a \\in \\Gamma(x)} q(x, a)\\right).\n", + "$$\n", + "\n", + "Equivalently,\n", + "\n", + "$$\n", + " \\exp(-\\gamma \\, v^*(x)) = \\min_{a \\in \\Gamma(x)} q(x, a).\n", + "$$\n", + "\n", + "**Step 2.** Substitute back into the definition of $q$ to eliminate $v^*$.\n", + "\n", + "Expanding the exponential in the definition of $q$,\n", + "\n", + "$$\n", + " q(x, a)\n", + " = \\mathbb E\n", + " \\left[\n", + " \\exp(-\\gamma \\, \\pi(x, a, D))\n", + " \\;\\cdot\\;\n", + " \\exp\\!\\left(-\\gamma \\beta \\, v^*(x')\\right)\n", + " \\right]\n", + "$$\n", + "\n", + "where $x' = h(x, a, D)$.\n", + "\n", + "From Step 1, $\\exp(-\\gamma \\, v^*(x')) = \\min_{a' \\in \\Gamma(x')} q(x', a')$,\n", + "so $\\exp(-\\gamma \\beta \\, v^*(x')) = \\left[\\min_{a' \\in \\Gamma(x')} q(x',\n", + "a')\\right]^\\beta$.\n", + "\n", + "Substituting,\n", + "\n", + "$$\n", + " q(x, a)\n", + " = \\mathbb E\n", + " \\left[\n", + " \\exp(-\\gamma \\, \\pi(x, a, D))\n", + " \\;\\cdot\\;\n", + " \\left(\\min_{a' \\in \\Gamma(x')} q(x', a')\\right)^\\beta\n", + " \\right].\n", + "$$\n", + "\n", + "This is a fixed point equation in $q$ alone — $v^*$ has been eliminated.\n", + "\n", + "### The Q-learning update rule\n", + "\n", + "As in the risk-neutral case, we approximate the fixed point using stochastic\n", + "approximation.\n", + "\n", + "At each step, the agent is in state $x$, takes action $a$, observes profit\n", + "$R_{t+1} = \\pi(x, a, D_{t+1})$ and next state 
$X_{t+1} = h(x, a, D_{t+1})$,\n", + "and updates\n", + "\n", + "$$\n", + " q_{t+1}(x, a)\n", + " = (1 - \\alpha_t) \\, q_t(x, a)\n", + " + \\alpha_t\n", + " \\left[\n", + " \\exp(-\\gamma \\, R_{t+1})\n", + " \\;\\cdot\\;\n", + " \\left(\\min_{a' \\in \\Gamma(X_{t+1})} q_t(X_{t+1}, a')\\right)^\\beta\n", + " \\right].\n", + "$$\n", + "\n", + "The term in brackets is a single-sample estimate of the right-hand side of the\n", + "Q-factor Bellman equation.\n", + "\n", + "The update blends the current estimate with this fresh sample, just as in\n", + "standard Q-learning.\n", + "\n", + "Notice several differences from the risk-neutral case:\n", + "\n", + "- The Q-values are **positive** (expectations of exponentials) rather than signed.\n", + "- The optimal policy is $\\sigma(x) = \\arg\\min_a q(x, a)$ — we **minimize**\n", + " rather than maximize, because $\\phi^{-1}$ is decreasing.\n", + "- The observed profit enters through $\\exp(-\\gamma R_{t+1})$ rather than\n", + " additively.\n", + "- The continuation value enters as a **power** $(\\min_{a'} q_t)^\\beta$ rather\n", + " than a scaled sum $\\beta \\cdot \\max_{a'} q_t$.\n", + "\n", + "As before, the agent needs only to observe $x$, $a$, $R_{t+1}$, and\n", + "$X_{t+1}$ — no model knowledge is required.\n", + "\n", + "### Implementation plan\n", + "\n", + "Our implementation follows the same structure as the risk-neutral Q-learning in\n", + "{ref}`inventory_q`, with the modifications above:\n", + "\n", + "1. **Initialize** the Q-table $q$ to ones (since Q-values are positive) and\n", + " visit counts $n$ to zeros.\n", + "2. 
@numba.jit(nopython=True)
def greedy_policy_from_q_rs(q, K):
    """Read off the greedy policy from a risk-sensitive Q-table.

    Risk-sensitive Q-values sit inside the decreasing transform
    exp(-γ ·), so the greedy action at each state minimizes q over the
    feasible orders a ∈ {0, ..., K - x}.
    """
    policy = np.empty(K + 1, dtype=np.int32)
    for state in range(K + 1):
        n_feasible = K - state + 1
        argmin_a = 0
        # Strict comparison keeps the lowest-index action on ties
        for action in range(1, n_feasible):
            if q[state, action] < q[state, argmin_a]:
                argmin_a = action
        policy[state] = argmin_a
    return policy
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73fcd434", + "metadata": {}, + "outputs": [], + "source": [ + "@numba.jit(nopython=True)\n", + "def q_learning_rs_kernel(K, p, c, κ, β, γ, n_steps, X_init,\n", + " ε_init, ε_min, ε_decay, snapshot_steps):\n", + " q = np.ones((K + 1, K + 1)) # positive Q-values, initialized to 1\n", + " n = np.zeros((K + 1, K + 1)) # visit counts for learning rate\n", + " ε = ε_init\n", + "\n", + " n_snaps = len(snapshot_steps)\n", + " snapshots = np.zeros((n_snaps, K + 1), dtype=np.int32)\n", + " snap_idx = 0\n", + "\n", + " # Initialize state and action\n", + " x = X_init\n", + " a = np.random.randint(0, K - x + 1)\n", + "\n", + " for t in range(n_steps):\n", + " # Record policy snapshot if needed\n", + " if snap_idx < n_snaps and t == snapshot_steps[snap_idx]:\n", + " snapshots[snap_idx] = greedy_policy_from_q_rs(q, K)\n", + " snap_idx += 1\n", + "\n", + " # === Draw D_{t+1} and observe outcome ===\n", + " d = np.random.geometric(p) - 1\n", + " reward = min(x, d) - c * a - κ * (a > 0)\n", + " x_next = max(x - d, 0) + a\n", + "\n", + " # === Min over next state (scalar value for update target) ===\n", + " # Also record the argmin action for use by the behavior policy.\n", + " best_next = np.inf\n", + " a_next = 0\n", + " for aa in range(K - x_next + 1):\n", + " if q[x_next, aa] < best_next:\n", + " best_next = q[x_next, aa]\n", + " a_next = aa\n", + "\n", + " # === Risk-sensitive Q-learning update ===\n", + " target = np.exp(-γ * reward) * best_next ** β\n", + " n[x, a] += 1\n", + " α = 1.0 / n[x, a] ** 0.51\n", + " q[x, a] = (1 - α) * q[x, a] + α * target\n", + "\n", + " # === Behavior policy: ε-greedy (uses a_next, the argmin action) ===\n", + " x = x_next\n", + " if np.random.random() < ε:\n", + " a = np.random.randint(0, K - x + 1)\n", + " else:\n", + " a = a_next\n", + " ε = max(ε_min, ε * ε_decay)\n", + "\n", + " return q, snapshots" + ] + }, + { + "cell_type": "markdown", + "id": "72492777", + 
def q_learning_rs(model, n_steps=20_000_000, X_init=0,
                  ε_init=1.0, ε_min=0.01, ε_decay=0.999999,
                  snapshot_steps=None):
    """Unpack the model and run the risk-sensitive Q-learning kernel.

    snapshot_steps, if given, is an int64 array of step indices at which
    the greedy policy is recorded; by default no snapshots are taken.
    """
    x_values, d_values, ϕ_values, p, c, κ, β, γ = model
    K = len(x_values) - 1
    if snapshot_steps is None:
        snapshot_steps = np.array([], dtype=np.int64)
    return q_learning_rs_kernel(K, p, c, κ, β, γ, n_steps, X_init,
                                ε_init, ε_min, ε_decay, snapshot_steps)


# Run Q-learning, recording the policy early, midway, and at the end
np.random.seed(1234)
snap_steps = np.array([10_000, 1_000_000, 19_999_999], dtype=np.int64)
q_table, snapshots = q_learning_rs(model, snapshot_steps=snap_steps)
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dccdb8f4", + "metadata": {}, + "outputs": [], + "source": [ + "K = len(x_values) - 1\n", + "γ_base = model.γ\n", + "# restrict to feasible actions a ∈ {0, ..., K-x}\n", + "v_q = np.array([-(1/γ_base) * np.log(np.min(q_table[x, :K - x + 1]))\n", + " for x in range(K + 1)])\n", + "σ_q = np.array([np.argmin(q_table[x, :K - x + 1])\n", + " for x in range(K + 1)])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a855f91", + "metadata": {}, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(1, 2, figsize=(9.6, 3.2))\n", + "\n", + "axes[0].plot(x_values, v_star, label=\"VFI\")\n", + "axes[0].plot(x_values, v_q, '--', label=\"Q-learning\")\n", + "axes[0].set_xlabel(\"inventory\")\n", + "axes[0].set_ylabel(\"value\")\n", + "axes[0].legend()\n", + "axes[0].set_title(\"Value function\")\n", + "\n", + "axes[1].plot(x_values, σ_star, label=\"VFI\")\n", + "axes[1].plot(x_values, σ_q, '--', label=\"Q-learning\")\n", + "axes[1].set_xlabel(\"inventory\")\n", + "axes[1].set_ylabel(\"order quantity\")\n", + "axes[1].legend()\n", + "axes[1].set_title(\"Policy\")\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "8579f07d", + "metadata": {}, + "source": [ + "### Visualizing learning over time\n", + "\n", + "The panels below show how the agent's policy evolves during training.\n", + "\n", + "Each panel simulates an inventory path using the greedy policy extracted from\n", + "the Q-table at a given training step, with the same demand sequence throughout.\n", + "\n", + "The top panel shows the optimal policy from VFI for reference." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "142d3906", + "metadata": {}, + "outputs": [], + "source": [ + "ts_length = 200\n", + "n_snaps = len(snap_steps)\n", + "fig, axes = plt.subplots(n_snaps + 1, 1, figsize=(8, 2.0 * (n_snaps + 1)),\n", + " sharex=True)\n", + "\n", + "X_init = K // 2\n", + "sim_seed = 5678\n", + "\n", + "# Optimal policy\n", + "np.random.seed(sim_seed)\n", + "X_opt = sim_inventories(ts_length, σ_star, model.p, X_init)\n", + "axes[0].plot(X_opt, alpha=0.7)\n", + "axes[0].set_ylabel(\"inventory\")\n", + "axes[0].set_title(\"Optimal (VFI)\")\n", + "axes[0].set_ylim(0, K + 2)\n", + "\n", + "# Q-learning snapshots\n", + "for i in range(n_snaps):\n", + " σ_snap = snapshots[i]\n", + " np.random.seed(sim_seed)\n", + " X = sim_inventories(ts_length, σ_snap, model.p, X_init)\n", + " axes[i + 1].plot(X, alpha=0.7)\n", + " axes[i + 1].set_ylabel(\"inventory\")\n", + " axes[i + 1].set_title(f\"Step {snap_steps[i]:,}\")\n", + " axes[i + 1].set_ylim(0, K + 2)\n", + "\n", + "axes[-1].set_xlabel(r\"$t$\")\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "274e77d9", + "metadata": {}, + "source": [ + "After 10,000 steps the agent has barely explored and its policy is erratic.\n", + "\n", + "By 1,000,000 steps the learned policy begins to resemble the optimal one, and\n", + "by step 20 million the inventory dynamics are nearly indistinguishable from the\n", + "VFI solution.\n", + "\n", + "Note that the converged policy maintains lower inventory levels than in the\n", + "risk-neutral case (compare with {ref}`inventory_q`), consistent with the\n", + "mechanism discussed above: a risk-sensitive agent caps its exposure to demand\n", + "variance by holding less stock.\n", + "\n", + "## Conclusion\n", + "\n", + "We extended the inventory management problem from {ref}`inventory_q` to\n", + "incorporate risk sensitivity via the certainty equivalent operator\n", + 
"$\\phi^{-1}(\\mathbb{E}[\\phi(\\cdot)])$ with $\\phi(t) = \\exp(-\\gamma t)$.\n", + "\n", + "Value function iteration confirms that risk-sensitive firms order less\n", + "aggressively, preferring predictable profits over higher but more volatile\n", + "returns.\n", + "\n", + "We then showed that Q-learning can be adapted to the risk-sensitive setting by\n", + "working with the transformed Q-factor $q(x,a) =\n", + "\\mathbb{E}[\\exp(-\\gamma(\\pi + \\beta v^*))]$.\n", + "\n", + "The resulting update rule replaces addition with multiplication and max with\n", + "min, but retains the key property of model-free learning: the agent needs only\n", + "to observe states, actions, and profits." + ] + } + ], + "metadata": { + "jupytext": { + "default_lexer": "ipython3" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/lectures/rs_inventory_q.md b/lectures/rs_inventory_q.md new file mode 100644 index 000000000..6a368656b --- /dev/null +++ b/lectures/rs_inventory_q.md @@ -0,0 +1,761 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.18.1 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + +(rs_inventory_q)= +```{raw} jupyter + +``` + +# Risk-Sensitive Inventory Management via Q-Learning + +```{contents} Contents +:depth: 2 +``` + +## Introduction + +In {ref}`inventory_q`, we looked at an inventory management +problem and solved it with both value function iteration and Q-learning. + +In this lecture, we consider a risk-sensitive variation. + +Injection of risk-sensitivity acknowledges the fact +that, in incomplete markets with financial and informational frictions, firms +typically take risk into account in their decision making. + +In other words, the actions of firms are not, in general, risk neutral. 
+ +One natural way to handle this is to use a risk-sensitive version of the Bellman +equation. + +We show how the model can be solved using value function iteration. + +We then investigate how risk sensitivity affects the optimal policy. + + + +```{code-cell} ipython3 +import numpy as np +import numba +import matplotlib.pyplot as plt +from typing import NamedTuple +``` + + +## The Model + +The Bellman equation for the inventory management problem in {ref}`inventory_q` has the form + + +$$ + v(x) + = \max_{a \in \Gamma(x)} \mathbb E + \left[ + \pi(x, a, D) + + \beta v(h(x, a, D)) + \right]. +$$ + +Here $D$ is a random variable with distribution $\phi$. + +(Primitives and definitions are the same as in {ref}`inventory_q`.) + +The risk-sensitive version of this Bellman equation has the form + +$$ + v(x) + = \max_{a \in \Gamma(x)} + \phi^{-1} + \left\{ + \mathbb E \phi + \left[ + \pi(x, a, D) + + \beta v(h(x, a, D)) + \right] + \right\}, +$$ + +where $\phi(t) = \exp(-\gamma t)$ for fixed $\gamma > 0$. + +Since $\phi^{-1}(y) = -\frac{1}{\gamma} \ln(y)$, the Bellman equation becomes + +$$ + v(x) + = \max_{a \in \Gamma(x)} + \left( + -\frac{1}{\gamma} + \right) + \ln + \left\{ + \sum_d \phi(d) \exp + \left[ + -\gamma \left( \pi(x, a, d) + \beta \, v(h(x, a, d)) \right) + \right] + \right\}. +$$ + +Here $\phi(d)$ denotes the demand probability mass function, as in {ref}`inventory_q`. + +The parameter $\gamma$ controls the degree of risk sensitivity. + +As $\gamma \to 0$, the certainty equivalent reduces to the ordinary expectation and we recover the risk-neutral case. + +Larger $\gamma$ means more aversion to downside risk. + +The Bellman operator, greedy policy, and VFI algorithm all carry over from the +risk-neutral case, with the expectation replaced by the certainty equivalent. + + + +## Solving via Value Function Iteration + +### Model specification + +We reuse the same model primitives as in {ref}`inventory_q`, adding $\gamma$ as a parameter. 
+ +```{code-cell} ipython3 +class RSModel(NamedTuple): + x_values: np.ndarray # Inventory values + d_values: np.ndarray # Demand values for summation + ϕ_values: np.ndarray # Demand probabilities + p: float # Demand parameter + c: float # Unit cost + κ: float # Fixed cost + β: float # Discount factor + γ: float # Risk-sensitivity parameter +``` + +```{code-cell} ipython3 +def create_rs_inventory_model( + K: int = 20, # Max inventory + D_MAX: int = 21, # Demand upper bound for summation + p: float = 0.7, + c: float = 0.2, + κ: float = 0.8, + β: float = 0.98, + γ: float = 1.0 + ) -> RSModel: + + def demand_pdf(p, d): + return (1 - p)**d * p + + d_values = np.arange(D_MAX) + ϕ_values = demand_pdf(p, d_values) + x_values = np.arange(K + 1) + + return RSModel(x_values, d_values, ϕ_values, p, c, κ, β, γ) +``` + +### The Bellman operator + +The risk-sensitive Bellman operator replaces the expected value with the certainty equivalent. + +For numerical stability, we use the [log-sum-exp trick](https://en.wikipedia.org/wiki/LogSumExp): given values $z_i = \pi(x, a, d_i) + \beta \, v(h(x, a, d_i))$, we compute + +$$ + -\frac{1}{\gamma} \ln \sum_i \phi(d_i) \exp(-\gamma z_i) + \;=\; + -\frac{1}{\gamma} + \left( + m + \ln \sum_i \phi(d_i) \exp(-\gamma z_i - m) + \right), +$$ + +where $m = \max_i (-\gamma z_i)$. 
+ +```{code-cell} ipython3 +@numba.jit(nopython=True) +def T_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K): + new_v = np.empty(K + 1) + n_d = len(d_values) + for x in range(K + 1): + best = -np.inf + for a in range(K - x + 1): + # Compute -γ * z_i for each demand realization + exponents = np.empty(n_d) + for i in range(n_d): + d = d_values[i] + x_next = max(x - d, 0) + a + revenue = min(x, d) + cost = c * a + κ * (a > 0) + z_i = revenue - cost + β * v[x_next] + exponents[i] = -γ * z_i + # Log-sum-exp trick for numerical stability + m = np.max(exponents) + weighted_sum = 0.0 + for i in range(n_d): + weighted_sum += ϕ_values[i] * np.exp(exponents[i] - m) + val = -(1.0 / γ) * (m + np.log(weighted_sum)) + if val > best: + best = val + new_v[x] = best + return new_v +``` + +```{code-cell} ipython3 +def T_rs(v, model): + """The risk-sensitive Bellman operator.""" + x_values, d_values, ϕ_values, p, c, κ, β, γ = model + K = len(x_values) - 1 + return T_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K) +``` + + +### Computing the greedy policy + +The greedy policy records the maximizing action instead of the maximized value. 
+ +```{code-cell} ipython3 +@numba.jit(nopython=True) +def get_greedy_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K): + σ = np.empty(K + 1, dtype=np.int32) + n_d = len(d_values) + for x in range(K + 1): + best = -np.inf + best_a = 0 + for a in range(K - x + 1): + exponents = np.empty(n_d) + for i in range(n_d): + d = d_values[i] + x_next = max(x - d, 0) + a + revenue = min(x, d) + cost = c * a + κ * (a > 0) + z_i = revenue - cost + β * v[x_next] + exponents[i] = -γ * z_i + m = np.max(exponents) + weighted_sum = 0.0 + for i in range(n_d): + weighted_sum += ϕ_values[i] * np.exp(exponents[i] - m) + val = -(1.0 / γ) * (m + np.log(weighted_sum)) + if val > best: + best = val + best_a = a + σ[x] = best_a + return σ +``` + +```{code-cell} ipython3 +def get_greedy_rs(v, model): + """Get a v-greedy policy for the risk-sensitive model.""" + x_values, d_values, ϕ_values, p, c, κ, β, γ = model + K = len(x_values) - 1 + return get_greedy_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K) +``` + +### Value function iteration + +```{code-cell} ipython3 +def solve_rs_inventory_model(v_init, model, max_iter=10_000, tol=1e-6): + v = v_init.copy() + i, error = 0, tol + 1 + + while i < max_iter and error > tol: + new_v = T_rs(v, model) + error = np.max(np.abs(new_v - v)) + i += 1 + v = new_v + + print(f"Converged in {i} iterations with error {error:.2e}") + + σ = get_greedy_rs(v, model) + return v, σ +``` + +### Creating and solving an instance + +```{code-cell} ipython3 +model = create_rs_inventory_model() +x_values = model.x_values +n_x = len(x_values) +v_init = np.zeros(n_x) +``` + +```{code-cell} ipython3 +v_star, σ_star = solve_rs_inventory_model(v_init, model) +``` + +### Effect of risk sensitivity on the optimal policy + +We solve the model for several values of $\gamma$ and compare the resulting policies. + +As we will see, a risk-sensitive firm orders less aggressively than a risk-neutral one. 
+
+```{code-cell} ipython3
+γ_values = [0.01, 1.0, 2.0]
+results = {}
+
+for γ in γ_values:
+    mod = create_rs_inventory_model(γ=γ)
+    v, σ = solve_rs_inventory_model(np.zeros(n_x), mod)
+    results[γ] = (v, σ)
+```
+
+```{code-cell} ipython3
+fig, axes = plt.subplots(1, 2, figsize=(9.6, 3.2))
+
+for γ in γ_values:
+    v, σ = results[γ]
+    axes[0].plot(x_values, v, label=f"$\\gamma = {γ}$")
+    axes[1].plot(x_values, σ, label=f"$\\gamma = {γ}$")
+
+axes[0].set_xlabel("inventory")
+axes[0].set_ylabel("value")
+axes[0].legend()
+axes[0].set_title("Value function")
+
+axes[1].set_xlabel("inventory")
+axes[1].set_ylabel("order quantity")
+axes[1].legend()
+axes[1].set_title("Policy")
+
+plt.tight_layout()
+plt.show()
+```
+
+### Simulating the optimal policy
+
+We simulate inventory dynamics under the optimal policy for each value of $\gamma$, using a common random seed so that the paths are comparable.
+
+```{code-cell} ipython3
+@numba.jit(nopython=True)
+def sim_inventories(ts_length, σ, p, X_init=0):
+    """Simulate inventory dynamics under policy σ."""
+    X = np.zeros(ts_length, dtype=np.int32)
+    X[0] = X_init
+    for t in range(ts_length - 1):
+        d = np.random.geometric(p) - 1
+        X[t+1] = max(X[t] - d, 0) + σ[X[t]]
+    return X
+```
+
+```{code-cell} ipython3
+fig, axes = plt.subplots(len(γ_values), 1,
+                         figsize=(8, 2.0 * len(γ_values)),
+                         sharex=True)
+
+ts_length = 200
+sim_seed = 5678
+K = len(x_values) - 1
+
+for i, γ in enumerate(γ_values):
+    v, σ = results[γ]
+    np.random.seed(sim_seed)
+    X = sim_inventories(ts_length, σ, model.p, X_init=K // 2)
+    axes[i].plot(X, alpha=0.7)
+    axes[i].set_ylabel("inventory")
+    axes[i].set_title(f"$\\gamma = {γ}$")
+    axes[i].set_ylim(0, K + 2)
+
+axes[-1].set_xlabel(r"$t$")
+plt.tight_layout()
+plt.show()
+```
+
+## Interpreting the Outcomes
+
+The plots above show that a more risk-sensitive firm (larger $\gamma$) orders
+less inventory and maintains lower stock levels.
+ +At first glance this may seem surprising: wouldn't holding more inventory +reduce variance by ensuring the firm can always meet demand? + +The key is to identify where the randomness in profits actually comes from. + +Recall that per-period profit is $\pi(x, a, d) = \min(x, d) - ca - \kappa +\mathbf{1}\{a > 0\}$. + +The ordering cost $ca + \kappa \mathbf{1}\{a > 0\}$ is **deterministic** — it +is chosen before the demand shock is realized. + +So higher ordering shifts the level of profits down but does not affect their +variance. + +The variance comes from **revenue**: $\min(x, D)$. + +When inventory $x$ is high, $\min(x, D) \approx D$ for most demand +realizations — revenue inherits the full variance of demand. + +When inventory $x$ is low, $\min(x, D) \approx x$ for most realizations — +revenue is nearly deterministic, capped at the inventory level. + +A risk-sensitive agent therefore prefers lower inventory because it **caps the +randomness of revenue**. + +The agent accepts lower expected sales in exchange for more predictable profits. + +There is also a continuation value channel: next-period inventory $\max(x - D, +0) + a$ varies with $D$, and higher $x$ means $x - D$ tracks $D$ more +closely, propagating that variance forward through $v$. + + +## Q-Learning + +We now ask whether the optimal policy can be learned without knowledge of the +model, as we did in the risk-neutral case in {ref}`inventory_q`. + +### The Q-factor + +The first step is to define the Q-factor in a way that is compatible with the +risk-sensitive Bellman equation. + +We define + +$$ + q(x, a) := \mathbb E + \left[ + \exp\!\left( + -\gamma \left( \pi(x, a, D) + \beta \, v^*(h(x, a, D)) \right) + \right) + \right]. +$$ + +In words, $q(x, a)$ applies the risk-sensitivity transformation $\phi(t) = +\exp(-\gamma t)$ inside the expectation, evaluated at the return from taking +action $a$ in state $x$ and following the optimal policy thereafter. 
+ +### Deriving the Q-factor Bellman equation + +Our goal is to obtain a fixed point equation in $q$ alone, eliminating $v^*$. + +**Step 1.** Express $v^*$ in terms of $q$. + +The risk-sensitive Bellman equation says $v^*(x) = \max_{a \in \Gamma(x)} +\phi^{-1}(q(x, a))$. + +Since $\phi^{-1}(y) = -\frac{1}{\gamma} \ln(y)$ is a **decreasing** function, +the maximum over $a$ of $\phi^{-1}(q(x, a))$ corresponds to the **minimum** +over $a$ of $q(x, a)$: + +$$ + v^*(x) + = \phi^{-1}\!\left(\min_{a \in \Gamma(x)} q(x, a)\right) + = -\frac{1}{\gamma} \ln\!\left(\min_{a \in \Gamma(x)} q(x, a)\right). +$$ + +Equivalently, + +$$ + \exp(-\gamma \, v^*(x)) = \min_{a \in \Gamma(x)} q(x, a). +$$ + +**Step 2.** Substitute back into the definition of $q$ to eliminate $v^*$. + +Expanding the exponential in the definition of $q$, + +$$ + q(x, a) + = \mathbb E + \left[ + \exp(-\gamma \, \pi(x, a, D)) + \;\cdot\; + \exp\!\left(-\gamma \beta \, v^*(x')\right) + \right] +$$ + +where $x' = h(x, a, D)$. + +From Step 1, $\exp(-\gamma \, v^*(x')) = \min_{a' \in \Gamma(x')} q(x', a')$, +so $\exp(-\gamma \beta \, v^*(x')) = \left[\min_{a' \in \Gamma(x')} q(x', +a')\right]^\beta$. + +Substituting, + +$$ + q(x, a) + = \mathbb E + \left[ + \exp(-\gamma \, \pi(x, a, D)) + \;\cdot\; + \left(\min_{a' \in \Gamma(x')} q(x', a')\right)^\beta + \right]. +$$ + +This is a fixed point equation in $q$ alone — $v^*$ has been eliminated. + +### The Q-learning update rule + +As in the risk-neutral case, we approximate the fixed point using stochastic +approximation. + +At each step, the agent is in state $x$, takes action $a$, observes profit +$R_{t+1} = \pi(x, a, D_{t+1})$ and next state $X_{t+1} = h(x, a, D_{t+1})$, +and updates + +$$ + q_{t+1}(x, a) + = (1 - \alpha_t) \, q_t(x, a) + + \alpha_t + \left[ + \exp(-\gamma \, R_{t+1}) + \;\cdot\; + \left(\min_{a' \in \Gamma(X_{t+1})} q_t(X_{t+1}, a')\right)^\beta + \right]. 
+$$ + +The term in brackets is a single-sample estimate of the right-hand side of the +Q-factor Bellman equation. + +The update blends the current estimate with this fresh sample, just as in +standard Q-learning. + +Notice several differences from the risk-neutral case: + +- The Q-values are **positive** (expectations of exponentials) rather than signed. +- The optimal policy is $\sigma(x) = \arg\min_a q(x, a)$ — we **minimize** + rather than maximize, because $\phi^{-1}$ is decreasing. +- The observed profit enters through $\exp(-\gamma R_{t+1})$ rather than + additively. +- The continuation value enters as a **power** $(\min_{a'} q_t)^\beta$ rather + than a scaled sum $\beta \cdot \max_{a'} q_t$. + +As before, the agent needs only to observe $x$, $a$, $R_{t+1}$, and +$X_{t+1}$ — no model knowledge is required. + +### Implementation plan + +Our implementation follows the same structure as the risk-neutral Q-learning in +{ref}`inventory_q`, with the modifications above: + +1. **Initialize** the Q-table $q$ to ones (since Q-values are positive) and + visit counts $n$ to zeros. +2. **At each step:** + - Draw demand $D_{t+1}$ and compute observed profit $R_{t+1}$ and next state + $X_{t+1}$. + - Compute $\min_{a'} q_t(X_{t+1}, a')$ over feasible actions (this is a + scalar for the update target, and the $\arg\min$ action is used by the + $\varepsilon$-greedy behavior policy). + - Update $q_t(x, a)$ using the rule above, with learning rate + $\alpha_t = 1 / n_t(x, a)^{0.51}$. + - Choose the next action via $\varepsilon$-greedy: with probability + $\varepsilon$ pick a random feasible action, otherwise pick the + $\arg\min$ action. + - Decay $\varepsilon$. +3. **Extract the greedy policy** from the final Q-table via + $\sigma(x) = \arg\min_{a \in \Gamma(x)} q(x, a)$. +4. **Compare** the learned policy against the VFI solution. + +### Implementation + +We first define a helper to extract the greedy policy from the Q-table. 
+ +Since the optimal policy minimizes $q$, we use $\arg\min$ rather than $\arg\max$. + +```{code-cell} ipython3 +@numba.jit(nopython=True) +def greedy_policy_from_q_rs(q, K): + """Extract greedy policy from risk-sensitive Q-table (argmin).""" + σ = np.empty(K + 1, dtype=np.int32) + for x in range(K + 1): + best_val = np.inf + best_a = 0 + for a in range(K - x + 1): + if q[x, a] < best_val: + best_val = q[x, a] + best_a = a + σ[x] = best_a + return σ +``` + +The Q-learning loop mirrors the risk-neutral version, with the key changes: +Q-table initialized to ones, the update target uses $\exp(-\gamma R_{t+1}) +\cdot (\min_{a'} q_t)^\beta$, and the behavior policy follows the $\arg\min$. + +```{code-cell} ipython3 +@numba.jit(nopython=True) +def q_learning_rs_kernel(K, p, c, κ, β, γ, n_steps, X_init, + ε_init, ε_min, ε_decay, snapshot_steps): + q = np.ones((K + 1, K + 1)) # positive Q-values, initialized to 1 + n = np.zeros((K + 1, K + 1)) # visit counts for learning rate + ε = ε_init + + n_snaps = len(snapshot_steps) + snapshots = np.zeros((n_snaps, K + 1), dtype=np.int32) + snap_idx = 0 + + # Initialize state and action + x = X_init + a = np.random.randint(0, K - x + 1) + + for t in range(n_steps): + # Record policy snapshot if needed + if snap_idx < n_snaps and t == snapshot_steps[snap_idx]: + snapshots[snap_idx] = greedy_policy_from_q_rs(q, K) + snap_idx += 1 + + # === Draw D_{t+1} and observe outcome === + d = np.random.geometric(p) - 1 + reward = min(x, d) - c * a - κ * (a > 0) + x_next = max(x - d, 0) + a + + # === Min over next state (scalar value for update target) === + # Also record the argmin action for use by the behavior policy. 
+ best_next = np.inf + a_next = 0 + for aa in range(K - x_next + 1): + if q[x_next, aa] < best_next: + best_next = q[x_next, aa] + a_next = aa + + # === Risk-sensitive Q-learning update === + target = np.exp(-γ * reward) * best_next ** β + n[x, a] += 1 + α = 1.0 / n[x, a] ** 0.51 + q[x, a] = (1 - α) * q[x, a] + α * target + + # === Behavior policy: ε-greedy (uses a_next, the argmin action) === + x = x_next + if np.random.random() < ε: + a = np.random.randint(0, K - x + 1) + else: + a = a_next + ε = max(ε_min, ε * ε_decay) + + return q, snapshots +``` + +The wrapper function unpacks the model and provides default hyperparameters. + +```{code-cell} ipython3 +def q_learning_rs(model, n_steps=20_000_000, X_init=0, + ε_init=1.0, ε_min=0.01, ε_decay=0.999999, + snapshot_steps=None): + x_values, d_values, ϕ_values, p, c, κ, β, γ = model + K = len(x_values) - 1 + if snapshot_steps is None: + snapshot_steps = np.array([], dtype=np.int64) + return q_learning_rs_kernel(K, p, c, κ, β, γ, n_steps, X_init, + ε_init, ε_min, ε_decay, snapshot_steps) +``` + +### Running Q-learning + +We run 20 million steps and take policy snapshots at steps 10,000, 1,000,000, and at the end. + +```{code-cell} ipython3 +np.random.seed(1234) +snap_steps = np.array([10_000, 1_000_000, 19_999_999], dtype=np.int64) +q_table, snapshots = q_learning_rs(model, snapshot_steps=snap_steps) +``` + +### Comparing with the exact solution + +We extract the value function and policy from the final Q-table. + +Since Q-values represent $\mathbb{E}[\exp(-\gamma(\cdots))]$, we recover the +value function via $v_Q(x) = -\frac{1}{\gamma} \ln(\min_{a} q(x, a))$ and the +policy via $\sigma_Q(x) = \arg\min_a q(x, a)$. 
+ +```{code-cell} ipython3 +K = len(x_values) - 1 +γ_base = model.γ +# restrict to feasible actions a ∈ {0, ..., K-x} +v_q = np.array([-(1/γ_base) * np.log(np.min(q_table[x, :K - x + 1])) + for x in range(K + 1)]) +σ_q = np.array([np.argmin(q_table[x, :K - x + 1]) + for x in range(K + 1)]) +``` + +```{code-cell} ipython3 +fig, axes = plt.subplots(1, 2, figsize=(9.6, 3.2)) + +axes[0].plot(x_values, v_star, label="VFI") +axes[0].plot(x_values, v_q, '--', label="Q-learning") +axes[0].set_xlabel("inventory") +axes[0].set_ylabel("value") +axes[0].legend() +axes[0].set_title("Value function") + +axes[1].plot(x_values, σ_star, label="VFI") +axes[1].plot(x_values, σ_q, '--', label="Q-learning") +axes[1].set_xlabel("inventory") +axes[1].set_ylabel("order quantity") +axes[1].legend() +axes[1].set_title("Policy") + +plt.tight_layout() +plt.show() +``` + +### Visualizing learning over time + +The panels below show how the agent's policy evolves during training. + +Each panel simulates an inventory path using the greedy policy extracted from +the Q-table at a given training step, with the same demand sequence throughout. + +The top panel shows the optimal policy from VFI for reference. 
+ +```{code-cell} ipython3 +ts_length = 200 +n_snaps = len(snap_steps) +fig, axes = plt.subplots(n_snaps + 1, 1, figsize=(8, 2.0 * (n_snaps + 1)), + sharex=True) + +X_init = K // 2 +sim_seed = 5678 + +# Optimal policy +np.random.seed(sim_seed) +X_opt = sim_inventories(ts_length, σ_star, model.p, X_init) +axes[0].plot(X_opt, alpha=0.7) +axes[0].set_ylabel("inventory") +axes[0].set_title("Optimal (VFI)") +axes[0].set_ylim(0, K + 2) + +# Q-learning snapshots +for i in range(n_snaps): + σ_snap = snapshots[i] + np.random.seed(sim_seed) + X = sim_inventories(ts_length, σ_snap, model.p, X_init) + axes[i + 1].plot(X, alpha=0.7) + axes[i + 1].set_ylabel("inventory") + axes[i + 1].set_title(f"Step {snap_steps[i]:,}") + axes[i + 1].set_ylim(0, K + 2) + +axes[-1].set_xlabel(r"$t$") +plt.tight_layout() +plt.show() +``` + +After 10,000 steps the agent has barely explored and its policy is erratic. + +By 1,000,000 steps the learned policy begins to resemble the optimal one, and +by step 20 million the inventory dynamics are nearly indistinguishable from the +VFI solution. + +Note that the converged policy maintains lower inventory levels than in the +risk-neutral case (compare with {ref}`inventory_q`), consistent with the +mechanism discussed above: a risk-sensitive agent caps its exposure to demand +variance by holding less stock. + +## Conclusion + +We extended the inventory management problem from {ref}`inventory_q` to +incorporate risk sensitivity via the certainty equivalent operator +$\phi^{-1}(\mathbb{E}[\phi(\cdot)])$ with $\phi(t) = \exp(-\gamma t)$. + +Value function iteration confirms that risk-sensitive firms order less +aggressively, preferring predictable profits over higher but more volatile +returns. + +We then showed that Q-learning can be adapted to the risk-sensitive setting by +working with the transformed Q-factor $q(x,a) = +\mathbb{E}[\exp(-\gamma(\pi + \beta v^*))]$. 
+ +The resulting update rule replaces addition with multiplication and max with +min, but retains the key property of model-free learning: the agent needs only +to observe states, actions, and profits. diff --git a/lectures/rs_inventory_q.py b/lectures/rs_inventory_q.py new file mode 100644 index 000000000..82a1ba74f --- /dev/null +++ b/lectures/rs_inventory_q.py @@ -0,0 +1,767 @@ +# --- +# jupyter: +# jupytext: +# default_lexer: ipython3 +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.18.1 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# (rs_inventory_q)= +# ```{raw} jupyter +#
+# +# QuantEcon +# +#
+# ``` +# +# # Risk-Sensitive Inventory Management via Q-Learning +# +# ```{contents} Contents +# :depth: 2 +# ``` +# +# ## Introduction +# +# In {ref}`inventory_q`, we looked at an inventory management +# problem and solved it with both value function iteration and Q-learning. +# +# In this lecture, we consider a risk-sensitive variation. +# +# Injection of risk-sensitivity acknowledges the fact +# that, in incomplete markets with financial and informational frictions, firms +# typically take risk into account in their decision making. +# +# In other words, the actions of firms are not, in general, risk neutral. +# +# One natural way to handle this is to use a risk-sensitive version of the Bellman +# equation. +# +# We show how the model can be solved using value function iteration. +# +# We then investigate how risk sensitivity affects the optimal policy. + +# %% +import numpy as np +import numba +import matplotlib.pyplot as plt +from typing import NamedTuple + + +# %% [markdown] +# ## The Model +# +# The Bellman equation for the inventory management problem in {ref}`inventory_q` has the form +# +# +# $$ +# v(x) +# = \max_{a \in \Gamma(x)} \mathbb E +# \left[ +# \pi(x, a, D) +# + \beta v(h(x, a, D)) +# \right]. +# $$ +# +# Here $D$ is a random variable with distribution $\phi$. +# +# (Primitives and definitions are the same as in {ref}`inventory_q`.) +# +# The risk-sensitive version of this Bellman equation has the form +# +# $$ +# v(x) +# = \max_{a \in \Gamma(x)} +# \phi^{-1} +# \left\{ +# \mathbb E \phi +# \left[ +# \pi(x, a, D) +# + \beta v(h(x, a, D)) +# \right] +# \right\}, +# $$ +# +# where $\phi(t) = \exp(-\gamma t)$ for fixed $\gamma > 0$. +# +# Since $\phi^{-1}(y) = -\frac{1}{\gamma} \ln(y)$, the Bellman equation becomes +# +# $$ +# v(x) +# = \max_{a \in \Gamma(x)} +# \left( +# -\frac{1}{\gamma} +# \right) +# \ln +# \left\{ +# \sum_d \phi(d) \exp +# \left[ +# -\gamma \left( \pi(x, a, d) + \beta \, v(h(x, a, d)) \right) +# \right] +# \right\}. 
+# $$ +# +# Here $\phi(d)$ denotes the demand probability mass function, as in {ref}`inventory_q`. +# +# The parameter $\gamma$ controls the degree of risk sensitivity. +# +# As $\gamma \to 0$, the certainty equivalent reduces to the ordinary expectation and we recover the risk-neutral case. +# +# Larger $\gamma$ means more aversion to downside risk. +# +# The Bellman operator, greedy policy, and VFI algorithm all carry over from the +# risk-neutral case, with the expectation replaced by the certainty equivalent. +# +# +# +# ## Solving via Value Function Iteration +# +# ### Model specification +# +# We reuse the same model primitives as in {ref}`inventory_q`, adding $\gamma$ as a parameter. + +# %% +class RSModel(NamedTuple): + x_values: np.ndarray # Inventory values + d_values: np.ndarray # Demand values for summation + ϕ_values: np.ndarray # Demand probabilities + p: float # Demand parameter + c: float # Unit cost + κ: float # Fixed cost + β: float # Discount factor + γ: float # Risk-sensitivity parameter + + +# %% +def create_rs_inventory_model( + K: int = 20, # Max inventory + D_MAX: int = 21, # Demand upper bound for summation + p: float = 0.7, + c: float = 0.2, + κ: float = 0.8, + β: float = 0.98, + γ: float = 1.0 + ) -> RSModel: + + def demand_pdf(p, d): + return (1 - p)**d * p + + d_values = np.arange(D_MAX) + ϕ_values = demand_pdf(p, d_values) + x_values = np.arange(K + 1) + + return RSModel(x_values, d_values, ϕ_values, p, c, κ, β, γ) + + +# %% [markdown] +# ### The Bellman operator +# +# The risk-sensitive Bellman operator replaces the expected value with the certainty equivalent. 
+# +# For numerical stability, we use the [log-sum-exp trick](https://en.wikipedia.org/wiki/LogSumExp): given values $z_i = \pi(x, a, d_i) + \beta \, v(h(x, a, d_i))$, we compute +# +# $$ +# -\frac{1}{\gamma} \ln \sum_i \phi(d_i) \exp(-\gamma z_i) +# \;=\; +# -\frac{1}{\gamma} +# \left( +# m + \ln \sum_i \phi(d_i) \exp(-\gamma z_i - m) +# \right), +# $$ +# +# where $m = \max_i (-\gamma z_i)$. + +# %% +@numba.jit(nopython=True) +def T_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K): + new_v = np.empty(K + 1) + n_d = len(d_values) + for x in range(K + 1): + best = -np.inf + for a in range(K - x + 1): + # Compute -γ * z_i for each demand realization + exponents = np.empty(n_d) + for i in range(n_d): + d = d_values[i] + x_next = max(x - d, 0) + a + revenue = min(x, d) + cost = c * a + κ * (a > 0) + z_i = revenue - cost + β * v[x_next] + exponents[i] = -γ * z_i + # Log-sum-exp trick for numerical stability + m = np.max(exponents) + weighted_sum = 0.0 + for i in range(n_d): + weighted_sum += ϕ_values[i] * np.exp(exponents[i] - m) + val = -(1.0 / γ) * (m + np.log(weighted_sum)) + if val > best: + best = val + new_v[x] = best + return new_v + + +# %% +def T_rs(v, model): + """The risk-sensitive Bellman operator.""" + x_values, d_values, ϕ_values, p, c, κ, β, γ = model + K = len(x_values) - 1 + return T_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K) + + +# %% [markdown] +# ### Computing the greedy policy +# +# The greedy policy records the maximizing action instead of the maximized value. 
+ +# %% +@numba.jit(nopython=True) +def get_greedy_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K): + σ = np.empty(K + 1, dtype=np.int32) + n_d = len(d_values) + for x in range(K + 1): + best = -np.inf + best_a = 0 + for a in range(K - x + 1): + exponents = np.empty(n_d) + for i in range(n_d): + d = d_values[i] + x_next = max(x - d, 0) + a + revenue = min(x, d) + cost = c * a + κ * (a > 0) + z_i = revenue - cost + β * v[x_next] + exponents[i] = -γ * z_i + m = np.max(exponents) + weighted_sum = 0.0 + for i in range(n_d): + weighted_sum += ϕ_values[i] * np.exp(exponents[i] - m) + val = -(1.0 / γ) * (m + np.log(weighted_sum)) + if val > best: + best = val + best_a = a + σ[x] = best_a + return σ + + +# %% +def get_greedy_rs(v, model): + """Get a v-greedy policy for the risk-sensitive model.""" + x_values, d_values, ϕ_values, p, c, κ, β, γ = model + K = len(x_values) - 1 + return get_greedy_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K) + + +# %% [markdown] +# ### Value function iteration + +# %% +def solve_rs_inventory_model(v_init, model, max_iter=10_000, tol=1e-6): + v = v_init.copy() + i, error = 0, tol + 1 + + while i < max_iter and error > tol: + new_v = T_rs(v, model) + error = np.max(np.abs(new_v - v)) + i += 1 + v = new_v + + print(f"Converged in {i} iterations with error {error:.2e}") + + σ = get_greedy_rs(v, model) + return v, σ + + +# %% [markdown] +# ### Creating and solving an instance + +# %% +model = create_rs_inventory_model() +x_values = model.x_values +n_x = len(x_values) +v_init = np.zeros(n_x) + +# %% +v_star, σ_star = solve_rs_inventory_model(v_init, model) + +# %% [markdown] +# ### Effect of risk sensitivity on the optimal policy +# +# We solve the model for several values of $\gamma$ and compare the resulting policies. +# +# As we will see, a risk-sensitive firm orders less aggressively than a risk-neutral one. 
+ +# %% +γ_values = [0.01, 1.0, 2.0] +results = {} + +for γ in γ_values: + mod = create_rs_inventory_model(γ=γ) + v, σ = solve_rs_inventory_model(np.zeros(n_x), mod) + results[γ] = (v, σ) + +# %% +fig, axes = plt.subplots(1, 2, figsize=(9.6, 3.2)) + +for γ in γ_values: + v, σ = results[γ] + axes[0].plot(x_values, v, label=f"$\\gamma = {γ}$") + axes[1].plot(x_values, σ, label=f"$\\gamma = {γ}$") + +axes[0].set_xlabel("inventory") +axes[0].set_ylabel("value") +axes[0].legend() +axes[0].set_title("Value function") + +axes[1].set_xlabel("inventory") +axes[1].set_ylabel("order quantity") +axes[1].legend() +axes[1].set_title("Policy") + +plt.tight_layout() +plt.show() + + +# %% [markdown] +# ### Simulating the optimal policy +# +# We simulate inventory dynamics under the optimal policy for the baseline $\gamma$. + +# %% +@numba.jit(nopython=True) +def sim_inventories(ts_length, σ, p, X_init=0): + """Simulate inventory dynamics under policy σ.""" + X = np.zeros(ts_length, dtype=np.int32) + X[0] = X_init + for t in range(ts_length - 1): + d = np.random.geometric(p) - 1 + X[t+1] = max(X[t] - d, 0) + σ[X[t]] + return X + + +# %% +fig, axes = plt.subplots(len(γ_values), 1, + figsize=(8, 2.0 * len(γ_values)), + sharex=True) + +ts_length = 200 +sim_seed = 5678 +K = len(x_values) - 1 + +for i, γ in enumerate(γ_values): + v, σ = results[γ] + np.random.seed(sim_seed) + X = sim_inventories(ts_length, σ, model.p, X_init=K // 2) + axes[i].plot(X, alpha=0.7) + axes[i].set_ylabel("inventory") + axes[i].set_title(f"$\\gamma = {γ}$") + axes[i].set_ylim(0, K + 2) + +axes[-1].set_xlabel(r"$t$") +plt.tight_layout() +plt.show() + + +# %% [markdown] +# ## Interpreting the Outcomes +# +# The plots above show that a more risk-sensitive firm (larger $\gamma$) orders +# less inventory and maintains lower stock levels. +# +# At first glance this may seem surprising: wouldn't holding more inventory +# reduce variance by ensuring the firm can always meet demand? 
+# +# The key is to identify where the randomness in profits actually comes from. +# +# Recall that per-period profit is $\pi(x, a, d) = \min(x, d) - ca - \kappa +# \mathbf{1}\{a > 0\}$. +# +# The ordering cost $ca + \kappa \mathbf{1}\{a > 0\}$ is **deterministic** — it +# is chosen before the demand shock is realized. +# +# So higher ordering shifts the level of profits down but does not affect their +# variance. +# +# The variance comes from **revenue**: $\min(x, D)$. +# +# When inventory $x$ is high, $\min(x, D) \approx D$ for most demand +# realizations — revenue inherits the full variance of demand. +# +# When inventory $x$ is low, $\min(x, D) \approx x$ for most realizations — +# revenue is nearly deterministic, capped at the inventory level. +# +# A risk-sensitive agent therefore prefers lower inventory because it **caps the +# randomness of revenue**. +# +# The agent accepts lower expected sales in exchange for more predictable profits. +# +# There is also a continuation value channel: next-period inventory $\max(x - D, +# 0) + a$ varies with $D$, and higher $x$ means $x - D$ tracks $D$ more +# closely, propagating that variance forward through $v$. +# +# +# ## Q-Learning +# +# We now ask whether the optimal policy can be learned without knowledge of the +# model, as we did in the risk-neutral case in {ref}`inventory_q`. +# +# ### The Q-factor +# +# The first step is to define the Q-factor in a way that is compatible with the +# risk-sensitive Bellman equation. +# +# We define +# +# $$ +# q(x, a) := \mathbb E +# \left[ +# \exp\!\left( +# -\gamma \left( \pi(x, a, D) + \beta \, v^*(h(x, a, D)) \right) +# \right) +# \right]. +# $$ +# +# In words, $q(x, a)$ applies the risk-sensitivity transformation $\phi(t) = +# \exp(-\gamma t)$ inside the expectation, evaluated at the return from taking +# action $a$ in state $x$ and following the optimal policy thereafter. 
+# +# ### Deriving the Q-factor Bellman equation +# +# Our goal is to obtain a fixed point equation in $q$ alone, eliminating $v^*$. +# +# **Step 1.** Express $v^*$ in terms of $q$. +# +# The risk-sensitive Bellman equation says $v^*(x) = \max_{a \in \Gamma(x)} +# \phi^{-1}(q(x, a))$. +# +# Since $\phi^{-1}(y) = -\frac{1}{\gamma} \ln(y)$ is a **decreasing** function, +# the maximum over $a$ of $\phi^{-1}(q(x, a))$ corresponds to the **minimum** +# over $a$ of $q(x, a)$: +# +# $$ +# v^*(x) +# = \phi^{-1}\!\left(\min_{a \in \Gamma(x)} q(x, a)\right) +# = -\frac{1}{\gamma} \ln\!\left(\min_{a \in \Gamma(x)} q(x, a)\right). +# $$ +# +# Equivalently, +# +# $$ +# \exp(-\gamma \, v^*(x)) = \min_{a \in \Gamma(x)} q(x, a). +# $$ +# +# **Step 2.** Substitute back into the definition of $q$ to eliminate $v^*$. +# +# Expanding the exponential in the definition of $q$, +# +# $$ +# q(x, a) +# = \mathbb E +# \left[ +# \exp(-\gamma \, \pi(x, a, D)) +# \;\cdot\; +# \exp\!\left(-\gamma \beta \, v^*(x')\right) +# \right] +# $$ +# +# where $x' = h(x, a, D)$. +# +# From Step 1, $\exp(-\gamma \, v^*(x')) = \min_{a' \in \Gamma(x')} q(x', a')$, +# so $\exp(-\gamma \beta \, v^*(x')) = \left[\min_{a' \in \Gamma(x')} q(x', +# a')\right]^\beta$. +# +# Substituting, +# +# $$ +# q(x, a) +# = \mathbb E +# \left[ +# \exp(-\gamma \, \pi(x, a, D)) +# \;\cdot\; +# \left(\min_{a' \in \Gamma(x')} q(x', a')\right)^\beta +# \right]. +# $$ +# +# This is a fixed point equation in $q$ alone — $v^*$ has been eliminated. +# +# ### The Q-learning update rule +# +# As in the risk-neutral case, we approximate the fixed point using stochastic +# approximation. 
+# +# At each step, the agent is in state $x$, takes action $a$, observes profit +# $R_{t+1} = \pi(x, a, D_{t+1})$ and next state $X_{t+1} = h(x, a, D_{t+1})$, +# and updates +# +# $$ +# q_{t+1}(x, a) +# = (1 - \alpha_t) \, q_t(x, a) +# + \alpha_t +# \left[ +# \exp(-\gamma \, R_{t+1}) +# \;\cdot\; +# \left(\min_{a' \in \Gamma(X_{t+1})} q_t(X_{t+1}, a')\right)^\beta +# \right]. +# $$ +# +# The term in brackets is a single-sample estimate of the right-hand side of the +# Q-factor Bellman equation. +# +# The update blends the current estimate with this fresh sample, just as in +# standard Q-learning. +# +# Notice several differences from the risk-neutral case: +# +# - The Q-values are **positive** (expectations of exponentials) rather than signed. +# - The optimal policy is $\sigma(x) = \arg\min_a q(x, a)$ — we **minimize** +# rather than maximize, because $\phi^{-1}$ is decreasing. +# - The observed profit enters through $\exp(-\gamma R_{t+1})$ rather than +# additively. +# - The continuation value enters as a **power** $(\min_{a'} q_t)^\beta$ rather +# than a scaled sum $\beta \cdot \max_{a'} q_t$. +# +# As before, the agent needs only to observe $x$, $a$, $R_{t+1}$, and +# $X_{t+1}$ — no model knowledge is required. +# +# ### Implementation plan +# +# Our implementation follows the same structure as the risk-neutral Q-learning in +# {ref}`inventory_q`, with the modifications above: +# +# 1. **Initialize** the Q-table $q$ to ones (since Q-values are positive) and +# visit counts $n$ to zeros. +# 2. **At each step:** +# - Draw demand $D_{t+1}$ and compute observed profit $R_{t+1}$ and next state +# $X_{t+1}$. +# - Compute $\min_{a'} q_t(X_{t+1}, a')$ over feasible actions (this is a +# scalar for the update target, and the $\arg\min$ action is used by the +# $\varepsilon$-greedy behavior policy). +# - Update $q_t(x, a)$ using the rule above, with learning rate +# $\alpha_t = 1 / n_t(x, a)^{0.51}$. 
+# - Choose the next action via $\varepsilon$-greedy: with probability +# $\varepsilon$ pick a random feasible action, otherwise pick the +# $\arg\min$ action. +# - Decay $\varepsilon$. +# 3. **Extract the greedy policy** from the final Q-table via +# $\sigma(x) = \arg\min_{a \in \Gamma(x)} q(x, a)$. +# 4. **Compare** the learned policy against the VFI solution. +# +# ### Implementation +# +# We first define a helper to extract the greedy policy from the Q-table. +# +# Since the optimal policy minimizes $q$, we use $\arg\min$ rather than $\arg\max$. + +# %% +@numba.jit(nopython=True) +def greedy_policy_from_q_rs(q, K): + """Extract greedy policy from risk-sensitive Q-table (argmin).""" + σ = np.empty(K + 1, dtype=np.int32) + for x in range(K + 1): + best_val = np.inf + best_a = 0 + for a in range(K - x + 1): + if q[x, a] < best_val: + best_val = q[x, a] + best_a = a + σ[x] = best_a + return σ + + +# %% [markdown] +# The Q-learning loop mirrors the risk-neutral version, with the key changes: +# Q-table initialized to ones, the update target uses $\exp(-\gamma R_{t+1}) +# \cdot (\min_{a'} q_t)^\beta$, and the behavior policy follows the $\arg\min$. 
+ +# %% +@numba.jit(nopython=True) +def q_learning_rs_kernel(K, p, c, κ, β, γ, n_steps, X_init, + ε_init, ε_min, ε_decay, snapshot_steps): + q = np.ones((K + 1, K + 1)) # positive Q-values, initialized to 1 + n = np.zeros((K + 1, K + 1)) # visit counts for learning rate + ε = ε_init + + n_snaps = len(snapshot_steps) + snapshots = np.zeros((n_snaps, K + 1), dtype=np.int32) + snap_idx = 0 + + # Initialize state and action + x = X_init + a = np.random.randint(0, K - x + 1) + + for t in range(n_steps): + # Record policy snapshot if needed + if snap_idx < n_snaps and t == snapshot_steps[snap_idx]: + snapshots[snap_idx] = greedy_policy_from_q_rs(q, K) + snap_idx += 1 + + # === Draw D_{t+1} and observe outcome === + d = np.random.geometric(p) - 1 + reward = min(x, d) - c * a - κ * (a > 0) + x_next = max(x - d, 0) + a + + # === Min over next state (scalar value for update target) === + # Also record the argmin action for use by the behavior policy. + best_next = np.inf + a_next = 0 + for aa in range(K - x_next + 1): + if q[x_next, aa] < best_next: + best_next = q[x_next, aa] + a_next = aa + + # === Risk-sensitive Q-learning update === + target = np.exp(-γ * reward) * best_next ** β + n[x, a] += 1 + α = 1.0 / n[x, a] ** 0.51 + q[x, a] = (1 - α) * q[x, a] + α * target + + # === Behavior policy: ε-greedy (uses a_next, the argmin action) === + x = x_next + if np.random.random() < ε: + a = np.random.randint(0, K - x + 1) + else: + a = a_next + ε = max(ε_min, ε * ε_decay) + + return q, snapshots + + +# %% [markdown] +# The wrapper function unpacks the model and provides default hyperparameters. 
+ +# %% +def q_learning_rs(model, n_steps=20_000_000, X_init=0, + ε_init=1.0, ε_min=0.01, ε_decay=0.999999, + snapshot_steps=None): + x_values, d_values, ϕ_values, p, c, κ, β, γ = model + K = len(x_values) - 1 + if snapshot_steps is None: + snapshot_steps = np.array([], dtype=np.int64) + return q_learning_rs_kernel(K, p, c, κ, β, γ, n_steps, X_init, + ε_init, ε_min, ε_decay, snapshot_steps) + + +# %% [markdown] +# ### Running Q-learning +# +# We run 20 million steps and take policy snapshots at steps 10,000, 1,000,000, and at the end. + +# %% +np.random.seed(1234) +snap_steps = np.array([10_000, 1_000_000, 19_999_999], dtype=np.int64) +q_table, snapshots = q_learning_rs(model, snapshot_steps=snap_steps) + +# %% [markdown] +# ### Comparing with the exact solution +# +# We extract the value function and policy from the final Q-table. +# +# Since Q-values represent $\mathbb{E}[\exp(-\gamma(\cdots))]$, we recover the +# value function via $v_Q(x) = -\frac{1}{\gamma} \ln(\min_{a} q(x, a))$ and the +# policy via $\sigma_Q(x) = \arg\min_a q(x, a)$. + +# %% +K = len(x_values) - 1 +γ_base = model.γ +# restrict to feasible actions a ∈ {0, ..., K-x} +v_q = np.array([-(1/γ_base) * np.log(np.min(q_table[x, :K - x + 1])) + for x in range(K + 1)]) +σ_q = np.array([np.argmin(q_table[x, :K - x + 1]) + for x in range(K + 1)]) + +# %% +fig, axes = plt.subplots(1, 2, figsize=(9.6, 3.2)) + +axes[0].plot(x_values, v_star, label="VFI") +axes[0].plot(x_values, v_q, '--', label="Q-learning") +axes[0].set_xlabel("inventory") +axes[0].set_ylabel("value") +axes[0].legend() +axes[0].set_title("Value function") + +axes[1].plot(x_values, σ_star, label="VFI") +axes[1].plot(x_values, σ_q, '--', label="Q-learning") +axes[1].set_xlabel("inventory") +axes[1].set_ylabel("order quantity") +axes[1].legend() +axes[1].set_title("Policy") + +plt.tight_layout() +plt.show() + +# %% [markdown] +# ### Visualizing learning over time +# +# The panels below show how the agent's policy evolves during training. 
+# +# Each panel simulates an inventory path using the greedy policy extracted from +# the Q-table at a given training step, with the same demand sequence throughout. +# +# The top panel shows the optimal policy from VFI for reference. + +# %% +ts_length = 200 +n_snaps = len(snap_steps) +fig, axes = plt.subplots(n_snaps + 1, 1, figsize=(8, 2.0 * (n_snaps + 1)), + sharex=True) + +X_init = K // 2 +sim_seed = 5678 + +# Optimal policy +np.random.seed(sim_seed) +X_opt = sim_inventories(ts_length, σ_star, model.p, X_init) +axes[0].plot(X_opt, alpha=0.7) +axes[0].set_ylabel("inventory") +axes[0].set_title("Optimal (VFI)") +axes[0].set_ylim(0, K + 2) + +# Q-learning snapshots +for i in range(n_snaps): + σ_snap = snapshots[i] + np.random.seed(sim_seed) + X = sim_inventories(ts_length, σ_snap, model.p, X_init) + axes[i + 1].plot(X, alpha=0.7) + axes[i + 1].set_ylabel("inventory") + axes[i + 1].set_title(f"Step {snap_steps[i]:,}") + axes[i + 1].set_ylim(0, K + 2) + +axes[-1].set_xlabel(r"$t$") +plt.tight_layout() +plt.show() + +# %% [markdown] +# After 10,000 steps the agent has barely explored and its policy is erratic. +# +# By 1,000,000 steps the learned policy begins to resemble the optimal one, and +# by step 20 million the inventory dynamics are nearly indistinguishable from the +# VFI solution. +# +# Note that the converged policy maintains lower inventory levels than in the +# risk-neutral case (compare with {ref}`inventory_q`), consistent with the +# mechanism discussed above: a risk-sensitive agent caps its exposure to demand +# variance by holding less stock. +# +# ## Conclusion +# +# We extended the inventory management problem from {ref}`inventory_q` to +# incorporate risk sensitivity via the certainty equivalent operator +# $\phi^{-1}(\mathbb{E}[\phi(\cdot)])$ with $\phi(t) = \exp(-\gamma t)$. +# +# Value function iteration confirms that risk-sensitive firms order less +# aggressively, preferring predictable profits over higher but more volatile +# returns. 
+# +# We then showed that Q-learning can be adapted to the risk-sensitive setting by +# working with the transformed Q-factor $q(x,a) = +# \mathbb{E}[\exp(-\gamma(\pi + \beta v^*))]$. +# +# The resulting update rule replaces addition with multiplication and max with +# min, but retains the key property of model-free learning: the agent needs only +# to observe states, actions, and profits. From 8aee45af1523c5aaaf646dad6adf497971598c33 Mon Sep 17 00:00:00 2001 From: John Stachurski Date: Mon, 9 Mar 2026 15:31:15 +1100 Subject: [PATCH 2/8] Add rs_inventory_q to table of contents Co-Authored-By: Claude Opus 4.6 --- lectures/_toc.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/lectures/_toc.yml b/lectures/_toc.yml index cd51e010b..f43d7f3dc 100644 --- a/lectures/_toc.yml +++ b/lectures/_toc.yml @@ -77,6 +77,7 @@ parts: numbered: true chapters: - file: inventory_q + - file: rs_inventory_q - file: mccall_q - caption: Introduction to Optimal Savings numbered: true From 9ce92390a01bc1cb719f6edc987335b036395642 Mon Sep 17 00:00:00 2001 From: John Stachurski Date: Mon, 9 Mar 2026 15:44:05 +1100 Subject: [PATCH 3/8] Remove generated ipynb and py files from tracking The build system generates notebooks from the .md source. Having .ipynb and .py files in the repo causes duplicate document warnings and cross-reference failures. Co-Authored-By: Claude Opus 4.6 --- lectures/rs_inventory_q.ipynb | 977 ---------------------------------- lectures/rs_inventory_q.py | 767 -------------------------- 2 files changed, 1744 deletions(-) delete mode 100644 lectures/rs_inventory_q.ipynb delete mode 100644 lectures/rs_inventory_q.py diff --git a/lectures/rs_inventory_q.ipynb b/lectures/rs_inventory_q.ipynb deleted file mode 100644 index 99dc4f676..000000000 --- a/lectures/rs_inventory_q.ipynb +++ /dev/null @@ -1,977 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "17be60e4", - "metadata": {}, - "source": [ - "(rs_inventory_q)=\n", - "```{raw} jupyter\n", - "
\n", - " \n", - " \"QuantEcon\"\n", - " \n", - "
\n", - "```\n", - "\n", - "# Risk-Sensitive Inventory Management via Q-Learning\n", - "\n", - "```{contents} Contents\n", - ":depth: 2\n", - "```\n", - "\n", - "## Introduction\n", - "\n", - "In {ref}`inventory_q`, we looked at an inventory management\n", - "problem and solved it with both value function iteration and Q-learning.\n", - "\n", - "In this lecture, we consider a risk-sensitive variation.\n", - "\n", - "Injection of risk-sensitivity acknowledges the fact\n", - "that, in incomplete markets with financial and informational frictions, firms\n", - "typically take risk into account in their decision making.\n", - "\n", - "In other words, the actions of firms are not, in general, risk neutral.\n", - "\n", - "One natural way to handle this is to use a risk-sensitive version of the Bellman\n", - "equation.\n", - "\n", - "We show how the model can be solved using value function iteration.\n", - "\n", - "We then investigate how risk sensitivity affects the optimal policy." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "95f2dc54", - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import numba\n", - "import matplotlib.pyplot as plt\n", - "from typing import NamedTuple" - ] - }, - { - "cell_type": "markdown", - "id": "acc004dc", - "metadata": {}, - "source": [ - "## The Model\n", - "\n", - "The Bellman equation for the inventory management problem in {ref}`inventory_q` has the form\n", - "\n", - "\n", - "$$\n", - " v(x)\n", - " = \\max_{a \\in \\Gamma(x)} \\mathbb E\n", - " \\left[\n", - " \\pi(x, a, D)\n", - " + \\beta v(h(x, a, D))\n", - " \\right].\n", - "$$\n", - "\n", - "Here $D$ is a random variable with distribution $\\phi$.\n", - "\n", - "(Primitives and definitions are the same as in {ref}`inventory_q`.)\n", - "\n", - "The risk-sensitive version of this Bellman equation has the form\n", - "\n", - "$$\n", - " v(x)\n", - " = \\max_{a \\in \\Gamma(x)}\n", - " \\phi^{-1}\n", - " \\left\\{\n", - " \\mathbb E 
\\phi\n", - " \\left[\n", - " \\pi(x, a, D)\n", - " + \\beta v(h(x, a, D))\n", - " \\right]\n", - " \\right\\},\n", - "$$\n", - "\n", - "where $\\phi(t) = \\exp(-\\gamma t)$ for fixed $\\gamma > 0$.\n", - "\n", - "Since $\\phi^{-1}(y) = -\\frac{1}{\\gamma} \\ln(y)$, the Bellman equation becomes\n", - "\n", - "$$\n", - " v(x)\n", - " = \\max_{a \\in \\Gamma(x)}\n", - " \\left(\n", - " -\\frac{1}{\\gamma}\n", - " \\right)\n", - " \\ln\n", - " \\left\\{\n", - " \\sum_d \\phi(d) \\exp\n", - " \\left[\n", - " -\\gamma \\left( \\pi(x, a, d) + \\beta \\, v(h(x, a, d)) \\right)\n", - " \\right]\n", - " \\right\\}.\n", - "$$\n", - "\n", - "Here $\\phi(d)$ denotes the demand probability mass function, as in {ref}`inventory_q`.\n", - "\n", - "The parameter $\\gamma$ controls the degree of risk sensitivity.\n", - "\n", - "As $\\gamma \\to 0$, the certainty equivalent reduces to the ordinary expectation and we recover the risk-neutral case.\n", - "\n", - "Larger $\\gamma$ means more aversion to downside risk.\n", - "\n", - "The Bellman operator, greedy policy, and VFI algorithm all carry over from the\n", - "risk-neutral case, with the expectation replaced by the certainty equivalent.\n", - "\n", - "\n", - "\n", - "## Solving via Value Function Iteration\n", - "\n", - "### Model specification\n", - "\n", - "We reuse the same model primitives as in {ref}`inventory_q`, adding $\\gamma$ as a parameter." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e7909494", - "metadata": {}, - "outputs": [], - "source": [ - "class RSModel(NamedTuple):\n", - " x_values: np.ndarray # Inventory values\n", - " d_values: np.ndarray # Demand values for summation\n", - " ϕ_values: np.ndarray # Demand probabilities\n", - " p: float # Demand parameter\n", - " c: float # Unit cost\n", - " κ: float # Fixed cost\n", - " β: float # Discount factor\n", - " γ: float # Risk-sensitivity parameter" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c8cf486f", - "metadata": {}, - "outputs": [], - "source": [ - "def create_rs_inventory_model(\n", - " K: int = 20, # Max inventory\n", - " D_MAX: int = 21, # Demand upper bound for summation\n", - " p: float = 0.7,\n", - " c: float = 0.2,\n", - " κ: float = 0.8,\n", - " β: float = 0.98,\n", - " γ: float = 1.0\n", - " ) -> RSModel:\n", - "\n", - " def demand_pdf(p, d):\n", - " return (1 - p)**d * p\n", - "\n", - " d_values = np.arange(D_MAX)\n", - " ϕ_values = demand_pdf(p, d_values)\n", - " x_values = np.arange(K + 1)\n", - "\n", - " return RSModel(x_values, d_values, ϕ_values, p, c, κ, β, γ)" - ] - }, - { - "cell_type": "markdown", - "id": "060e0b9c", - "metadata": {}, - "source": [ - "### The Bellman operator\n", - "\n", - "The risk-sensitive Bellman operator replaces the expected value with the certainty equivalent.\n", - "\n", - "For numerical stability, we use the [log-sum-exp trick](https://en.wikipedia.org/wiki/LogSumExp): given values $z_i = \\pi(x, a, d_i) + \\beta \\, v(h(x, a, d_i))$, we compute\n", - "\n", - "$$\n", - " -\\frac{1}{\\gamma} \\ln \\sum_i \\phi(d_i) \\exp(-\\gamma z_i)\n", - " \\;=\\;\n", - " -\\frac{1}{\\gamma}\n", - " \\left(\n", - " m + \\ln \\sum_i \\phi(d_i) \\exp(-\\gamma z_i - m)\n", - " \\right),\n", - "$$\n", - "\n", - "where $m = \\max_i (-\\gamma z_i)$." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8ecff396", - "metadata": {}, - "outputs": [], - "source": [ - "@numba.jit(nopython=True)\n", - "def T_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K):\n", - " new_v = np.empty(K + 1)\n", - " n_d = len(d_values)\n", - " for x in range(K + 1):\n", - " best = -np.inf\n", - " for a in range(K - x + 1):\n", - " # Compute -γ * z_i for each demand realization\n", - " exponents = np.empty(n_d)\n", - " for i in range(n_d):\n", - " d = d_values[i]\n", - " x_next = max(x - d, 0) + a\n", - " revenue = min(x, d)\n", - " cost = c * a + κ * (a > 0)\n", - " z_i = revenue - cost + β * v[x_next]\n", - " exponents[i] = -γ * z_i\n", - " # Log-sum-exp trick for numerical stability\n", - " m = np.max(exponents)\n", - " weighted_sum = 0.0\n", - " for i in range(n_d):\n", - " weighted_sum += ϕ_values[i] * np.exp(exponents[i] - m)\n", - " val = -(1.0 / γ) * (m + np.log(weighted_sum))\n", - " if val > best:\n", - " best = val\n", - " new_v[x] = best\n", - " return new_v" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7fdd1618", - "metadata": {}, - "outputs": [], - "source": [ - "def T_rs(v, model):\n", - " \"\"\"The risk-sensitive Bellman operator.\"\"\"\n", - " x_values, d_values, ϕ_values, p, c, κ, β, γ = model\n", - " K = len(x_values) - 1\n", - " return T_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K)" - ] - }, - { - "cell_type": "markdown", - "id": "03fbea31", - "metadata": {}, - "source": [ - "### Computing the greedy policy\n", - "\n", - "The greedy policy records the maximizing action instead of the maximized value." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7eeb7faf", - "metadata": {}, - "outputs": [], - "source": [ - "@numba.jit(nopython=True)\n", - "def get_greedy_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K):\n", - " σ = np.empty(K + 1, dtype=np.int32)\n", - " n_d = len(d_values)\n", - " for x in range(K + 1):\n", - " best = -np.inf\n", - " best_a = 0\n", - " for a in range(K - x + 1):\n", - " exponents = np.empty(n_d)\n", - " for i in range(n_d):\n", - " d = d_values[i]\n", - " x_next = max(x - d, 0) + a\n", - " revenue = min(x, d)\n", - " cost = c * a + κ * (a > 0)\n", - " z_i = revenue - cost + β * v[x_next]\n", - " exponents[i] = -γ * z_i\n", - " m = np.max(exponents)\n", - " weighted_sum = 0.0\n", - " for i in range(n_d):\n", - " weighted_sum += ϕ_values[i] * np.exp(exponents[i] - m)\n", - " val = -(1.0 / γ) * (m + np.log(weighted_sum))\n", - " if val > best:\n", - " best = val\n", - " best_a = a\n", - " σ[x] = best_a\n", - " return σ" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c5473245", - "metadata": {}, - "outputs": [], - "source": [ - "def get_greedy_rs(v, model):\n", - " \"\"\"Get a v-greedy policy for the risk-sensitive model.\"\"\"\n", - " x_values, d_values, ϕ_values, p, c, κ, β, γ = model\n", - " K = len(x_values) - 1\n", - " return get_greedy_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K)" - ] - }, - { - "cell_type": "markdown", - "id": "a6a3b8be", - "metadata": {}, - "source": [ - "### Value function iteration" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7ea93ea8", - "metadata": {}, - "outputs": [], - "source": [ - "def solve_rs_inventory_model(v_init, model, max_iter=10_000, tol=1e-6):\n", - " v = v_init.copy()\n", - " i, error = 0, tol + 1\n", - "\n", - " while i < max_iter and error > tol:\n", - " new_v = T_rs(v, model)\n", - " error = np.max(np.abs(new_v - v))\n", - " i += 1\n", - " v = new_v\n", - "\n", - " print(f\"Converged in {i} iterations with error 
{error:.2e}\")\n", - "\n", - " σ = get_greedy_rs(v, model)\n", - " return v, σ" - ] - }, - { - "cell_type": "markdown", - "id": "962b1a57", - "metadata": {}, - "source": [ - "### Creating and solving an instance" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f6fe6712", - "metadata": {}, - "outputs": [], - "source": [ - "model = create_rs_inventory_model()\n", - "x_values = model.x_values\n", - "n_x = len(x_values)\n", - "v_init = np.zeros(n_x)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3dc85cec", - "metadata": {}, - "outputs": [], - "source": [ - "v_star, σ_star = solve_rs_inventory_model(v_init, model)" - ] - }, - { - "cell_type": "markdown", - "id": "24295d26", - "metadata": {}, - "source": [ - "### Effect of risk sensitivity on the optimal policy\n", - "\n", - "We solve the model for several values of $\\gamma$ and compare the resulting policies.\n", - "\n", - "As we will see, a risk-sensitive firm orders less aggressively than a risk-neutral one." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c599f91f", - "metadata": {}, - "outputs": [], - "source": [ - "γ_values = [0.01, 1.0, 2.0]\n", - "results = {}\n", - "\n", - "for γ in γ_values:\n", - " mod = create_rs_inventory_model(γ=γ)\n", - " v, σ = solve_rs_inventory_model(np.zeros(n_x), mod)\n", - " results[γ] = (v, σ)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bfb69b17", - "metadata": {}, - "outputs": [], - "source": [ - "fig, axes = plt.subplots(1, 2, figsize=(9.6, 3.2))\n", - "\n", - "for γ in γ_values:\n", - " v, σ = results[γ]\n", - " axes[0].plot(x_values, v, label=f\"$\\\\gamma = {γ}$\")\n", - " axes[1].plot(x_values, σ, label=f\"$\\\\gamma = {γ}$\")\n", - "\n", - "axes[0].set_xlabel(\"inventory\")\n", - "axes[0].set_ylabel(\"value\")\n", - "axes[0].legend()\n", - "axes[0].set_title(\"Value function\")\n", - "\n", - "axes[1].set_xlabel(\"inventory\")\n", - "axes[1].set_ylabel(\"order quantity\")\n", - "axes[1].legend()\n", - "axes[1].set_title(\"Policy\")\n", - "\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": "55ed320c", - "metadata": {}, - "source": [ - "### Simulating the optimal policy\n", - "\n", - "We simulate inventory dynamics under the optimal policy for the baseline $\\gamma$." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7bfdd38c", - "metadata": {}, - "outputs": [], - "source": [ - "@numba.jit(nopython=True)\n", - "def sim_inventories(ts_length, σ, p, X_init=0):\n", - " \"\"\"Simulate inventory dynamics under policy σ.\"\"\"\n", - " X = np.zeros(ts_length, dtype=np.int32)\n", - " X[0] = X_init\n", - " for t in range(ts_length - 1):\n", - " d = np.random.geometric(p) - 1\n", - " X[t+1] = max(X[t] - d, 0) + σ[X[t]]\n", - " return X" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "993bf7d8", - "metadata": {}, - "outputs": [], - "source": [ - "fig, axes = plt.subplots(len(γ_values), 1,\n", - " figsize=(8, 2.0 * len(γ_values)),\n", - " sharex=True)\n", - "\n", - "ts_length = 200\n", - "sim_seed = 5678\n", - "K = len(x_values) - 1\n", - "\n", - "for i, γ in enumerate(γ_values):\n", - " v, σ = results[γ]\n", - " np.random.seed(sim_seed)\n", - " X = sim_inventories(ts_length, σ, model.p, X_init=K // 2)\n", - " axes[i].plot(X, alpha=0.7)\n", - " axes[i].set_ylabel(\"inventory\")\n", - " axes[i].set_title(f\"$\\\\gamma = {γ}$\")\n", - " axes[i].set_ylim(0, K + 2)\n", - "\n", - "axes[-1].set_xlabel(r\"$t$\")\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": "f33d501a", - "metadata": {}, - "source": [ - "## Interpreting the Outcomes\n", - "\n", - "The plots above show that a more risk-sensitive firm (larger $\\gamma$) orders\n", - "less inventory and maintains lower stock levels.\n", - "\n", - "At first glance this may seem surprising: wouldn't holding more inventory\n", - "reduce variance by ensuring the firm can always meet demand?\n", - "\n", - "The key is to identify where the randomness in profits actually comes from.\n", - "\n", - "Recall that per-period profit is $\\pi(x, a, d) = \\min(x, d) - ca - \\kappa\n", - "\\mathbf{1}\\{a > 0\\}$.\n", - "\n", - "The ordering cost $ca + \\kappa \\mathbf{1}\\{a > 0\\}$ is **deterministic** — it\n", - "is chosen 
before the demand shock is realized.\n", - "\n", - "So higher ordering shifts the level of profits down but does not affect their\n", - "variance.\n", - "\n", - "The variance comes from **revenue**: $\\min(x, D)$.\n", - "\n", - "When inventory $x$ is high, $\\min(x, D) \\approx D$ for most demand\n", - "realizations — revenue inherits the full variance of demand.\n", - "\n", - "When inventory $x$ is low, $\\min(x, D) \\approx x$ for most realizations —\n", - "revenue is nearly deterministic, capped at the inventory level.\n", - "\n", - "A risk-sensitive agent therefore prefers lower inventory because it **caps the\n", - "randomness of revenue**.\n", - "\n", - "The agent accepts lower expected sales in exchange for more predictable profits.\n", - "\n", - "There is also a continuation value channel: next-period inventory $\\max(x - D,\n", - "0) + a$ varies with $D$, and higher $x$ means $x - D$ tracks $D$ more\n", - "closely, propagating that variance forward through $v$.\n", - "\n", - "\n", - "## Q-Learning\n", - "\n", - "We now ask whether the optimal policy can be learned without knowledge of the\n", - "model, as we did in the risk-neutral case in {ref}`inventory_q`.\n", - "\n", - "### The Q-factor\n", - "\n", - "The first step is to define the Q-factor in a way that is compatible with the\n", - "risk-sensitive Bellman equation.\n", - "\n", - "We define\n", - "\n", - "$$\n", - " q(x, a) := \\mathbb E\n", - " \\left[\n", - " \\exp\\!\\left(\n", - " -\\gamma \\left( \\pi(x, a, D) + \\beta \\, v^*(h(x, a, D)) \\right)\n", - " \\right)\n", - " \\right].\n", - "$$\n", - "\n", - "In words, $q(x, a)$ applies the risk-sensitivity transformation $\\phi(t) =\n", - "\\exp(-\\gamma t)$ inside the expectation, evaluated at the return from taking\n", - "action $a$ in state $x$ and following the optimal policy thereafter.\n", - "\n", - "### Deriving the Q-factor Bellman equation\n", - "\n", - "Our goal is to obtain a fixed point equation in $q$ alone, eliminating $v^*$.\n", - 
"\n", - "**Step 1.** Express $v^*$ in terms of $q$.\n", - "\n", - "The risk-sensitive Bellman equation says $v^*(x) = \\max_{a \\in \\Gamma(x)}\n", - "\\phi^{-1}(q(x, a))$.\n", - "\n", - "Since $\\phi^{-1}(y) = -\\frac{1}{\\gamma} \\ln(y)$ is a **decreasing** function,\n", - "the maximum over $a$ of $\\phi^{-1}(q(x, a))$ corresponds to the **minimum**\n", - "over $a$ of $q(x, a)$:\n", - "\n", - "$$\n", - " v^*(x)\n", - " = \\phi^{-1}\\!\\left(\\min_{a \\in \\Gamma(x)} q(x, a)\\right)\n", - " = -\\frac{1}{\\gamma} \\ln\\!\\left(\\min_{a \\in \\Gamma(x)} q(x, a)\\right).\n", - "$$\n", - "\n", - "Equivalently,\n", - "\n", - "$$\n", - " \\exp(-\\gamma \\, v^*(x)) = \\min_{a \\in \\Gamma(x)} q(x, a).\n", - "$$\n", - "\n", - "**Step 2.** Substitute back into the definition of $q$ to eliminate $v^*$.\n", - "\n", - "Expanding the exponential in the definition of $q$,\n", - "\n", - "$$\n", - " q(x, a)\n", - " = \\mathbb E\n", - " \\left[\n", - " \\exp(-\\gamma \\, \\pi(x, a, D))\n", - " \\;\\cdot\\;\n", - " \\exp\\!\\left(-\\gamma \\beta \\, v^*(x')\\right)\n", - " \\right]\n", - "$$\n", - "\n", - "where $x' = h(x, a, D)$.\n", - "\n", - "From Step 1, $\\exp(-\\gamma \\, v^*(x')) = \\min_{a' \\in \\Gamma(x')} q(x', a')$,\n", - "so $\\exp(-\\gamma \\beta \\, v^*(x')) = \\left[\\min_{a' \\in \\Gamma(x')} q(x',\n", - "a')\\right]^\\beta$.\n", - "\n", - "Substituting,\n", - "\n", - "$$\n", - " q(x, a)\n", - " = \\mathbb E\n", - " \\left[\n", - " \\exp(-\\gamma \\, \\pi(x, a, D))\n", - " \\;\\cdot\\;\n", - " \\left(\\min_{a' \\in \\Gamma(x')} q(x', a')\\right)^\\beta\n", - " \\right].\n", - "$$\n", - "\n", - "This is a fixed point equation in $q$ alone — $v^*$ has been eliminated.\n", - "\n", - "### The Q-learning update rule\n", - "\n", - "As in the risk-neutral case, we approximate the fixed point using stochastic\n", - "approximation.\n", - "\n", - "At each step, the agent is in state $x$, takes action $a$, observes profit\n", - "$R_{t+1} = \\pi(x, a, D_{t+1})$ and next state 
$X_{t+1} = h(x, a, D_{t+1})$,\n", - "and updates\n", - "\n", - "$$\n", - " q_{t+1}(x, a)\n", - " = (1 - \\alpha_t) \\, q_t(x, a)\n", - " + \\alpha_t\n", - " \\left[\n", - " \\exp(-\\gamma \\, R_{t+1})\n", - " \\;\\cdot\\;\n", - " \\left(\\min_{a' \\in \\Gamma(X_{t+1})} q_t(X_{t+1}, a')\\right)^\\beta\n", - " \\right].\n", - "$$\n", - "\n", - "The term in brackets is a single-sample estimate of the right-hand side of the\n", - "Q-factor Bellman equation.\n", - "\n", - "The update blends the current estimate with this fresh sample, just as in\n", - "standard Q-learning.\n", - "\n", - "Notice several differences from the risk-neutral case:\n", - "\n", - "- The Q-values are **positive** (expectations of exponentials) rather than signed.\n", - "- The optimal policy is $\\sigma(x) = \\arg\\min_a q(x, a)$ — we **minimize**\n", - " rather than maximize, because $\\phi^{-1}$ is decreasing.\n", - "- The observed profit enters through $\\exp(-\\gamma R_{t+1})$ rather than\n", - " additively.\n", - "- The continuation value enters as a **power** $(\\min_{a'} q_t)^\\beta$ rather\n", - " than a scaled sum $\\beta \\cdot \\max_{a'} q_t$.\n", - "\n", - "As before, the agent needs only to observe $x$, $a$, $R_{t+1}$, and\n", - "$X_{t+1}$ — no model knowledge is required.\n", - "\n", - "### Implementation plan\n", - "\n", - "Our implementation follows the same structure as the risk-neutral Q-learning in\n", - "{ref}`inventory_q`, with the modifications above:\n", - "\n", - "1. **Initialize** the Q-table $q$ to ones (since Q-values are positive) and\n", - " visit counts $n$ to zeros.\n", - "2. 
**At each step:**\n", - " - Draw demand $D_{t+1}$ and compute observed profit $R_{t+1}$ and next state\n", - " $X_{t+1}$.\n", - " - Compute $\\min_{a'} q_t(X_{t+1}, a')$ over feasible actions (this is a\n", - " scalar for the update target, and the $\\arg\\min$ action is used by the\n", - " $\\varepsilon$-greedy behavior policy).\n", - " - Update $q_t(x, a)$ using the rule above, with learning rate\n", - " $\\alpha_t = 1 / n_t(x, a)^{0.51}$.\n", - " - Choose the next action via $\\varepsilon$-greedy: with probability\n", - " $\\varepsilon$ pick a random feasible action, otherwise pick the\n", - " $\\arg\\min$ action.\n", - " - Decay $\\varepsilon$.\n", - "3. **Extract the greedy policy** from the final Q-table via\n", - " $\\sigma(x) = \\arg\\min_{a \\in \\Gamma(x)} q(x, a)$.\n", - "4. **Compare** the learned policy against the VFI solution.\n", - "\n", - "### Implementation\n", - "\n", - "We first define a helper to extract the greedy policy from the Q-table.\n", - "\n", - "Since the optimal policy minimizes $q$, we use $\\arg\\min$ rather than $\\arg\\max$." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ed938977", - "metadata": {}, - "outputs": [], - "source": [ - "@numba.jit(nopython=True)\n", - "def greedy_policy_from_q_rs(q, K):\n", - " \"\"\"Extract greedy policy from risk-sensitive Q-table (argmin).\"\"\"\n", - " σ = np.empty(K + 1, dtype=np.int32)\n", - " for x in range(K + 1):\n", - " best_val = np.inf\n", - " best_a = 0\n", - " for a in range(K - x + 1):\n", - " if q[x, a] < best_val:\n", - " best_val = q[x, a]\n", - " best_a = a\n", - " σ[x] = best_a\n", - " return σ" - ] - }, - { - "cell_type": "markdown", - "id": "db9043ac", - "metadata": {}, - "source": [ - "The Q-learning loop mirrors the risk-neutral version, with the key changes:\n", - "Q-table initialized to ones, the update target uses $\\exp(-\\gamma R_{t+1})\n", - "\\cdot (\\min_{a'} q_t)^\\beta$, and the behavior policy follows the $\\arg\\min$." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "73fcd434", - "metadata": {}, - "outputs": [], - "source": [ - "@numba.jit(nopython=True)\n", - "def q_learning_rs_kernel(K, p, c, κ, β, γ, n_steps, X_init,\n", - " ε_init, ε_min, ε_decay, snapshot_steps):\n", - " q = np.ones((K + 1, K + 1)) # positive Q-values, initialized to 1\n", - " n = np.zeros((K + 1, K + 1)) # visit counts for learning rate\n", - " ε = ε_init\n", - "\n", - " n_snaps = len(snapshot_steps)\n", - " snapshots = np.zeros((n_snaps, K + 1), dtype=np.int32)\n", - " snap_idx = 0\n", - "\n", - " # Initialize state and action\n", - " x = X_init\n", - " a = np.random.randint(0, K - x + 1)\n", - "\n", - " for t in range(n_steps):\n", - " # Record policy snapshot if needed\n", - " if snap_idx < n_snaps and t == snapshot_steps[snap_idx]:\n", - " snapshots[snap_idx] = greedy_policy_from_q_rs(q, K)\n", - " snap_idx += 1\n", - "\n", - " # === Draw D_{t+1} and observe outcome ===\n", - " d = np.random.geometric(p) - 1\n", - " reward = min(x, d) - c * a - κ * (a > 0)\n", - " x_next = max(x - d, 0) + a\n", - "\n", - " # === Min over next state (scalar value for update target) ===\n", - " # Also record the argmin action for use by the behavior policy.\n", - " best_next = np.inf\n", - " a_next = 0\n", - " for aa in range(K - x_next + 1):\n", - " if q[x_next, aa] < best_next:\n", - " best_next = q[x_next, aa]\n", - " a_next = aa\n", - "\n", - " # === Risk-sensitive Q-learning update ===\n", - " target = np.exp(-γ * reward) * best_next ** β\n", - " n[x, a] += 1\n", - " α = 1.0 / n[x, a] ** 0.51\n", - " q[x, a] = (1 - α) * q[x, a] + α * target\n", - "\n", - " # === Behavior policy: ε-greedy (uses a_next, the argmin action) ===\n", - " x = x_next\n", - " if np.random.random() < ε:\n", - " a = np.random.randint(0, K - x + 1)\n", - " else:\n", - " a = a_next\n", - " ε = max(ε_min, ε * ε_decay)\n", - "\n", - " return q, snapshots" - ] - }, - { - "cell_type": "markdown", - "id": "72492777", - 
"metadata": {}, - "source": [ - "The wrapper function unpacks the model and provides default hyperparameters." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8cd9e8c5", - "metadata": {}, - "outputs": [], - "source": [ - "def q_learning_rs(model, n_steps=20_000_000, X_init=0,\n", - " ε_init=1.0, ε_min=0.01, ε_decay=0.999999,\n", - " snapshot_steps=None):\n", - " x_values, d_values, ϕ_values, p, c, κ, β, γ = model\n", - " K = len(x_values) - 1\n", - " if snapshot_steps is None:\n", - " snapshot_steps = np.array([], dtype=np.int64)\n", - " return q_learning_rs_kernel(K, p, c, κ, β, γ, n_steps, X_init,\n", - " ε_init, ε_min, ε_decay, snapshot_steps)" - ] - }, - { - "cell_type": "markdown", - "id": "9b095eef", - "metadata": {}, - "source": [ - "### Running Q-learning\n", - "\n", - "We run 20 million steps and take policy snapshots at steps 10,000, 1,000,000, and at the end." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "38386ce1", - "metadata": {}, - "outputs": [], - "source": [ - "np.random.seed(1234)\n", - "snap_steps = np.array([10_000, 1_000_000, 19_999_999], dtype=np.int64)\n", - "q_table, snapshots = q_learning_rs(model, snapshot_steps=snap_steps)" - ] - }, - { - "cell_type": "markdown", - "id": "985c208b", - "metadata": {}, - "source": [ - "### Comparing with the exact solution\n", - "\n", - "We extract the value function and policy from the final Q-table.\n", - "\n", - "Since Q-values represent $\\mathbb{E}[\\exp(-\\gamma(\\cdots))]$, we recover the\n", - "value function via $v_Q(x) = -\\frac{1}{\\gamma} \\ln(\\min_{a} q(x, a))$ and the\n", - "policy via $\\sigma_Q(x) = \\arg\\min_a q(x, a)$." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dccdb8f4", - "metadata": {}, - "outputs": [], - "source": [ - "K = len(x_values) - 1\n", - "γ_base = model.γ\n", - "# restrict to feasible actions a ∈ {0, ..., K-x}\n", - "v_q = np.array([-(1/γ_base) * np.log(np.min(q_table[x, :K - x + 1]))\n", - " for x in range(K + 1)])\n", - "σ_q = np.array([np.argmin(q_table[x, :K - x + 1])\n", - " for x in range(K + 1)])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8a855f91", - "metadata": {}, - "outputs": [], - "source": [ - "fig, axes = plt.subplots(1, 2, figsize=(9.6, 3.2))\n", - "\n", - "axes[0].plot(x_values, v_star, label=\"VFI\")\n", - "axes[0].plot(x_values, v_q, '--', label=\"Q-learning\")\n", - "axes[0].set_xlabel(\"inventory\")\n", - "axes[0].set_ylabel(\"value\")\n", - "axes[0].legend()\n", - "axes[0].set_title(\"Value function\")\n", - "\n", - "axes[1].plot(x_values, σ_star, label=\"VFI\")\n", - "axes[1].plot(x_values, σ_q, '--', label=\"Q-learning\")\n", - "axes[1].set_xlabel(\"inventory\")\n", - "axes[1].set_ylabel(\"order quantity\")\n", - "axes[1].legend()\n", - "axes[1].set_title(\"Policy\")\n", - "\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": "8579f07d", - "metadata": {}, - "source": [ - "### Visualizing learning over time\n", - "\n", - "The panels below show how the agent's policy evolves during training.\n", - "\n", - "Each panel simulates an inventory path using the greedy policy extracted from\n", - "the Q-table at a given training step, with the same demand sequence throughout.\n", - "\n", - "The top panel shows the optimal policy from VFI for reference." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "142d3906", - "metadata": {}, - "outputs": [], - "source": [ - "ts_length = 200\n", - "n_snaps = len(snap_steps)\n", - "fig, axes = plt.subplots(n_snaps + 1, 1, figsize=(8, 2.0 * (n_snaps + 1)),\n", - " sharex=True)\n", - "\n", - "X_init = K // 2\n", - "sim_seed = 5678\n", - "\n", - "# Optimal policy\n", - "np.random.seed(sim_seed)\n", - "X_opt = sim_inventories(ts_length, σ_star, model.p, X_init)\n", - "axes[0].plot(X_opt, alpha=0.7)\n", - "axes[0].set_ylabel(\"inventory\")\n", - "axes[0].set_title(\"Optimal (VFI)\")\n", - "axes[0].set_ylim(0, K + 2)\n", - "\n", - "# Q-learning snapshots\n", - "for i in range(n_snaps):\n", - " σ_snap = snapshots[i]\n", - " np.random.seed(sim_seed)\n", - " X = sim_inventories(ts_length, σ_snap, model.p, X_init)\n", - " axes[i + 1].plot(X, alpha=0.7)\n", - " axes[i + 1].set_ylabel(\"inventory\")\n", - " axes[i + 1].set_title(f\"Step {snap_steps[i]:,}\")\n", - " axes[i + 1].set_ylim(0, K + 2)\n", - "\n", - "axes[-1].set_xlabel(r\"$t$\")\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": "274e77d9", - "metadata": {}, - "source": [ - "After 10,000 steps the agent has barely explored and its policy is erratic.\n", - "\n", - "By 1,000,000 steps the learned policy begins to resemble the optimal one, and\n", - "by step 20 million the inventory dynamics are nearly indistinguishable from the\n", - "VFI solution.\n", - "\n", - "Note that the converged policy maintains lower inventory levels than in the\n", - "risk-neutral case (compare with {ref}`inventory_q`), consistent with the\n", - "mechanism discussed above: a risk-sensitive agent caps its exposure to demand\n", - "variance by holding less stock.\n", - "\n", - "## Conclusion\n", - "\n", - "We extended the inventory management problem from {ref}`inventory_q` to\n", - "incorporate risk sensitivity via the certainty equivalent operator\n", - 
"$\\phi^{-1}(\\mathbb{E}[\\phi(\\cdot)])$ with $\\phi(t) = \\exp(-\\gamma t)$.\n", - "\n", - "Value function iteration confirms that risk-sensitive firms order less\n", - "aggressively, preferring predictable profits over higher but more volatile\n", - "returns.\n", - "\n", - "We then showed that Q-learning can be adapted to the risk-sensitive setting by\n", - "working with the transformed Q-factor $q(x,a) =\n", - "\\mathbb{E}[\\exp(-\\gamma(\\pi + \\beta v^*))]$.\n", - "\n", - "The resulting update rule replaces addition with multiplication and max with\n", - "min, but retains the key property of model-free learning: the agent needs only\n", - "to observe states, actions, and profits." - ] - } - ], - "metadata": { - "jupytext": { - "default_lexer": "ipython3" - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/lectures/rs_inventory_q.py b/lectures/rs_inventory_q.py deleted file mode 100644 index 82a1ba74f..000000000 --- a/lectures/rs_inventory_q.py +++ /dev/null @@ -1,767 +0,0 @@ -# --- -# jupyter: -# jupytext: -# default_lexer: ipython3 -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.18.1 -# kernelspec: -# display_name: Python 3 (ipykernel) -# language: python -# name: python3 -# --- - -# %% [markdown] -# (rs_inventory_q)= -# ```{raw} jupyter -#
-# -# QuantEcon -# -#
-# ``` -# -# # Risk-Sensitive Inventory Management via Q-Learning -# -# ```{contents} Contents -# :depth: 2 -# ``` -# -# ## Introduction -# -# In {ref}`inventory_q`, we looked at an inventory management -# problem and solved it with both value function iteration and Q-learning. -# -# In this lecture, we consider a risk-sensitive variation. -# -# Injection of risk-sensitivity acknowledges the fact -# that, in incomplete markets with financial and informational frictions, firms -# typically take risk into account in their decision making. -# -# In other words, the actions of firms are not, in general, risk neutral. -# -# One natural way to handle this is to use a risk-sensitive version of the Bellman -# equation. -# -# We show how the model can be solved using value function iteration. -# -# We then investigate how risk sensitivity affects the optimal policy. - -# %% -import numpy as np -import numba -import matplotlib.pyplot as plt -from typing import NamedTuple - - -# %% [markdown] -# ## The Model -# -# The Bellman equation for the inventory management problem in {ref}`inventory_q` has the form -# -# -# $$ -# v(x) -# = \max_{a \in \Gamma(x)} \mathbb E -# \left[ -# \pi(x, a, D) -# + \beta v(h(x, a, D)) -# \right]. -# $$ -# -# Here $D$ is a random variable with distribution $\phi$. -# -# (Primitives and definitions are the same as in {ref}`inventory_q`.) -# -# The risk-sensitive version of this Bellman equation has the form -# -# $$ -# v(x) -# = \max_{a \in \Gamma(x)} -# \phi^{-1} -# \left\{ -# \mathbb E \phi -# \left[ -# \pi(x, a, D) -# + \beta v(h(x, a, D)) -# \right] -# \right\}, -# $$ -# -# where $\phi(t) = \exp(-\gamma t)$ for fixed $\gamma > 0$. -# -# Since $\phi^{-1}(y) = -\frac{1}{\gamma} \ln(y)$, the Bellman equation becomes -# -# $$ -# v(x) -# = \max_{a \in \Gamma(x)} -# \left( -# -\frac{1}{\gamma} -# \right) -# \ln -# \left\{ -# \sum_d \phi(d) \exp -# \left[ -# -\gamma \left( \pi(x, a, d) + \beta \, v(h(x, a, d)) \right) -# \right] -# \right\}. 
-# $$ -# -# Here $\phi(d)$ denotes the demand probability mass function, as in {ref}`inventory_q`. -# -# The parameter $\gamma$ controls the degree of risk sensitivity. -# -# As $\gamma \to 0$, the certainty equivalent reduces to the ordinary expectation and we recover the risk-neutral case. -# -# Larger $\gamma$ means more aversion to downside risk. -# -# The Bellman operator, greedy policy, and VFI algorithm all carry over from the -# risk-neutral case, with the expectation replaced by the certainty equivalent. -# -# -# -# ## Solving via Value Function Iteration -# -# ### Model specification -# -# We reuse the same model primitives as in {ref}`inventory_q`, adding $\gamma$ as a parameter. - -# %% -class RSModel(NamedTuple): - x_values: np.ndarray # Inventory values - d_values: np.ndarray # Demand values for summation - ϕ_values: np.ndarray # Demand probabilities - p: float # Demand parameter - c: float # Unit cost - κ: float # Fixed cost - β: float # Discount factor - γ: float # Risk-sensitivity parameter - - -# %% -def create_rs_inventory_model( - K: int = 20, # Max inventory - D_MAX: int = 21, # Demand upper bound for summation - p: float = 0.7, - c: float = 0.2, - κ: float = 0.8, - β: float = 0.98, - γ: float = 1.0 - ) -> RSModel: - - def demand_pdf(p, d): - return (1 - p)**d * p - - d_values = np.arange(D_MAX) - ϕ_values = demand_pdf(p, d_values) - x_values = np.arange(K + 1) - - return RSModel(x_values, d_values, ϕ_values, p, c, κ, β, γ) - - -# %% [markdown] -# ### The Bellman operator -# -# The risk-sensitive Bellman operator replaces the expected value with the certainty equivalent. 
-# -# For numerical stability, we use the [log-sum-exp trick](https://en.wikipedia.org/wiki/LogSumExp): given values $z_i = \pi(x, a, d_i) + \beta \, v(h(x, a, d_i))$, we compute -# -# $$ -# -\frac{1}{\gamma} \ln \sum_i \phi(d_i) \exp(-\gamma z_i) -# \;=\; -# -\frac{1}{\gamma} -# \left( -# m + \ln \sum_i \phi(d_i) \exp(-\gamma z_i - m) -# \right), -# $$ -# -# where $m = \max_i (-\gamma z_i)$. - -# %% -@numba.jit(nopython=True) -def T_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K): - new_v = np.empty(K + 1) - n_d = len(d_values) - for x in range(K + 1): - best = -np.inf - for a in range(K - x + 1): - # Compute -γ * z_i for each demand realization - exponents = np.empty(n_d) - for i in range(n_d): - d = d_values[i] - x_next = max(x - d, 0) + a - revenue = min(x, d) - cost = c * a + κ * (a > 0) - z_i = revenue - cost + β * v[x_next] - exponents[i] = -γ * z_i - # Log-sum-exp trick for numerical stability - m = np.max(exponents) - weighted_sum = 0.0 - for i in range(n_d): - weighted_sum += ϕ_values[i] * np.exp(exponents[i] - m) - val = -(1.0 / γ) * (m + np.log(weighted_sum)) - if val > best: - best = val - new_v[x] = best - return new_v - - -# %% -def T_rs(v, model): - """The risk-sensitive Bellman operator.""" - x_values, d_values, ϕ_values, p, c, κ, β, γ = model - K = len(x_values) - 1 - return T_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K) - - -# %% [markdown] -# ### Computing the greedy policy -# -# The greedy policy records the maximizing action instead of the maximized value. 
- -# %% -@numba.jit(nopython=True) -def get_greedy_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K): - σ = np.empty(K + 1, dtype=np.int32) - n_d = len(d_values) - for x in range(K + 1): - best = -np.inf - best_a = 0 - for a in range(K - x + 1): - exponents = np.empty(n_d) - for i in range(n_d): - d = d_values[i] - x_next = max(x - d, 0) + a - revenue = min(x, d) - cost = c * a + κ * (a > 0) - z_i = revenue - cost + β * v[x_next] - exponents[i] = -γ * z_i - m = np.max(exponents) - weighted_sum = 0.0 - for i in range(n_d): - weighted_sum += ϕ_values[i] * np.exp(exponents[i] - m) - val = -(1.0 / γ) * (m + np.log(weighted_sum)) - if val > best: - best = val - best_a = a - σ[x] = best_a - return σ - - -# %% -def get_greedy_rs(v, model): - """Get a v-greedy policy for the risk-sensitive model.""" - x_values, d_values, ϕ_values, p, c, κ, β, γ = model - K = len(x_values) - 1 - return get_greedy_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K) - - -# %% [markdown] -# ### Value function iteration - -# %% -def solve_rs_inventory_model(v_init, model, max_iter=10_000, tol=1e-6): - v = v_init.copy() - i, error = 0, tol + 1 - - while i < max_iter and error > tol: - new_v = T_rs(v, model) - error = np.max(np.abs(new_v - v)) - i += 1 - v = new_v - - print(f"Converged in {i} iterations with error {error:.2e}") - - σ = get_greedy_rs(v, model) - return v, σ - - -# %% [markdown] -# ### Creating and solving an instance - -# %% -model = create_rs_inventory_model() -x_values = model.x_values -n_x = len(x_values) -v_init = np.zeros(n_x) - -# %% -v_star, σ_star = solve_rs_inventory_model(v_init, model) - -# %% [markdown] -# ### Effect of risk sensitivity on the optimal policy -# -# We solve the model for several values of $\gamma$ and compare the resulting policies. -# -# As we will see, a risk-sensitive firm orders less aggressively than a risk-neutral one. 
- -# %% -γ_values = [0.01, 1.0, 2.0] -results = {} - -for γ in γ_values: - mod = create_rs_inventory_model(γ=γ) - v, σ = solve_rs_inventory_model(np.zeros(n_x), mod) - results[γ] = (v, σ) - -# %% -fig, axes = plt.subplots(1, 2, figsize=(9.6, 3.2)) - -for γ in γ_values: - v, σ = results[γ] - axes[0].plot(x_values, v, label=f"$\\gamma = {γ}$") - axes[1].plot(x_values, σ, label=f"$\\gamma = {γ}$") - -axes[0].set_xlabel("inventory") -axes[0].set_ylabel("value") -axes[0].legend() -axes[0].set_title("Value function") - -axes[1].set_xlabel("inventory") -axes[1].set_ylabel("order quantity") -axes[1].legend() -axes[1].set_title("Policy") - -plt.tight_layout() -plt.show() - - -# %% [markdown] -# ### Simulating the optimal policy -# -# We simulate inventory dynamics under the optimal policy for the baseline $\gamma$. - -# %% -@numba.jit(nopython=True) -def sim_inventories(ts_length, σ, p, X_init=0): - """Simulate inventory dynamics under policy σ.""" - X = np.zeros(ts_length, dtype=np.int32) - X[0] = X_init - for t in range(ts_length - 1): - d = np.random.geometric(p) - 1 - X[t+1] = max(X[t] - d, 0) + σ[X[t]] - return X - - -# %% -fig, axes = plt.subplots(len(γ_values), 1, - figsize=(8, 2.0 * len(γ_values)), - sharex=True) - -ts_length = 200 -sim_seed = 5678 -K = len(x_values) - 1 - -for i, γ in enumerate(γ_values): - v, σ = results[γ] - np.random.seed(sim_seed) - X = sim_inventories(ts_length, σ, model.p, X_init=K // 2) - axes[i].plot(X, alpha=0.7) - axes[i].set_ylabel("inventory") - axes[i].set_title(f"$\\gamma = {γ}$") - axes[i].set_ylim(0, K + 2) - -axes[-1].set_xlabel(r"$t$") -plt.tight_layout() -plt.show() - - -# %% [markdown] -# ## Interpreting the Outcomes -# -# The plots above show that a more risk-sensitive firm (larger $\gamma$) orders -# less inventory and maintains lower stock levels. -# -# At first glance this may seem surprising: wouldn't holding more inventory -# reduce variance by ensuring the firm can always meet demand? 
-# -# The key is to identify where the randomness in profits actually comes from. -# -# Recall that per-period profit is $\pi(x, a, d) = \min(x, d) - ca - \kappa -# \mathbf{1}\{a > 0\}$. -# -# The ordering cost $ca + \kappa \mathbf{1}\{a > 0\}$ is **deterministic** — it -# is chosen before the demand shock is realized. -# -# So higher ordering shifts the level of profits down but does not affect their -# variance. -# -# The variance comes from **revenue**: $\min(x, D)$. -# -# When inventory $x$ is high, $\min(x, D) \approx D$ for most demand -# realizations — revenue inherits the full variance of demand. -# -# When inventory $x$ is low, $\min(x, D) \approx x$ for most realizations — -# revenue is nearly deterministic, capped at the inventory level. -# -# A risk-sensitive agent therefore prefers lower inventory because it **caps the -# randomness of revenue**. -# -# The agent accepts lower expected sales in exchange for more predictable profits. -# -# There is also a continuation value channel: next-period inventory $\max(x - D, -# 0) + a$ varies with $D$, and higher $x$ means $x - D$ tracks $D$ more -# closely, propagating that variance forward through $v$. -# -# -# ## Q-Learning -# -# We now ask whether the optimal policy can be learned without knowledge of the -# model, as we did in the risk-neutral case in {ref}`inventory_q`. -# -# ### The Q-factor -# -# The first step is to define the Q-factor in a way that is compatible with the -# risk-sensitive Bellman equation. -# -# We define -# -# $$ -# q(x, a) := \mathbb E -# \left[ -# \exp\!\left( -# -\gamma \left( \pi(x, a, D) + \beta \, v^*(h(x, a, D)) \right) -# \right) -# \right]. -# $$ -# -# In words, $q(x, a)$ applies the risk-sensitivity transformation $\phi(t) = -# \exp(-\gamma t)$ inside the expectation, evaluated at the return from taking -# action $a$ in state $x$ and following the optimal policy thereafter. 
-# -# ### Deriving the Q-factor Bellman equation -# -# Our goal is to obtain a fixed point equation in $q$ alone, eliminating $v^*$. -# -# **Step 1.** Express $v^*$ in terms of $q$. -# -# The risk-sensitive Bellman equation says $v^*(x) = \max_{a \in \Gamma(x)} -# \phi^{-1}(q(x, a))$. -# -# Since $\phi^{-1}(y) = -\frac{1}{\gamma} \ln(y)$ is a **decreasing** function, -# the maximum over $a$ of $\phi^{-1}(q(x, a))$ corresponds to the **minimum** -# over $a$ of $q(x, a)$: -# -# $$ -# v^*(x) -# = \phi^{-1}\!\left(\min_{a \in \Gamma(x)} q(x, a)\right) -# = -\frac{1}{\gamma} \ln\!\left(\min_{a \in \Gamma(x)} q(x, a)\right). -# $$ -# -# Equivalently, -# -# $$ -# \exp(-\gamma \, v^*(x)) = \min_{a \in \Gamma(x)} q(x, a). -# $$ -# -# **Step 2.** Substitute back into the definition of $q$ to eliminate $v^*$. -# -# Expanding the exponential in the definition of $q$, -# -# $$ -# q(x, a) -# = \mathbb E -# \left[ -# \exp(-\gamma \, \pi(x, a, D)) -# \;\cdot\; -# \exp\!\left(-\gamma \beta \, v^*(x')\right) -# \right] -# $$ -# -# where $x' = h(x, a, D)$. -# -# From Step 1, $\exp(-\gamma \, v^*(x')) = \min_{a' \in \Gamma(x')} q(x', a')$, -# so $\exp(-\gamma \beta \, v^*(x')) = \left[\min_{a' \in \Gamma(x')} q(x', -# a')\right]^\beta$. -# -# Substituting, -# -# $$ -# q(x, a) -# = \mathbb E -# \left[ -# \exp(-\gamma \, \pi(x, a, D)) -# \;\cdot\; -# \left(\min_{a' \in \Gamma(x')} q(x', a')\right)^\beta -# \right]. -# $$ -# -# This is a fixed point equation in $q$ alone — $v^*$ has been eliminated. -# -# ### The Q-learning update rule -# -# As in the risk-neutral case, we approximate the fixed point using stochastic -# approximation. 
-# -# At each step, the agent is in state $x$, takes action $a$, observes profit -# $R_{t+1} = \pi(x, a, D_{t+1})$ and next state $X_{t+1} = h(x, a, D_{t+1})$, -# and updates -# -# $$ -# q_{t+1}(x, a) -# = (1 - \alpha_t) \, q_t(x, a) -# + \alpha_t -# \left[ -# \exp(-\gamma \, R_{t+1}) -# \;\cdot\; -# \left(\min_{a' \in \Gamma(X_{t+1})} q_t(X_{t+1}, a')\right)^\beta -# \right]. -# $$ -# -# The term in brackets is a single-sample estimate of the right-hand side of the -# Q-factor Bellman equation. -# -# The update blends the current estimate with this fresh sample, just as in -# standard Q-learning. -# -# Notice several differences from the risk-neutral case: -# -# - The Q-values are **positive** (expectations of exponentials) rather than signed. -# - The optimal policy is $\sigma(x) = \arg\min_a q(x, a)$ — we **minimize** -# rather than maximize, because $\phi^{-1}$ is decreasing. -# - The observed profit enters through $\exp(-\gamma R_{t+1})$ rather than -# additively. -# - The continuation value enters as a **power** $(\min_{a'} q_t)^\beta$ rather -# than a scaled sum $\beta \cdot \max_{a'} q_t$. -# -# As before, the agent needs only to observe $x$, $a$, $R_{t+1}$, and -# $X_{t+1}$ — no model knowledge is required. -# -# ### Implementation plan -# -# Our implementation follows the same structure as the risk-neutral Q-learning in -# {ref}`inventory_q`, with the modifications above: -# -# 1. **Initialize** the Q-table $q$ to ones (since Q-values are positive) and -# visit counts $n$ to zeros. -# 2. **At each step:** -# - Draw demand $D_{t+1}$ and compute observed profit $R_{t+1}$ and next state -# $X_{t+1}$. -# - Compute $\min_{a'} q_t(X_{t+1}, a')$ over feasible actions (this is a -# scalar for the update target, and the $\arg\min$ action is used by the -# $\varepsilon$-greedy behavior policy). -# - Update $q_t(x, a)$ using the rule above, with learning rate -# $\alpha_t = 1 / n_t(x, a)^{0.51}$. 
-# - Choose the next action via $\varepsilon$-greedy: with probability -# $\varepsilon$ pick a random feasible action, otherwise pick the -# $\arg\min$ action. -# - Decay $\varepsilon$. -# 3. **Extract the greedy policy** from the final Q-table via -# $\sigma(x) = \arg\min_{a \in \Gamma(x)} q(x, a)$. -# 4. **Compare** the learned policy against the VFI solution. -# -# ### Implementation -# -# We first define a helper to extract the greedy policy from the Q-table. -# -# Since the optimal policy minimizes $q$, we use $\arg\min$ rather than $\arg\max$. - -# %% -@numba.jit(nopython=True) -def greedy_policy_from_q_rs(q, K): - """Extract greedy policy from risk-sensitive Q-table (argmin).""" - σ = np.empty(K + 1, dtype=np.int32) - for x in range(K + 1): - best_val = np.inf - best_a = 0 - for a in range(K - x + 1): - if q[x, a] < best_val: - best_val = q[x, a] - best_a = a - σ[x] = best_a - return σ - - -# %% [markdown] -# The Q-learning loop mirrors the risk-neutral version, with the key changes: -# Q-table initialized to ones, the update target uses $\exp(-\gamma R_{t+1}) -# \cdot (\min_{a'} q_t)^\beta$, and the behavior policy follows the $\arg\min$. 
- -# %% -@numba.jit(nopython=True) -def q_learning_rs_kernel(K, p, c, κ, β, γ, n_steps, X_init, - ε_init, ε_min, ε_decay, snapshot_steps): - q = np.ones((K + 1, K + 1)) # positive Q-values, initialized to 1 - n = np.zeros((K + 1, K + 1)) # visit counts for learning rate - ε = ε_init - - n_snaps = len(snapshot_steps) - snapshots = np.zeros((n_snaps, K + 1), dtype=np.int32) - snap_idx = 0 - - # Initialize state and action - x = X_init - a = np.random.randint(0, K - x + 1) - - for t in range(n_steps): - # Record policy snapshot if needed - if snap_idx < n_snaps and t == snapshot_steps[snap_idx]: - snapshots[snap_idx] = greedy_policy_from_q_rs(q, K) - snap_idx += 1 - - # === Draw D_{t+1} and observe outcome === - d = np.random.geometric(p) - 1 - reward = min(x, d) - c * a - κ * (a > 0) - x_next = max(x - d, 0) + a - - # === Min over next state (scalar value for update target) === - # Also record the argmin action for use by the behavior policy. - best_next = np.inf - a_next = 0 - for aa in range(K - x_next + 1): - if q[x_next, aa] < best_next: - best_next = q[x_next, aa] - a_next = aa - - # === Risk-sensitive Q-learning update === - target = np.exp(-γ * reward) * best_next ** β - n[x, a] += 1 - α = 1.0 / n[x, a] ** 0.51 - q[x, a] = (1 - α) * q[x, a] + α * target - - # === Behavior policy: ε-greedy (uses a_next, the argmin action) === - x = x_next - if np.random.random() < ε: - a = np.random.randint(0, K - x + 1) - else: - a = a_next - ε = max(ε_min, ε * ε_decay) - - return q, snapshots - - -# %% [markdown] -# The wrapper function unpacks the model and provides default hyperparameters. 
- -# %% -def q_learning_rs(model, n_steps=20_000_000, X_init=0, - ε_init=1.0, ε_min=0.01, ε_decay=0.999999, - snapshot_steps=None): - x_values, d_values, ϕ_values, p, c, κ, β, γ = model - K = len(x_values) - 1 - if snapshot_steps is None: - snapshot_steps = np.array([], dtype=np.int64) - return q_learning_rs_kernel(K, p, c, κ, β, γ, n_steps, X_init, - ε_init, ε_min, ε_decay, snapshot_steps) - - -# %% [markdown] -# ### Running Q-learning -# -# We run 20 million steps and take policy snapshots at steps 10,000, 1,000,000, and at the end. - -# %% -np.random.seed(1234) -snap_steps = np.array([10_000, 1_000_000, 19_999_999], dtype=np.int64) -q_table, snapshots = q_learning_rs(model, snapshot_steps=snap_steps) - -# %% [markdown] -# ### Comparing with the exact solution -# -# We extract the value function and policy from the final Q-table. -# -# Since Q-values represent $\mathbb{E}[\exp(-\gamma(\cdots))]$, we recover the -# value function via $v_Q(x) = -\frac{1}{\gamma} \ln(\min_{a} q(x, a))$ and the -# policy via $\sigma_Q(x) = \arg\min_a q(x, a)$. - -# %% -K = len(x_values) - 1 -γ_base = model.γ -# restrict to feasible actions a ∈ {0, ..., K-x} -v_q = np.array([-(1/γ_base) * np.log(np.min(q_table[x, :K - x + 1])) - for x in range(K + 1)]) -σ_q = np.array([np.argmin(q_table[x, :K - x + 1]) - for x in range(K + 1)]) - -# %% -fig, axes = plt.subplots(1, 2, figsize=(9.6, 3.2)) - -axes[0].plot(x_values, v_star, label="VFI") -axes[0].plot(x_values, v_q, '--', label="Q-learning") -axes[0].set_xlabel("inventory") -axes[0].set_ylabel("value") -axes[0].legend() -axes[0].set_title("Value function") - -axes[1].plot(x_values, σ_star, label="VFI") -axes[1].plot(x_values, σ_q, '--', label="Q-learning") -axes[1].set_xlabel("inventory") -axes[1].set_ylabel("order quantity") -axes[1].legend() -axes[1].set_title("Policy") - -plt.tight_layout() -plt.show() - -# %% [markdown] -# ### Visualizing learning over time -# -# The panels below show how the agent's policy evolves during training. 
-# -# Each panel simulates an inventory path using the greedy policy extracted from -# the Q-table at a given training step, with the same demand sequence throughout. -# -# The top panel shows the optimal policy from VFI for reference. - -# %% -ts_length = 200 -n_snaps = len(snap_steps) -fig, axes = plt.subplots(n_snaps + 1, 1, figsize=(8, 2.0 * (n_snaps + 1)), - sharex=True) - -X_init = K // 2 -sim_seed = 5678 - -# Optimal policy -np.random.seed(sim_seed) -X_opt = sim_inventories(ts_length, σ_star, model.p, X_init) -axes[0].plot(X_opt, alpha=0.7) -axes[0].set_ylabel("inventory") -axes[0].set_title("Optimal (VFI)") -axes[0].set_ylim(0, K + 2) - -# Q-learning snapshots -for i in range(n_snaps): - σ_snap = snapshots[i] - np.random.seed(sim_seed) - X = sim_inventories(ts_length, σ_snap, model.p, X_init) - axes[i + 1].plot(X, alpha=0.7) - axes[i + 1].set_ylabel("inventory") - axes[i + 1].set_title(f"Step {snap_steps[i]:,}") - axes[i + 1].set_ylim(0, K + 2) - -axes[-1].set_xlabel(r"$t$") -plt.tight_layout() -plt.show() - -# %% [markdown] -# After 10,000 steps the agent has barely explored and its policy is erratic. -# -# By 1,000,000 steps the learned policy begins to resemble the optimal one, and -# by step 20 million the inventory dynamics are nearly indistinguishable from the -# VFI solution. -# -# Note that the converged policy maintains lower inventory levels than in the -# risk-neutral case (compare with {ref}`inventory_q`), consistent with the -# mechanism discussed above: a risk-sensitive agent caps its exposure to demand -# variance by holding less stock. -# -# ## Conclusion -# -# We extended the inventory management problem from {ref}`inventory_q` to -# incorporate risk sensitivity via the certainty equivalent operator -# $\phi^{-1}(\mathbb{E}[\phi(\cdot)])$ with $\phi(t) = \exp(-\gamma t)$. -# -# Value function iteration confirms that risk-sensitive firms order less -# aggressively, preferring predictable profits over higher but more volatile -# returns. 
-# -# We then showed that Q-learning can be adapted to the risk-sensitive setting by -# working with the transformed Q-factor $q(x,a) = -# \mathbb{E}[\exp(-\gamma(\pi + \beta v^*))]$. -# -# The resulting update rule replaces addition with multiplication and max with -# min, but retains the key property of model-free learning: the agent needs only -# to observe states, actions, and profits. From 78a43a97ed70c94fab932f09b470004ddfc44af2 Mon Sep 17 00:00:00 2001 From: John Stachurski Date: Mon, 9 Mar 2026 17:10:26 +1100 Subject: [PATCH 4/8] Fix cross-references: use {doc} instead of {ref} for lecture links The {ref} directive requires a label placed before a heading to resolve a title. The inventory_q label is before a raw block, so {ref} fails. Using {doc} links to the document directly. Co-Authored-By: Claude Opus 4.6 --- lectures/rs_inventory_q.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/lectures/rs_inventory_q.md b/lectures/rs_inventory_q.md index 6a368656b..157495109 100644 --- a/lectures/rs_inventory_q.md +++ b/lectures/rs_inventory_q.md @@ -28,7 +28,7 @@ kernelspec: ## Introduction -In {ref}`inventory_q`, we looked at an inventory management +In {doc}`inventory_q`, we looked at an inventory management problem and solved it with both value function iteration and Q-learning. In this lecture, we consider a risk-sensitive variation. @@ -58,7 +58,7 @@ from typing import NamedTuple ## The Model -The Bellman equation for the inventory management problem in {ref}`inventory_q` has the form +The Bellman equation for the inventory management problem in {doc}`inventory_q` has the form $$ @@ -72,7 +72,7 @@ $$ Here $D$ is a random variable with distribution $\phi$. -(Primitives and definitions are the same as in {ref}`inventory_q`.) +(Primitives and definitions are the same as in {doc}`inventory_q`.) The risk-sensitive version of this Bellman equation has the form @@ -108,7 +108,7 @@ $$ \right\}. 
$$ -Here $\phi(d)$ denotes the demand probability mass function, as in {ref}`inventory_q`. +Here $\phi(d)$ denotes the demand probability mass function, as in {doc}`inventory_q`. The parameter $\gamma$ controls the degree of risk sensitivity. @@ -125,7 +125,7 @@ risk-neutral case, with the expectation replaced by the certainty equivalent. ### Model specification -We reuse the same model primitives as in {ref}`inventory_q`, adding $\gamma$ as a parameter. +We reuse the same model primitives as in {doc}`inventory_q`, adding $\gamma$ as a parameter. ```{code-cell} ipython3 class RSModel(NamedTuple): @@ -405,7 +405,7 @@ closely, propagating that variance forward through $v$. ## Q-Learning We now ask whether the optimal policy can be learned without knowledge of the -model, as we did in the risk-neutral case in {ref}`inventory_q`. +model, as we did in the risk-neutral case in {doc}`inventory_q`. ### The Q-factor @@ -528,7 +528,7 @@ $X_{t+1}$ — no model knowledge is required. ### Implementation plan Our implementation follows the same structure as the risk-neutral Q-learning in -{ref}`inventory_q`, with the modifications above: +{doc}`inventory_q`, with the modifications above: 1. **Initialize** the Q-table $q$ to ones (since Q-values are positive) and visit counts $n$ to zeros. @@ -738,13 +738,13 @@ by step 20 million the inventory dynamics are nearly indistinguishable from the VFI solution. Note that the converged policy maintains lower inventory levels than in the -risk-neutral case (compare with {ref}`inventory_q`), consistent with the +risk-neutral case (compare with {doc}`inventory_q`), consistent with the mechanism discussed above: a risk-sensitive agent caps its exposure to demand variance by holding less stock. 
## Conclusion -We extended the inventory management problem from {ref}`inventory_q` to +We extended the inventory management problem from {doc}`inventory_q` to incorporate risk sensitivity via the certainty equivalent operator $\phi^{-1}(\mathbb{E}[\phi(\cdot)])$ with $\phi(t) = \exp(-\gamma t)$. From 29762cd1acdffd05c6e4d6a3a2c2c2c62fb9f104 Mon Sep 17 00:00:00 2001 From: Matt McKay Date: Tue, 10 Mar 2026 08:59:16 +1100 Subject: [PATCH 5/8] Use \argmin and \argmax macros from _config.yml --- lectures/rs_inventory_q.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lectures/rs_inventory_q.md b/lectures/rs_inventory_q.md index 157495109..58e6b1b9d 100644 --- a/lectures/rs_inventory_q.md +++ b/lectures/rs_inventory_q.md @@ -515,7 +515,7 @@ standard Q-learning. Notice several differences from the risk-neutral case: - The Q-values are **positive** (expectations of exponentials) rather than signed. -- The optimal policy is $\sigma(x) = \arg\min_a q(x, a)$ — we **minimize** +- The optimal policy is $\sigma(x) = \argmin_a q(x, a)$ — we **minimize** rather than maximize, because $\phi^{-1}$ is decreasing. - The observed profit enters through $\exp(-\gamma R_{t+1})$ rather than additively. @@ -536,23 +536,23 @@ Our implementation follows the same structure as the risk-neutral Q-learning in - Draw demand $D_{t+1}$ and compute observed profit $R_{t+1}$ and next state $X_{t+1}$. - Compute $\min_{a'} q_t(X_{t+1}, a')$ over feasible actions (this is a - scalar for the update target, and the $\arg\min$ action is used by the + scalar for the update target, and the $\argmin$ action is used by the $\varepsilon$-greedy behavior policy). - Update $q_t(x, a)$ using the rule above, with learning rate $\alpha_t = 1 / n_t(x, a)^{0.51}$. - Choose the next action via $\varepsilon$-greedy: with probability $\varepsilon$ pick a random feasible action, otherwise pick the - $\arg\min$ action. + $\argmin$ action. - Decay $\varepsilon$. 3. 
**Extract the greedy policy** from the final Q-table via - $\sigma(x) = \arg\min_{a \in \Gamma(x)} q(x, a)$. + $\sigma(x) = \argmin_{a \in \Gamma(x)} q(x, a)$. 4. **Compare** the learned policy against the VFI solution. ### Implementation We first define a helper to extract the greedy policy from the Q-table. -Since the optimal policy minimizes $q$, we use $\arg\min$ rather than $\arg\max$. +Since the optimal policy minimizes $q$, we use $\argmin$ rather than $\argmax$. ```{code-cell} ipython3 @numba.jit(nopython=True) @@ -572,7 +572,7 @@ def greedy_policy_from_q_rs(q, K): The Q-learning loop mirrors the risk-neutral version, with the key changes: Q-table initialized to ones, the update target uses $\exp(-\gamma R_{t+1}) -\cdot (\min_{a'} q_t)^\beta$, and the behavior policy follows the $\arg\min$. +\cdot (\min_{a'} q_t)^\beta$, and the behavior policy follows the $\argmin$. ```{code-cell} ipython3 @numba.jit(nopython=True) @@ -657,7 +657,7 @@ We extract the value function and policy from the final Q-table. Since Q-values represent $\mathbb{E}[\exp(-\gamma(\cdots))]$, we recover the value function via $v_Q(x) = -\frac{1}{\gamma} \ln(\min_{a} q(x, a))$ and the -policy via $\sigma_Q(x) = \arg\min_a q(x, a)$. +policy via $\sigma_Q(x) = \argmin_a q(x, a)$. ```{code-cell} ipython3 K = len(x_values) - 1 From 93ca801d9149340887bd94d38166ae72875bdc2b Mon Sep 17 00:00:00 2001 From: John Stachurski Date: Tue, 10 Mar 2026 10:00:07 +1100 Subject: [PATCH 6/8] =?UTF-8?q?Rename=20risk-sensitivity=20function=20from?= =?UTF-8?q?=20=CF=86=20to=20=CF=88=20to=20avoid=20notation=20conflict?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The symbol φ was overloaded: used for both the demand PMF and the risk-sensitivity transformation. Now ψ(t) = exp(-γt) is the risk transformation and φ(d) remains the demand PMF, consistent with inventory_q.md. 
Co-Authored-By: Claude Opus 4.6 --- lectures/rs_inventory_q.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/lectures/rs_inventory_q.md b/lectures/rs_inventory_q.md index 58e6b1b9d..dd77bd16a 100644 --- a/lectures/rs_inventory_q.md +++ b/lectures/rs_inventory_q.md @@ -79,9 +79,9 @@ The risk-sensitive version of this Bellman equation has the form $$ v(x) = \max_{a \in \Gamma(x)} - \phi^{-1} + \psi^{-1} \left\{ - \mathbb E \phi + \mathbb E \psi \left[ \pi(x, a, D) + \beta v(h(x, a, D)) @@ -89,9 +89,9 @@ $$ \right\}, $$ -where $\phi(t) = \exp(-\gamma t)$ for fixed $\gamma > 0$. +where $\psi(t) = \exp(-\gamma t)$ for fixed $\gamma > 0$. -Since $\phi^{-1}(y) = -\frac{1}{\gamma} \ln(y)$, the Bellman equation becomes +Since $\psi^{-1}(y) = -\frac{1}{\gamma} \ln(y)$, the Bellman equation becomes $$ v(x) @@ -423,7 +423,7 @@ $$ \right]. $$ -In words, $q(x, a)$ applies the risk-sensitivity transformation $\phi(t) = +In words, $q(x, a)$ applies the risk-sensitivity transformation $\psi(t) = \exp(-\gamma t)$ inside the expectation, evaluated at the return from taking action $a$ in state $x$ and following the optimal policy thereafter. @@ -434,15 +434,15 @@ Our goal is to obtain a fixed point equation in $q$ alone, eliminating $v^*$. **Step 1.** Express $v^*$ in terms of $q$. The risk-sensitive Bellman equation says $v^*(x) = \max_{a \in \Gamma(x)} -\phi^{-1}(q(x, a))$. +\psi^{-1}(q(x, a))$. -Since $\phi^{-1}(y) = -\frac{1}{\gamma} \ln(y)$ is a **decreasing** function, -the maximum over $a$ of $\phi^{-1}(q(x, a))$ corresponds to the **minimum** +Since $\psi^{-1}(y) = -\frac{1}{\gamma} \ln(y)$ is a **decreasing** function, +the maximum over $a$ of $\psi^{-1}(q(x, a))$ corresponds to the **minimum** over $a$ of $q(x, a)$: $$ v^*(x) - = \phi^{-1}\!\left(\min_{a \in \Gamma(x)} q(x, a)\right) + = \psi^{-1}\!\left(\min_{a \in \Gamma(x)} q(x, a)\right) = -\frac{1}{\gamma} \ln\!\left(\min_{a \in \Gamma(x)} q(x, a)\right). 
$$ @@ -516,7 +516,7 @@ Notice several differences from the risk-neutral case: - The Q-values are **positive** (expectations of exponentials) rather than signed. - The optimal policy is $\sigma(x) = \argmin_a q(x, a)$ — we **minimize** - rather than maximize, because $\phi^{-1}$ is decreasing. + rather than maximize, because $\psi^{-1}$ is decreasing. - The observed profit enters through $\exp(-\gamma R_{t+1})$ rather than additively. - The continuation value enters as a **power** $(\min_{a'} q_t)^\beta$ rather @@ -746,7 +746,7 @@ variance by holding less stock. We extended the inventory management problem from {doc}`inventory_q` to incorporate risk sensitivity via the certainty equivalent operator -$\phi^{-1}(\mathbb{E}[\phi(\cdot)])$ with $\phi(t) = \exp(-\gamma t)$. +$\psi^{-1}(\mathbb{E}[\psi(\cdot)])$ with $\psi(t) = \exp(-\gamma t)$. Value function iteration confirms that risk-sensitive firms order less aggressively, preferring predictable profits over higher but more volatile From 1707fc6b2e4a68b0953a18b953e7c315275cf36b Mon Sep 17 00:00:00 2001 From: John Stachurski Date: Tue, 10 Mar 2026 11:04:15 +1100 Subject: [PATCH 7/8] Add forward ref from inventory_q and fix minor grammar - Add cross-reference from inventory_q.md to rs_inventory_q.md - Fix missing comma after introductory phrase Co-Authored-By: Claude Opus 4.6 --- lectures/inventory_q.md | 2 ++ lectures/rs_inventory_q.md | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/lectures/inventory_q.md b/lectures/inventory_q.md index 39740eb6b..56c4cde2c 100644 --- a/lectures/inventory_q.md +++ b/lectures/inventory_q.md @@ -56,6 +56,8 @@ The lecture proceeds as follows: 2. We introduce Q-factors and derive the Q-factor Bellman equation. 3. We implement Q-learning and show the learned policy converges to the optimal one. +A risk-sensitive extension of this model is studied in {doc}`rs_inventory_q`. 
+ We will use the following imports: ```{code-cell} ipython3 diff --git a/lectures/rs_inventory_q.md b/lectures/rs_inventory_q.md index dd77bd16a..da2f1d176 100644 --- a/lectures/rs_inventory_q.md +++ b/lectures/rs_inventory_q.md @@ -731,7 +731,7 @@ plt.tight_layout() plt.show() ``` -After 10,000 steps the agent has barely explored and its policy is erratic. +After 10,000 steps, the agent has barely explored and its policy is erratic. By 1,000,000 steps the learned policy begins to resemble the optimal one, and by step 20 million the inventory dynamics are nearly indistinguishable from the From fdd3a7186122f92198b6fa1b3c52702a601358f6 Mon Sep 17 00:00:00 2001 From: John Stachurski Date: Tue, 10 Mar 2026 11:50:52 +1100 Subject: [PATCH 8/8] Move random seeds inside jitted functions for reproducibility Numba JIT functions use their own RNG state, so np.random.seed() called outside a jitted function has no effect on random draws inside it. Moved seeds into sim_inventories and q_learning kernels as parameters. Fixes issue noted by @HumphreyYang. 
Co-Authored-By: Claude Opus 4.6 --- lectures/inventory_q.md | 17 ++++++++--------- lectures/rs_inventory_q.md | 20 +++++++++----------- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/lectures/inventory_q.md b/lectures/inventory_q.md index 56c4cde2c..146921bf8 100644 --- a/lectures/inventory_q.md +++ b/lectures/inventory_q.md @@ -348,8 +348,9 @@ At each step, we draw a demand shock from the geometric distribution and update ```{code-cell} ipython3 @numba.jit(nopython=True) -def sim_inventories(ts_length, σ, p, X_init=0): +def sim_inventories(ts_length, σ, p, X_init=0, seed=0): """Simulate inventory dynamics under policy σ.""" + np.random.seed(seed) X = np.zeros(ts_length, dtype=np.int32) X[0] = X_init for t in range(ts_length - 1): @@ -586,7 +587,8 @@ At specified step counts (given by `snapshot_steps`), we record the current gree ```{code-cell} ipython3 @numba.jit(nopython=True) def q_learning_kernel(K, p, c, κ, β, n_steps, X_init, - ε_init, ε_min, ε_decay, snapshot_steps): + ε_init, ε_min, ε_decay, snapshot_steps, seed): + np.random.seed(seed) q = np.zeros((K + 1, K + 1)) n = np.zeros((K + 1, K + 1)) # visit counts for learning rate ε = ε_init @@ -640,13 +642,13 @@ The wrapper function unpacks the model and provides default hyperparameters. ```{code-cell} ipython3 def q_learning(model, n_steps=20_000_000, X_init=0, ε_init=1.0, ε_min=0.01, ε_decay=0.999999, - snapshot_steps=None): + snapshot_steps=None, seed=1234): x_values, d_values, ϕ_values, p, c, κ, β = model K = len(x_values) - 1 if snapshot_steps is None: snapshot_steps = np.array([], dtype=np.int64) return q_learning_kernel(K, p, c, κ, β, n_steps, X_init, - ε_init, ε_min, ε_decay, snapshot_steps) + ε_init, ε_min, ε_decay, snapshot_steps, seed) ``` ### Running Q-learning @@ -654,7 +656,6 @@ def q_learning(model, n_steps=20_000_000, X_init=0, We run 20 million steps and take policy snapshots at steps 10,000, 1,000,000, and at the end. 
```{code-cell} ipython3 -np.random.seed(1234) snap_steps = np.array([10_000, 1_000_000, 19_999_999], dtype=np.int64) q, snapshots = q_learning(model, snapshot_steps=snap_steps) ``` @@ -723,8 +724,7 @@ X_init = K // 2 sim_seed = 5678 # Optimal policy -np.random.seed(sim_seed) -X_opt = sim_inventories(ts_length, σ_star, p, X_init) +X_opt = sim_inventories(ts_length, σ_star, p, X_init, seed=sim_seed) axes[0].plot(X_opt, alpha=0.7) axes[0].set_ylabel("inventory") axes[0].set_title("Optimal (VFI)") @@ -733,8 +733,7 @@ axes[0].set_ylim(0, K + 2) # Q-learning snapshots for i in range(n_snaps): σ_snap = snapshots[i] - np.random.seed(sim_seed) - X = sim_inventories(ts_length, σ_snap, p, X_init) + X = sim_inventories(ts_length, σ_snap, p, X_init, seed=sim_seed) axes[i + 1].plot(X, alpha=0.7) axes[i + 1].set_ylabel("inventory") axes[i + 1].set_title(f"Step {snap_steps[i]:,}") diff --git a/lectures/rs_inventory_q.md b/lectures/rs_inventory_q.md index da2f1d176..cd81e6bad 100644 --- a/lectures/rs_inventory_q.md +++ b/lectures/rs_inventory_q.md @@ -332,8 +332,9 @@ We simulate inventory dynamics under the optimal policy for the baseline $\gamma ```{code-cell} ipython3 @numba.jit(nopython=True) -def sim_inventories(ts_length, σ, p, X_init=0): +def sim_inventories(ts_length, σ, p, X_init=0, seed=0): """Simulate inventory dynamics under policy σ.""" + np.random.seed(seed) X = np.zeros(ts_length, dtype=np.int32) X[0] = X_init for t in range(ts_length - 1): @@ -353,8 +354,7 @@ K = len(x_values) - 1 for i, γ in enumerate(γ_values): v, σ = results[γ] - np.random.seed(sim_seed) - X = sim_inventories(ts_length, σ, model.p, X_init=K // 2) + X = sim_inventories(ts_length, σ, model.p, X_init=K // 2, seed=sim_seed) axes[i].plot(X, alpha=0.7) axes[i].set_ylabel("inventory") axes[i].set_title(f"$\\gamma = {γ}$") @@ -577,7 +577,8 @@ Q-table initialized to ones, the update target uses $\exp(-\gamma R_{t+1}) ```{code-cell} ipython3 @numba.jit(nopython=True) def q_learning_rs_kernel(K, p, c, κ, β, 
γ, n_steps, X_init, - ε_init, ε_min, ε_decay, snapshot_steps): + ε_init, ε_min, ε_decay, snapshot_steps, seed): + np.random.seed(seed) q = np.ones((K + 1, K + 1)) # positive Q-values, initialized to 1 n = np.zeros((K + 1, K + 1)) # visit counts for learning rate ε = ε_init @@ -632,13 +633,13 @@ The wrapper function unpacks the model and provides default hyperparameters. ```{code-cell} ipython3 def q_learning_rs(model, n_steps=20_000_000, X_init=0, ε_init=1.0, ε_min=0.01, ε_decay=0.999999, - snapshot_steps=None): + snapshot_steps=None, seed=1234): x_values, d_values, ϕ_values, p, c, κ, β, γ = model K = len(x_values) - 1 if snapshot_steps is None: snapshot_steps = np.array([], dtype=np.int64) return q_learning_rs_kernel(K, p, c, κ, β, γ, n_steps, X_init, - ε_init, ε_min, ε_decay, snapshot_steps) + ε_init, ε_min, ε_decay, snapshot_steps, seed) ``` ### Running Q-learning @@ -646,7 +647,6 @@ def q_learning_rs(model, n_steps=20_000_000, X_init=0, We run 20 million steps and take policy snapshots at steps 10,000, 1,000,000, and at the end. ```{code-cell} ipython3 -np.random.seed(1234) snap_steps = np.array([10_000, 1_000_000, 19_999_999], dtype=np.int64) q_table, snapshots = q_learning_rs(model, snapshot_steps=snap_steps) ``` @@ -709,8 +709,7 @@ X_init = K // 2 sim_seed = 5678 # Optimal policy -np.random.seed(sim_seed) -X_opt = sim_inventories(ts_length, σ_star, model.p, X_init) +X_opt = sim_inventories(ts_length, σ_star, model.p, X_init, seed=sim_seed) axes[0].plot(X_opt, alpha=0.7) axes[0].set_ylabel("inventory") axes[0].set_title("Optimal (VFI)") @@ -719,8 +718,7 @@ axes[0].set_ylim(0, K + 2) # Q-learning snapshots for i in range(n_snaps): σ_snap = snapshots[i] - np.random.seed(sim_seed) - X = sim_inventories(ts_length, σ_snap, model.p, X_init) + X = sim_inventories(ts_length, σ_snap, model.p, X_init, seed=sim_seed) axes[i + 1].plot(X, alpha=0.7) axes[i + 1].set_ylabel("inventory") axes[i + 1].set_title(f"Step {snap_steps[i]:,}")